+ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt
Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node0.txt
+ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0
+ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/0
+ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml
+ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/
+ CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
+ VIT_CKPT_LOAD_DIR=/
+ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
+ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/
sending incremental file list

sent 71 bytes  received 12 bytes  166.00 bytes/sec
total size is 23.84K  speedup is 287.17
+ cd /local_disk/cognitron_vl/
+ rm -fr datasets
+ mkdir -p datasets
+ ln -s /data/data/ datasets/CV
+ ln -s /data/data/LLM datasets/LLM
+ ln -s /data/data/LMM datasets/LMM
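The preamble above is consistent with a launcher that derives a per-run output directory from the script name plus a timestamp, then snapshots the data config into it. A minimal sketch of such a preamble; OUTPUT_DIR, TIMESTAMP, and the tee-style logging are assumptions, not taken from the actual script:

    #!/bin/bash
    # Hypothetical reconstruction of the run-directory setup seen in the trace.
    set -x
    TIMESTAMP=$(date +%Y%m%d_%H%M%S)
    OUTPUT_DIR=/data_2/output/LM/scripts/modellink/qwen25/$(basename "$0")/${TIMESTAMP}/
    mkdir -p "${OUTPUT_DIR}"
    echo Logging output to "${OUTPUT_DIR}/log_node${NODE_RANK:-0}.txt"
    # Per-device Ascend runtime logs are collected under the same run directory.
    export ASCEND_PROCESS_LOG_PATH=${OUTPUT_DIR}/ascend/${NODE_RANK:-0}
    mkdir -p "${ASCEND_PROCESS_LOG_PATH}"
    # Keep a copy of the data config next to the checkpoints for reproducibility.
    rsync -avh "${DATA_PATH}" "${OUTPUT_DIR}"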
+ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh
++ source /usr/local/Ascend/driver/bin/setenv.bash
+++ DEP_INFO_FILE=/etc/ascend_install.info
+++ [[ -f /etc/ascend_install.info ]]
+++ . /etc/ascend_install.info
+++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common
+++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver
+++ DRV_LIB64_LDPATH=/driver/lib64
+++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
++ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
++++ arch
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
+++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
+++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
++ export HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_EXEC_TIMEOUT=7200
++ export COMBINED_ENABLE=1
++ export MULTI_STREAM_MEMORY_REUSE=1
++ export HCCL_RDMA_TC=160
++ export HCCL_RDMA_SL=5
++ export HCCL_INTRA_PCIE_ENABLE=0
++ export HCCL_INTRA_ROCE_ENABLE=1
++ export HCCL_RDMA_TIMEOUT=20
++ export INF_NAN_MODE_ENABLE=1
++ export DISTRIBUTED_BACKEND=hccl
++ export ASCEND_LAUNCH_BLOCKING=0
++ export ASCEND_SLOG_PRINT_TO_STDOUT=0
++ export ASCEND_GLOBAL_LOG_LEVEL=3
++ export ASCEND_GLOBAL_EVENT_ENABLE=0
++ export TASK_QUEUE_ENABLE=1
++ export PTCOPY_ENABLE=1
++ export COMBINED_ENABLE=1
++ export DYNAMIC_OP=ADD#MUL
++ export HCCL_WHITELIST_DISABLE=1
++ export HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_WHITELIST_DISABLE=1
++ export CUDA_DEVICE_MAX_CONNECTIONS=1
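Condensed, the job-specific half of set_env_mg_npu.sh amounts to the exports below; the grouping and the comments are our interpretation of the trace, not text from the script itself:

    # Collective-communication robustness: generous HCCL timeouts for the
    # long-context (128k) startup and checkpoint-load phases.
    export HCCL_CONNECT_TIMEOUT=7200
    export HCCL_EXEC_TIMEOUT=7200
    export HCCL_RDMA_TC=160 HCCL_RDMA_SL=5 HCCL_RDMA_TIMEOUT=20
    export HCCL_INTRA_PCIE_ENABLE=0 HCCL_INTRA_ROCE_ENABLE=1
    export HCCL_WHITELIST_DISABLE=1
    export DISTRIBUTED_BACKEND=hccl
    # Ascend runtime: non-blocking kernel launches, quiet host logs
    # (level 3 is error-only in the Ascend log-level convention).
    export ASCEND_LAUNCH_BLOCKING=0
    export ASCEND_SLOG_PRINT_TO_STDOUT=0
    export ASCEND_GLOBAL_LOG_LEVEL=3
    export ASCEND_GLOBAL_EVENT_ENABLE=0
    # torch_npu execution knobs, as set by the script.
    export TASK_QUEUE_ENABLE=1 PTCOPY_ENABLE=1 COMBINED_ENABLE=1
    export MULTI_STREAM_MEMORY_REUSE=1 INF_NAN_MODE_ENABLE=1
    export DYNAMIC_OP=ADD#MUL
    export CUDA_DEVICE_MAX_CONNECTIONS=1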
++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt
Looking in links: /data/software/
Processing /data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1))
Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0)
Processing /data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3))
Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0)
Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0)
Processing /data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6))
Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5)
Processing /data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8))
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0)
Processing /data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10))
Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2)
Processing /data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12))
Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1)
Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5)
Processing /data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15))
Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16)
Processing /data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17))
Processing /data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18))
Processing /data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19))
Processing /data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20))
Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3)
Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25)
Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0)
Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5)
Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3)
Processing /data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6))
Processing /data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3)
Processing /data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3)
Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1)
Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0)
Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1)
Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30)
Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (9.0.0)
Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15)
Processing /data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0)
Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1)
Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3)
Processing /data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7)
Processing /data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1)
Processing /data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18))
Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0)
Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0)
Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2)
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0)
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.66.2
    Uninstalling tqdm-4.66.2:
      Successfully uninstalled tqdm-4.66.2
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.4.2
    Uninstalling safetensors-0.4.2:
      Successfully uninstalled safetensors-0.4.2
  Attempting uninstall: requests
    Found existing installation: requests 2.31.0
    Uninstalling requests-2.31.0:
      Successfully uninstalled requests-2.31.0
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 15.0.0
    Uninstalling pyarrow-15.0.0:
      Successfully uninstalled pyarrow-15.0.0
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.25.0
    Uninstalling accelerate-0.25.0:
      Successfully uninstalled accelerate-0.25.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.16.0
    Uninstalling datasets-2.16.0:
      Successfully uninstalled datasets-2.16.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed.
tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed.
tikit 1.8.2.240926 requires future==0.18.2, which is not installed.
tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed.
tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed.
tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed.
tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed.
tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed.
tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed.
tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed.
tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed.
tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed.
tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible.
tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible.
tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible.
tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible.
tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible.
tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible.
Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
++ return 0
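All of the resolver complaints above come from the preinstalled tikit distribution; nothing in the training stack shown in this log depends on it. One way to re-surface whatever is still inconsistent after the install is pip's built-in consistency check:

    pip3 check
    # If tikit were actually needed, the same offline mirror could serve its
    # exact pins (assumption: those wheels exist under /data/software/), e.g.:
    # pip3 install --no-index --find-links=/data/software/ 'urllib3==1.26.7' 'idna==3.3'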
+ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
+ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
+ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
Looking in links: /data/software/
Obtaining file:///local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: megatron_core
  Building editable for megatron_core (pyproject.toml): started
  Building editable for megatron_core (pyproject.toml): finished with status 'done'
  Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=06d5bd071b6eadb2bc6965a495bd802172dae415af74dd60b1478328d6910bcd
  Stored in directory: /tmp/pip-ephem-wheel-cache-m7mtfhay/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff
Successfully built megatron_core
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: megatron_core
Successfully installed megatron_core-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
Looking in links: /data/software/
Obtaining file:///local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA'
WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA'
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: mindspeed
  Running setup.py develop for mindspeed
Successfully installed mindspeed-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/
Looking in links: /data/software/
Obtaining file:///local_disk/cognitron_vl/third_party/ModelLink
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4)
Processing /data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1)
Processing /data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4)
Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1)
Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1)
Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0)
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0)
Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6)
Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2)
Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0)
Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3)
Processing /data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1)
Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1)
Requirement already satisfied: regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25)
Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3)
Processing /data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1)
Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3)
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1)
Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3)
Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5)
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py): started
  Building wheel for transformers_stream_generator (setup.py): finished with status 'done'
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=3ed62a866ab10917ceed94a0bafc0596380802f798ed67b7de78b76fe0b65f1f
  Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c
Successfully built transformers_stream_generator
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
  Attempting uninstall: peft
    Found existing installation: peft 0.7.0
    Uninstalling peft-0.7.0:
      Successfully uninstalled peft-0.7.0
  Running setup.py develop for modellink
Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
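A quick way to confirm which copies win on sys.path after these editable installs and the PYTHONPATH export below; this is a sanity check we would run, not part of the original script:

    # Expect megatron.core to resolve to the vendored third_party tree and
    # transformers to report 4.43.2, ModelLink's pinned version.
    python3 -c 'import megatron.core, mindspeed, transformers; print(megatron.core.__file__); print(transformers.__version__)'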
+ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+ GPUS_PER_NODE=16
+ NNODES=32
+ NODE_RANK=0
+ MASTER_PORT=34567
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ VISION_SEQ_LENGTH=1025
+ IMAGE_TOKEN_LENGTH=256
+ IMAGE_SIZE=448
+ VISION_MODEL_TYPE=intern_300m
+ TP=8
+ PP=1
+ CP=2
+ CP_ALGO=megatron_cp_algo
+ CP_MASK=causal
+ DISTRIBUTED_ARGS=' --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 '
+ GPT_ARGS=' --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint '
+ DATA_ARGS=' --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 '
+ CKPT_ARGS=' --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// '
+ OUTPUT_ARGS=' --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 '
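For orientation, the parallel topology implied by these arguments works out as follows; this is simple arithmetic over the values above, not output from the log:

    # 32 nodes x 16 NPUs each = 512 ranks in total.
    GPUS_PER_NODE=16; NNODES=32; TP=8; PP=1; CP=2; GBS=64; MBS=1
    WORLD_SIZE=$((GPUS_PER_NODE * NNODES))   # 512
    MP=$((TP * PP * CP))                     # 16 ranks per model replica
    DP=$((WORLD_SIZE / MP))                  # 32 data-parallel replicas
    ACC=$((GBS / (DP * MBS)))                # 2 gradient-accumulation steps
    echo "world=$WORLD_SIZE replicas=$DP acc_steps=$ACC"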
+ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 0 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING]
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] *****************************************
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-11-27 12:44:15,231] torch.distributed.run: [WARNING] *****************************************
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:299: ImportWarning:
*************************************************************************************************************
The torch.Tensor.cuda and torch.nn.Module.cuda are replaced with torch.Tensor.npu and torch.nn.Module.npu now..
The torch.cuda.DoubleTensor is replaced with torch.npu.FloatTensor cause the double type is not supported now..
The backend in torch.distributed.init_process_group set to hccl now..
The torch.cuda.* and torch.cuda.amp.* are replaced with torch.npu.* and torch.npu.amp.* now..
The device parameters have been replaced with npu in the function below:
torch.logspace, torch.randint, torch.hann_window, torch.rand, torch.full_like, torch.ones_like, torch.rand_like, torch.randperm, torch.arange, torch.frombuffer, torch.normal, torch._empty_per_channel_affine_quantized, torch.empty_strided, torch.empty_like, torch.scalar_tensor, torch.tril_indices, torch.bartlett_window, torch.ones, torch.sparse_coo_tensor, torch.randn, torch.kaiser_window, torch.tensor, torch.triu_indices, torch.as_tensor, torch.zeros, torch.randint_like, torch.full, torch.eye, torch._sparse_csr_tensor_unsafe, torch.empty, torch._sparse_coo_tensor_unsafe, torch.blackman_window, torch.zeros_like, torch.range, torch.sparse_csr_tensor, torch.randn_like, torch.from_file, torch._cudnn_init_dropout_state, torch._empty_affine_quantized, torch.linspace, torch.hamming_window, torch.empty_quantized, torch._pin_memory, torch.autocast, torch.load, torch.Generator, torch.set_default_device, torch.Tensor.new_empty, torch.Tensor.new_empty_strided, torch.Tensor.new_full, torch.Tensor.new_ones, torch.Tensor.new_tensor, torch.Tensor.new_zeros, torch.Tensor.to, torch.nn.Module.to, torch.nn.Module.to_empty
*************************************************************************************************************
  warnings.warn(msg, ImportWarning)
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/contrib/transfer_to_npu.py:260: RuntimeWarning: torch.jit.script and torch.jit.script_method will be disabled by transfer_to_npu, which currently does not support them, if you need to enable them, please do not use transfer_to_npu.
  warnings.warn(msg, RuntimeWarning)
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp...
Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja...
Building extension module adaptive_cp...
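The torch_npu shim warnings above also explain why the launch can pass --distributed-backend nccl: the ImportWarning states the backend is rewritten to hccl at init time. As a minimal illustration of what the shim enables (the import pattern is torch_npu's documented usage; the tensor example is ours):

    # Any "cuda" device request is transparently redirected to an NPU.
    python3 -c 'import torch; import torch_npu; from torch_npu.contrib import transfer_to_npu; x = torch.ones(2, 2, device="cuda"); print(x.device)'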
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
[the same message repeated by the 15 remaining local ranks]
[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c /local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o
[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so
Loading extension module adaptive_cp...
/local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
[the same "Loading extension module adaptive_cp..." and npu_matmul_add_fp32 warnings from the remaining local ranks]
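As the ninja message says, the JIT build's parallelism is tunable; pinning it bounds host CPU use during the one-off adaptive_cp build:

    # Optional: cap ninja's compile jobs for torch extension builds.
    export MAX_JOBS=8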
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... 
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
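This warning fires at torchvision import time because its bundled image extension links against libc10_cuda.so, which does not exist in this CUDA-free NPU environment; image decoding via torchvision.io is unavailable, but nothing else is affected. If the per-rank repetition is noisy, a filter installed before the import silences it. A minimal sketch (the message prefix is the only assumption):

import warnings

# must run before `import torchvision`; the warning is raised in torchvision/io/image.py at import time
warnings.filterwarnings("ignore", message="Failed to load image Python extension")

import torchvision  # imported after the filter on purpose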
using world size: 512, data-parallel size: 32, context-parallel size: 2, tensor-model-parallel size: 8, pipeline-model-parallel size: 1
WARNING: Setting args.overlap_p2p_comm to False since non-interleaved schedule does not support overlapping p2p communication
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
[INFO] Setting args.use_flash_attn=True since context parallel is enabled.
[INFO] Setting args.create_attention_mask_in_dataloader to False since reset_data=False or alibi_without_flash_attn=False or args.tokenizer_padding_side=right
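The echoed sizes are mutually consistent; a quick sanity check with the numbers from this log:

# world_size must equal tp * pp * cp * dp for the layout reported above
tp, pp, cp, world_size = 8, 1, 2, 512
dp = world_size // (tp * pp * cp)
assert dp == 32  # matches "data-parallel size: 32"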
------------------------ ModelLink Arguments ------------------------
accumulate_allreduce_grads_in_fp32 .............. True
adam_beta1 ...................................... 0.9
adam_beta2 ...................................... 0.999
adam_eps ........................................ 1e-08
adaptive_cp_dynamic_attn_mask ................... False
adaptive_cp_manually_set_mask_list .............. False
adaptive_cp_only_reschedule ..................... False
adaptive_cp_without_coarse ...................... False
adaptive_recompute_device_size .................. -1
adaptive_recompute_device_swap .................. False
adaptive_recompute_profiling_step ............... 10
add_bias_linear ................................. False
add_class_token ................................. True
add_dense_bias .................................. False
add_position_embedding .......................... True
add_qkv_bias .................................... True
add_rmsnorm_offset .............................. False
adlr_autoresume ................................. False
adlr_autoresume_interval ........................ 1000
apply_layernorm_1p .............................. False
apply_query_key_layer_scaling ................... False
apply_residual_connection_post_layernorm ........ False
apply_rope_fusion ............................... True
async_tensor_model_parallel_allreduce ........... False
attention_dropout ............................... 0.0
attention_mask_on_cpu ........................... False
attention_softmax_in_fp32 ....................... True
attn_logit_softcapping .......................... None
auto_detect_ckpt_format ......................... False
barrier_with_L1_time ............................ True
bert_binary_head ................................ True
bert_embedder_type .............................. megatron
bert_load ....................................... None
bf16 ............................................ True
bias_dropout_fusion ............................. True
bias_gelu_fusion ................................ False
bias_swiglu_fusion .............................. True
biencoder_projection_dim ........................ 0
biencoder_shared_query_context_model ............ False
block_data_path ................................. None
check_for_nan_in_loss_and_grad .................. True
chunk_size ...................................... 4096
ckpt_fully_parallel_save ........................ False
ckpt_step ....................................... None
classes_fraction ................................ 1.0
clip_grad ....................................... 1.0
clone_scatter_output_in_embedding ............... True
consumed_train_samples .......................... 0
consumed_valid_samples .......................... 0
context_parallel_algo ........................... megatron_cp_algo
context_parallel_size ........................... 2
cp_attention_mask_type .......................... causal
cp_window_size .................................. 1
create_attention_mask_in_dataloader ............. False
cross_dataset_joint ............................. True
data_cache_path ................................. None
data_parallel_random_init ....................... False
data_parallel_size .............................. 32
data_path ....................................... ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml']
data_per_class_fraction ......................... 1.0
data_seq_length ................................. 131072
data_sharding ................................... True
dataloader_type ................................. single
decoder_num_layers .............................. None
decoder_seq_length .............................. None
decoupled_lr .................................... None
decoupled_min_lr ................................ None
delay_grad_reduce ............................... True
delay_param_gather .............................. False
dim_model_base .................................. None
dino_bottleneck_size ............................ 256
dino_freeze_last_layer .......................... 1
dino_head_hidden_size ........................... 2048
dino_local_crops_number ......................... 10
dino_local_img_size ............................. 96
dino_norm_last_layer ............................ False
dino_teacher_temp ............................... 0.07
dino_warmup_teacher_temp ........................ 0.04
dino_warmup_teacher_temp_epochs ................. 30
dist_ckpt_format ................................ torch_dist
distribute_saved_activations .................... False
distributed_backend ............................. nccl
distributed_timeout_minutes ..................... 120
dpo_beta ........................................ 0.1
dpo_ftx ......................................... 0.0
dpo_label_smoothing ............................. 0.0
dpo_loss_type ................................... sigmoid
embed_layernorm ................................. False
embedding_multiplier_scale ...................... 1.0
embedding_path .................................. None
empty_unused_memory_level ....................... 0
enable_chunk_memory ............................. False
enable_chunk_sequence ........................... False
enable_hbmfault_repair .......................... False
enable_high_availability ........................ False
enable_one_logger ............................... False
enable_optimizer_state_local_copy ............... False
enable_recompute_layers_per_pp_rank ............. False
enable_token_rearrange_opt ...................... False
encoder_num_layers .............................. 48
encoder_seq_length .............................. 131072
end_weight_decay ................................ 0.0
eod_mask_loss ................................... False
eval_interval ................................... 100
eval_iters ...................................... 0
evidence_data_path .............................. None
exit_duration_in_mins ........................... None
exit_interval ................................... None
exit_on_missing_checkpoint ...................... False
exit_signal_handler ............................. False
expert_interval ................................. 1
expert_model_parallel_size ...................... 1
ffn_hidden_size ................................. 13824
fill_neg_inf .................................... False
finetune ........................................ True
first_k_dense_replace ........................... None
first_pipeline_num_layers ....................... 0
fp16 ............................................ False
fp16_lm_cross_entropy ........................... False
fp32_residual_connection ........................ False
fp8 ............................................. None
fp8_amax_compute_algo ........................... most_recent
fp8_amax_history_len ............................ 1
fp8_interval .................................... 1
fp8_margin ...................................... 0
fp8_wgrad ....................................... True
full_shuffle_instruction_dataset ................ False
geglu ........................................... False
gelu_tanh ....................................... False
global_batch_size ............................... 64
gradient_accumulation_fusion .................... False
group_query_attention ........................... True
head_lr_mult .................................... 1.0
hidden_dropout .................................. 0.0
hidden_size ..................................... 5120
high_freq_factor ................................ None
hysteresis ...................................... 2
ict_head_size ................................... None
ict_load ........................................ None
image_size ...................................... 448
image_token_length .............................. 256
img_h ........................................... 224
img_w ........................................... 224
independent_parallel ............................ False
indexer_batch_size .............................. 128
indexer_log_interval ............................ 1000
inference_batch_times_seqlen_threshold .......... 512
init_method_std ................................. 0.01
init_method_xavier_uniform ...................... False
initial_loss_scale .............................. 4096.0
input_embeds_norm ............................... False
input_jitter .................................... True
input_layernorm_in_fp32 ......................... False
interleave_sliding_window ....................... None
is_instruction_dataset .......................... True
is_pairwise_dataset ............................. False
iter_per_epoch .................................. 1250
jit_compile ..................................... False
kv_channels ..................................... 128
kv_head_repeat_before_uly_alltoall .............. True
kv_lora_rank .................................... None
language_model_freeze ........................... False
lazy_mpu_init ................................... None
load ............................................ /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
load_checkpoint_loosely ......................... False
local_rank ...................................... None
log_batch_size_to_tensorboard ................... False
log_interval .................................... 1
log_learning_rate_to_tensorboard ................ True
log_loss_scale_to_tensorboard ................... True
log_memory_to_tensorboard ....................... False
log_num_zeros_in_grad ........................... False
log_params_norm ................................. False
log_progress .................................... False
log_throughput .................................. True
log_timers_to_tensorboard ....................... False
log_validation_ppl_to_tensorboard ............... False
log_world_size_to_tensorboard ................... False
logit_mask ...................................... False
lora_alpha ...................................... 32
lora_fusion ..................................... False
lora_load ....................................... None
lora_modules_to_save ............................ None
lora_r .......................................... 16
lora_register_forward_hook ...................... ['word_embeddings', 'input_layernorm']
lora_target_modules ............................. []
loss_scale ...................................... None
loss_scale_window ............................... 1000
low_freq_factor ................................. None
lr .............................................. 5e-06
lr_decay_iters .................................. None
lr_decay_samples ................................ None
lr_decay_style .................................. cosine
lr_warmup_fraction .............................. 0.03
lr_warmup_init .................................. 0.0
lr_warmup_iters ................................. 0
lr_warmup_samples ............................... 0
make_vocab_size_divisible_by .................... 1
manual_gc ....................................... False
manual_gc_eval .................................. True
manual_gc_interval .............................. 0
mask_factor ..................................... 1.0
mask_prob ....................................... 0.15
mask_type ....................................... random
masked_softmax_fusion ........................... False
max_fps ......................................... 1
max_num_frame ................................... 512
max_num_image ................................... 8
max_patch_grid .................................. 12
max_position_embeddings ......................... 131072
max_tokens_to_oom ............................... 12000
merge_file ...................................... None
micro_batch_size ................................ 1
min_loss_scale .................................. 1.0
min_lr .......................................... 1e-07
min_patch_grid .................................. 1
mmap_bin_files .................................. True
mock_data ....................................... False
moe_allgather_overlap_comm ...................... False
moe_alltoall_overlap_comm ....................... False
moe_aux_loss_coeff .............................. 0.0
moe_comm_aux_loss_coeff ......................... 0.0
moe_device_level_aux_loss_coeff ................. 0.0
moe_expert_capacity_factor ...................... None
moe_grouped_gemm ................................ False
moe_input_jitter_eps ............................ None
moe_intermediate_size ........................... None
moe_layer_freq .................................. None
moe_pad_expert_input_to_capacity ................ False
moe_per_layer_logging ........................... False
moe_permutation_async_comm ...................... False
moe_router_load_balancing_type .................. aux_loss
moe_router_topk ................................. 2
moe_token_dispatcher_type ....................... allgather
moe_token_drop_policy ........................... probs
moe_token_dropping .............................. False
moe_tp_extend_ep ................................ False
moe_train_capacity_factor ....................... 1.0
moe_without_activation .......................... False
moe_z_loss_coeff ................................ 0.0
moe_zero_memory ................................. disable
multi_head_latent_attention ..................... False
n_shared_experts ................................ None
nccl_communicator_config_path ................... None
next_tockens .................................... 0
no_load_optim ................................... True
no_load_rng ..................................... True
no_persist_layer_norm ........................... False
no_post_layer_norm .............................. False
no_save_optim ................................... None
no_save_rng ..................................... None
no_shared_storage ............................... False
no_shuffle ...................................... False
noisy_gate_policy ............................... None
noop_layers ..................................... None
norm_epsilon .................................... 1e-06
norm_topk_prob .................................. False
normalization ................................... RMSNorm
num_attention_heads ............................. 40
num_channels .................................... 3
num_classes ..................................... 1000
num_experts ..................................... None
num_layer_list .................................. None
num_layers ...................................... 48
num_layers_per_virtual_pipeline_stage ........... None
num_query_groups ................................ 8
num_workers ..................................... 8
one_logger_entity ............................... hwinf_dcm
one_logger_project .............................. e2e-tracking
one_logger_run_name ............................. None
onnx_safe ....................................... None
openai_gelu ..................................... False
optimizer ....................................... adam
original_max_position_embeddings ................ None
output_bert_embeddings .......................... False
output_layer_slice_num .......................... 1
output_logit_softcapping ........................ None
output_multiplier_scale ......................... None
overlap_grad_reduce ............................. True
overlap_p2p_comm ................................ False
overlap_param_gather ............................ False
override_opt_param_scheduler .................... False
pad_to_multiple_of .............................. 8
padded_vocab_size ............................... 152064
params_dtype .................................... torch.bfloat16
patch_dim ....................................... 16
perform_initialization .......................... True
pipeline_model_parallel_size .................... 1
pipeline_model_parallel_split_rank .............. None
position_embedding_type ......................... rope
post_norm ....................................... False
pre_tockens ..................................... 65536
pref_ftx ........................................ 0.0
pretrained_checkpoint ........................... None
profile ......................................... False
profile_level ................................... level0
profile_ranks ................................... [-1]
profile_record_shapes ........................... False
profile_save_path ............................... ./profile_dir
profile_step_end ................................ 12
profile_step_start .............................. 10
profile_with_cpu ................................ False
profile_with_memory ............................. False
profile_with_stack .............................. False
prompt_format ................................... qwen2
prompt_type ..................................... None
q_lora_rank ..................................... None
qk_layernorm .................................... False
qk_nope_head_dim ................................ None
qk_rope_head_dim ................................ None
query_in_block_prob ............................. 0.1
query_pre_attn_scalar ........................... None
rampup_batch_size ............................... None
rank ............................................ 0
recompute_activation_function ................... False
recompute_activation_function_num_layers ........ None
recompute_granularity ........................... full
recompute_in_advance ............................ False
recompute_in_bubble ............................. False
recompute_method ................................ block
recompute_num_layers ............................ 48
reduce_recompute_for_last_chunk ................. False
ref_model ....................................... None
reset_attention_mask ............................ False
reset_position_ids .............................. False
retriever_report_topk_accuracies ................ []
retriever_score_scaling ......................... False
retriever_seq_length ............................ 256
retro_add_retriever ............................. False
retro_attention_gate ............................ 1
retro_cyclic_train_iters ........................ None
retro_encoder_attention_dropout ................. 0.1
retro_encoder_hidden_dropout .................... 0.1
retro_encoder_layers ............................ 2
retro_num_neighbors ............................. 2
retro_num_retrieved_chunks ...................... 2
retro_project_dir ............................... None
retro_verify_neighbor_count ..................... True
reuse_fp32_param ................................ False
rope_scaling_beta_fast .......................... 32
rope_scaling_beta_slow .......................... 1
rope_scaling_factor ............................. 1.0
rope_scaling_mscale ............................. 1.0
rope_scaling_mscale_all_dim ..................... 0.0
rope_scaling_original_max_position_embeddings ... None
rope_scaling_type ............................... None
rotary_base ..................................... 1000000.0
rotary_interleaved .............................. False
rotary_percent .................................. 1.0
rotary_seq_len_interpolation_factor ............. None
routed_scaling_factor ........................... None
sample_rate ..................................... 1.0
save ............................................ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
save_interval ................................... 100
scale_depth ..................................... None
scale_emb ....................................... None
scatter_gather_tensors_in_pipeline .............. True
seed ............................................ 424242
seq_aux ......................................... False
seq_length ...................................... 131072
sequence_parallel ............................... True
sgd_momentum .................................... 0.9
shape_order ..................................... SBH
shared_expert_gate .............................. False
shared_expert_gate_output_dimension ............. 1
short_seq_prob .................................. 0.1
skip_bias_add ................................... True
skip_train ...................................... False
sliding_window .................................. None
sparse_mode ..................................... 4
spec ............................................ None
split ........................................... 100,0,0
square_alibi_mask ............................... False
squared_relu .................................... False
stage ........................................... None
standalone_embedding_stage ...................... False
start_weight_decay .............................. 0.0
swap_attention .................................. False
swap_modules .................................... None
swiglu .......................................... True
swin_backbone_type .............................. tiny
tensor_model_parallel_size ...................... 8
tensorboard_dir ................................. None
tensorboard_log_interval ........................ 1
tensorboard_queue_size .......................... 1000
test_data_path .................................. None
test_mode ....................................... False
timing_log_level ................................ 0
timing_log_option ............................... minmax
titles_data_path ................................ None
tokenizer_kwargs ................................ None
tokenizer_model ................................. None
tokenizer_name_or_path .......................... /data_4/models/Qwen/Qwen2.5-14B-Instruct/
tokenizer_not_use_fast .......................... True
tokenizer_padding_side .......................... right
tokenizer_type .................................. PretrainedFromHF
topk_group ...................................... None
tp_2d ........................................... False
tp_comm_bulk_dgrad .............................. True
tp_comm_bulk_wgrad .............................. True
tp_comm_overlap ................................. False
tp_comm_overlap_ag .............................. True
tp_comm_overlap_cfg ............................. None
tp_comm_overlap_rs .............................. True
tp_comm_split_ag ................................ True
tp_comm_split_rs ................................ True
tp_x ............................................ 1
tp_y ............................................ 1
train_data_path ................................. None
train_iters ..................................... 1000
train_samples ................................... None
transformer_impl ................................ local
transformer_pipeline_model_parallel_size ........ 1
ulysses_degree_in_cp ............................ None
untie_embeddings_and_output_weights ............. True
use_checkpoint_args ............................. False
use_checkpoint_opt_param_scheduler .............. False
use_cp_send_recv_overlap ........................ True
use_cpu_initialization .......................... None
use_deter_comp .................................. False
use_dist_ckpt ................................... False
use_distributed_optimizer ....................... True
use_flash_attn .................................. True
use_fused_moe_token_permute_and_unpermute ....... False
use_fused_ring_attention_update ................. False
use_fused_rmsnorm ............................... True
use_fused_rotary_pos_emb ........................ True
use_fused_swiglu ................................ True
use_glm_rope .................................... False
use_mc2 ......................................... False
use_mcore_models ................................ True
use_one_sent_docs ............................... False
use_ring_exchange_p2p ........................... False
use_rotary_position_embeddings .................. True
v_head_dim ...................................... None
valid_data_path ................................. None
variable_seq_lengths ............................ False
virtual_pipeline_model_parallel_size ............ None
vision_backbone_type ............................ vit
vision_context_parallel ......................... False
vision_downsample_ratio ......................... 0.5
vision_downsample_stride ........................ 1.0
vision_model_freeze ............................. True
vision_model_lr_decay_rate ...................... 1.0
vision_model_lr_mult ............................ 1.0
vision_model_recompute .......................... False
vision_model_type ............................... intern_300m
vision_normalize_type ........................... imagenet
vision_pretraining .............................. False
vision_pretraining_type ......................... classify
vision_process_type ............................. dynamic
vision_projector_freeze ......................... False
vision_projector_pre_norm ....................... True
vision_projector_recompute ...................... False
vision_projector_type ........................... mlp
vision_seq_length ............................... 1025
vit_load ........................................ /
vocab_extra_ids ................................. 0
vocab_file ...................................... None
vocab_size ...................................... None
wandb_exp_name ..................................
wandb_project ...................................
wandb_save_dir ..................................
weight_decay .................................... 0.0
weight_decay_incr_style ......................... constant
world_size ...................................... 512
yaml_cfg ........................................ None
-------------------- end of ModelLink Arguments ---------------------
setting number of micro-batches to constant 2
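"setting number of micro-batches to constant 2" follows directly from the batch arguments above; a worked check with values copied from the dump:

global_batch_size, micro_batch_size, dp = 64, 1, 32
num_micro_batches = global_batch_size // (micro_batch_size * dp)
assert num_micro_batches == 2
# similarly, lr_warmup_fraction=0.03 of train_iters=1000 implies 30 warmup iterations of the cosine schedule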
> building PretrainFromHF tokenizer. Vocab file is un-used, loading tokenizer from pre-trained model
> initializing torch distributed ...
[W ProcessGroupHCCL.cpp:678] Warning: The HCCL execution timeout 7200000ms is bigger than watchdog timeout 1800000ms which is set by init_process_group! The plog may not be recorded. (function ProcessGroupHCCL)
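The watchdog warning notes that HCCL's execution timeout (7200000 ms) exceeds the 1800000 ms watchdog configured through init_process_group, so the plog may be lost on a hang. One hedged way to align the two, assuming the "hccl" backend name registered by torch_npu and an environment set up as in this job, is to pass a matching timeout when the process group is created:

import datetime
import torch.distributed as dist

# raise the watchdog timeout to match the 7200 s HCCL execution timeout
# (assumption: "hccl" is the backend name provided by torch_npu)
dist.init_process_group(
    backend="hccl",
    timeout=datetime.timedelta(milliseconds=7_200_000),
)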
all tp groups: 64 groups of 8 consecutive ranks: [0, 1, ..., 7], [8, 9, ..., 15], ..., [504, 505, ..., 511]
all ep groups: 512 singleton groups, one per rank (expert model parallel size 1)
all dp groups: 16 groups of 32 ranks at stride 16: [0, 16, 32, ..., 496], [1, 17, ..., 497], ..., [15, 31, ..., 511]
all_dp_modulo_exp_group_ranks: 8 groups of 64 ranks at stride 8: [0, 8, 16, ..., 504], [1, 9, ..., 505], ..., [7, 15, ..., 511]
all_tensor_and_expert_group_ranks: identical to the tp groups (64 groups of 8 consecutive ranks)
all_data_parallel_group_ranks_with_cp: identical to all_dp_modulo_exp_group_ranks (8 groups of 64 ranks at stride 8, dp 32 x cp 2)
> initialized tensor model parallel with size 8
> initialized pipeline model parallel with size 1
> setting random seeds to 424242 ...
> compiling dataset index builder ...
make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
>>> done with dataset index builder. Compilation time: 0.074 seconds
time to initialize megatron (seconds): -34.653
[after megatron is initialized] datetime: 2024-11-27 12:45:17
building GPT model ...
Building megatron mcore vision language model ...
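The rank groupings condensed above follow mechanically from tp=8, cp=2, dp=32 over 512 ranks. A small sketch that reconstructs them, as pure illustration rather than Megatron's actual initializer:

world, tp, cp = 512, 8, 2
tp_groups = [list(range(start, start + tp)) for start in range(0, world, tp)]   # "all tp groups"
dp_groups = [list(range(lead, world, tp * cp)) for lead in range(tp * cp)]      # "all dp groups"
dp_cp_groups = [list(range(lead, world, tp)) for lead in range(tp)]             # "...group_ranks_with_cp"
assert tp_groups[0] == [0, 1, 2, 3, 4, 5, 6, 7]
assert dp_groups[0][:4] == [0, 16, 32, 48] and len(dp_groups[0]) == 32
assert dp_cp_groups[7][-1] == 511 and len(dp_cp_groups[7]) == 64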
vision_projector_recompute False
model_provider args Namespace(accumulate_allreduce_grads_in_fp32=True, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, adaptive_cp_dynamic_attn_mask=False, adaptive_cp_manually_set_mask_list=False, adaptive_cp_only_reschedule=False, adaptive_cp_without_coarse=False, adaptive_recompute_device_size=-1, adaptive_recompute_device_swap=False, adaptive_recompute_profiling_step=10, add_bias_linear=False, add_class_token=True, add_dense_bias=False, add_position_embedding=True, add_qkv_bias=True, add_rmsnorm_offset=False, adlr_autoresume=False, adlr_autoresume_interval=1000, apply_layernorm_1p=False, apply_query_key_layer_scaling=False, apply_residual_connection_post_layernorm=False, apply_rope_fusion=True, async_tensor_model_parallel_allreduce=False, attention_dropout=0.0, attention_mask_on_cpu=False, attention_softmax_in_fp32=True, attn_logit_softcapping=None, auto_detect_ckpt_format=False, barrier_with_L1_time=True, bert_binary_head=True, bert_embedder_type='megatron', bert_load=None, bf16=True, bias_dropout_fusion=True, bias_gelu_fusion=False, bias_swiglu_fusion=True, biencoder_projection_dim=0, biencoder_shared_query_context_model=False, block_data_path=None, check_for_nan_in_loss_and_grad=True, chunk_size=4096, ckpt_fully_parallel_save=False, ckpt_step=None, classes_fraction=1.0, clip_grad=1.0, clone_scatter_output_in_embedding=True, consumed_train_samples=0, consumed_valid_samples=0, context_parallel_algo='megatron_cp_algo', context_parallel_size=2, cp_attention_mask_type='causal', cp_window_size=1, create_attention_mask_in_dataloader=False, cross_dataset_joint=True, data_cache_path=None, data_parallel_random_init=False, data_parallel_size=32, data_path=['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml'], data_per_class_fraction=1.0, data_seq_length=131072, data_sharding=True, dataloader_type='single', decoder_num_layers=None, decoder_seq_length=None, decoupled_lr=None, decoupled_min_lr=None, delay_grad_reduce=True, delay_param_gather=False, dim_model_base=None, dino_bottleneck_size=256, dino_freeze_last_layer=1, dino_head_hidden_size=2048, dino_local_crops_number=10, dino_local_img_size=96, dino_norm_last_layer=False, dino_teacher_temp=0.07, dino_warmup_teacher_temp=0.04, dino_warmup_teacher_temp_epochs=30, dist_ckpt_format='torch_dist', distribute_saved_activations=False, distributed_backend='nccl', distributed_timeout_minutes=120, dpo_beta=0.1, dpo_ftx=0.0, dpo_label_smoothing=0.0, dpo_loss_type='sigmoid', embed_layernorm=False, embedding_multiplier_scale=1.0, embedding_path=None, empty_unused_memory_level=0, enable_chunk_memory=False, enable_chunk_sequence=False, enable_hbmfault_repair=False, enable_high_availability=False, enable_one_logger=False, enable_optimizer_state_local_copy=False, enable_recompute_layers_per_pp_rank=False, enable_token_rearrange_opt=False, encoder_num_layers=48, encoder_seq_length=131072, end_weight_decay=0.0, eod_mask_loss=False, eval_interval=100, eval_iters=0, evidence_data_path=None, exit_duration_in_mins=None, exit_interval=None, exit_on_missing_checkpoint=False, exit_signal_handler=False, expert_interval=1,
expert_model_parallel_size=1, ffn_hidden_size=13824, fill_neg_inf=False, finetune=True, first_k_dense_replace=None, first_pipeline_num_layers=0, fp16=False, fp16_lm_cross_entropy=False, fp32_residual_connection=False, fp8=None, fp8_amax_compute_algo='most_recent', fp8_amax_history_len=1, fp8_interval=1, fp8_margin=0, fp8_wgrad=True, full_shuffle_instruction_dataset=False, geglu=False, gelu_tanh=False, global_batch_size=64, gradient_accumulation_fusion=False, group_query_attention=True, head_lr_mult=1.0, hidden_dropout=0.0, hidden_size=5120, high_freq_factor=None, hysteresis=2, ict_head_size=None, ict_load=None, image_size=448, image_token_length=256, img_h=224, img_w=224, independent_parallel=False, indexer_batch_size=128, indexer_log_interval=1000, inference_batch_times_seqlen_threshold=512, init_method_std=0.01, init_method_xavier_uniform=False, initial_loss_scale=4096.0, input_embeds_norm=False, input_jitter=True, input_layernorm_in_fp32=False, interleave_sliding_window=None, is_instruction_dataset=True, is_pairwise_dataset=False, iter_per_epoch=1250, jit_compile=False, kv_channels=128, kv_head_repeat_before_uly_alltoall=True, kv_lora_rank=None, language_model_freeze=False, lazy_mpu_init=None, load='/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/', load_checkpoint_loosely=False, local_rank=0, log_batch_size_to_tensorboard=False, log_interval=1, log_learning_rate_to_tensorboard=True, log_loss_scale_to_tensorboard=True, log_memory_to_tensorboard=False, log_num_zeros_in_grad=False, log_params_norm=False, log_progress=False, log_throughput=True, log_timers_to_tensorboard=False, log_validation_ppl_to_tensorboard=False, log_world_size_to_tensorboard=False, logit_mask=False, lora_alpha=32, lora_fusion=False, lora_load=None, lora_modules_to_save=None, lora_r=16, lora_register_forward_hook=['word_embeddings', 'input_layernorm'], lora_target_modules=[], loss_scale=None, loss_scale_window=1000, low_freq_factor=None, lr=5e-06, lr_decay_iters=None, lr_decay_samples=None, lr_decay_style='cosine', lr_warmup_fraction=0.03, lr_warmup_init=0.0, lr_warmup_iters=0, lr_warmup_samples=0, make_vocab_size_divisible_by=1, manual_gc=False, manual_gc_eval=True, manual_gc_interval=0, mask_factor=1.0, mask_prob=0.15, mask_type='random', masked_softmax_fusion=False, max_fps=1, max_num_frame=512, max_num_image=8, max_patch_grid=12, max_position_embeddings=131072, max_tokens_to_oom=12000, merge_file=None, micro_batch_size=1, min_loss_scale=1.0, min_lr=1e-07, min_patch_grid=1, mmap_bin_files=True, mock_data=False, model_type=, moe_allgather_overlap_comm=False, moe_alltoall_overlap_comm=False, moe_aux_loss_coeff=0.0, moe_comm_aux_loss_coeff=0.0, moe_device_level_aux_loss_coeff=0.0, moe_expert_capacity_factor=None, moe_grouped_gemm=False, moe_input_jitter_eps=None, moe_intermediate_size=None, moe_layer_freq=None, moe_pad_expert_input_to_capacity=False, moe_per_layer_logging=False, moe_permutation_async_comm=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_token_dispatcher_type='allgather', moe_token_drop_policy='probs', moe_token_dropping=False, moe_tp_extend_ep=False, moe_train_capacity_factor=1.0, moe_without_activation=False, moe_z_loss_coeff=0.0, moe_zero_memory='disable', multi_head_latent_attention=False, n_shared_experts=None, nccl_communicator_config_path=None, next_tockens=0, no_load_optim=True, no_load_rng=True, no_persist_layer_norm=False, no_post_layer_norm=False, no_save_optim=None, no_save_rng=None, 
no_shared_storage=False, no_shuffle=False, noisy_gate_policy=None, noop_layers=None, norm_epsilon=1e-06, norm_topk_prob=False, normalization='RMSNorm', num_attention_heads=40, num_channels=3, num_classes=1000, num_experts=None, num_layer_list=None, num_layers=48, num_layers_per_virtual_pipeline_stage=None, num_query_groups=8, num_workers=8, one_logger_entity='hwinf_dcm', one_logger_project='e2e-tracking', one_logger_run_name=None, onnx_safe=None, openai_gelu=False, optimizer='adam', original_max_position_embeddings=None, output_bert_embeddings=False, output_layer_slice_num=1, output_logit_softcapping=None, output_multiplier_scale=None, overlap_grad_reduce=True, overlap_p2p_comm=False, overlap_param_gather=False, override_opt_param_scheduler=False, pad_to_multiple_of=8, padded_vocab_size=152064, params_dtype=torch.bfloat16, patch_dim=16, perform_initialization=True, pipeline_model_parallel_size=1, pipeline_model_parallel_split_rank=None, position_embedding_type='rope', post_norm=False, pre_tockens=65536, pref_ftx=0.0, pretrained_checkpoint=None, profile=False, profile_level='level0', profile_ranks=[-1], profile_record_shapes=False, profile_save_path='./profile_dir', profile_step_end=12, profile_step_start=10, profile_with_cpu=False, profile_with_memory=False, profile_with_stack=False, prompt_format='qwen2', prompt_type=None, q_lora_rank=None, qk_layernorm=False, qk_nope_head_dim=None, qk_rope_head_dim=None, query_in_block_prob=0.1, query_pre_attn_scalar=None, rampup_batch_size=None, rank=0, recompute_activation_function=False, recompute_activation_function_num_layers=None, recompute_granularity='full', recompute_in_advance=False, recompute_in_bubble=False, recompute_method='block', recompute_num_layers=48, reduce_recompute_for_last_chunk=False, ref_model=None, reset_attention_mask=False, reset_position_ids=False, retriever_report_topk_accuracies=[], retriever_score_scaling=False, retriever_seq_length=256, retro_add_retriever=False, retro_attention_gate=1, retro_cyclic_train_iters=None, retro_encoder_attention_dropout=0.1, retro_encoder_hidden_dropout=0.1, retro_encoder_layers=2, retro_num_neighbors=2, retro_num_retrieved_chunks=2, retro_project_dir=None, retro_verify_neighbor_count=True, reuse_fp32_param=False, rope_scaling_beta_fast=32, rope_scaling_beta_slow=1, rope_scaling_factor=1.0, rope_scaling_mscale=1.0, rope_scaling_mscale_all_dim=0.0, rope_scaling_original_max_position_embeddings=None, rope_scaling_type=None, rotary_base=1000000.0, rotary_interleaved=False, rotary_percent=1.0, rotary_seq_len_interpolation_factor=None, routed_scaling_factor=None, sample_rate=1.0, save='/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//', save_interval=100, scale_depth=None, scale_emb=None, scatter_gather_tensors_in_pipeline=True, seed=424242, seq_aux=False, seq_length=131072, sequence_parallel=True, sgd_momentum=0.9, shape_order='SBH', shared_expert_gate=False, shared_expert_gate_output_dimension=1, short_seq_prob=0.1, skip_bias_add=True, skip_train=False, sliding_window=None, sparse_mode=4, spec=None, split='100,0,0', square_alibi_mask=False, squared_relu=False, stage=None, standalone_embedding_stage=False, start_weight_decay=0.0, swap_attention=False, swap_modules=None, swiglu=True, swin_backbone_type='tiny', tensor_model_parallel_size=8, tensorboard_dir=None, tensorboard_log_interval=1, tensorboard_queue_size=1000, test_data_path=None, test_mode=False, timing_log_level=0, timing_log_option='minmax', titles_data_path=None, 
tokenizer_kwargs=None, tokenizer_model=None, tokenizer_name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', tokenizer_not_use_fast=True, tokenizer_padding_side='right', tokenizer_type='PretrainedFromHF', topk_group=None, tp_2d=False, tp_comm_bulk_dgrad=True, tp_comm_bulk_wgrad=True, tp_comm_overlap=False, tp_comm_overlap_ag=True, tp_comm_overlap_cfg=None, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_split_rs=True, tp_x=1, tp_y=1, train_data_path=None, train_iters=1000, train_samples=None, transformer_impl='local', transformer_pipeline_model_parallel_size=1, ulysses_degree_in_cp=None, untie_embeddings_and_output_weights=True, use_checkpoint_args=False, use_checkpoint_opt_param_scheduler=False, use_cp_send_recv_overlap=True, use_cpu_initialization=None, use_deter_comp=False, use_dist_ckpt=False, use_distributed_optimizer=True, use_flash_attn=True, use_fused_moe_token_permute_and_unpermute=False, use_fused_ring_attention_update=False, use_fused_rmsnorm=True, use_fused_rotary_pos_emb=True, use_fused_swiglu=True, use_glm_rope=False, use_mc2=False, use_mcore_models=True, use_one_sent_docs=False, use_ring_exchange_p2p=False, use_rotary_position_embeddings=True, v_head_dim=None, valid_data_path=None, variable_seq_lengths=False, virtual_pipeline_model_parallel_size=None, vision_backbone_type='vit', vision_context_parallel=False, vision_downsample_ratio=0.5, vision_downsample_stride=1.0, vision_model_freeze=True, vision_model_lr_decay_rate=1.0, vision_model_lr_mult=1.0, vision_model_recompute=False, vision_model_type='intern_300m', vision_normalize_type='imagenet', vision_pretraining=False, vision_pretraining_type='classify', vision_process_type='dynamic', vision_projector_freeze=False, vision_projector_pre_norm=True, vision_projector_recompute=False, vision_projector_type='mlp', vision_seq_length=1025, vit_load='/', vocab_extra_ids=0, vocab_file=None, vocab_size=None, wandb_exp_name='', wandb_project='', wandb_save_dir='', weight_decay=0.0, weight_decay_incr_style='constant', world_size=512, yaml_cfg=None)
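The parallel layout in the Namespace above is internally consistent, and the data-parallel and gradient-accumulation figures follow from it. A quick arithmetic check (plain Python; the variable names are illustrative, not Megatron's):

    # Editor's sketch: reproduce the bookkeeping implied by the args above.
    world_size = 512                    # NPUs in the job
    tp, pp, cp = 8, 1, 2                # tensor / pipeline / context parallel sizes
    dp = world_size // (tp * pp * cp)   # data-parallel replicas
    assert dp == 32                     # matches data_parallel_size=32

    micro, global_batch = 1, 64         # micro_batch_size, global_batch_size
    accum = global_batch // (micro * dp)
    assert accum == 2                   # two micro-batches accumulated per optimizer step

    # lr_warmup_fraction=0.03 of train_iters=1000 gives 30 warmup iterations.
    assert int(0.03 * 1000) == 30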
model_provider config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=13824, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=True, activation_func=<...>, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=<function init_method_normal.<locals>.init_ at 0x7fb2591db310>, output_layer_init_method=<function scaled_init_method_normal.<locals>.init_ at 0x7fb23001b310>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=True, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0)
model_provider transformer_layer_spec ModuleSpec(module=<...>, params={}, submodules=TransformerLayerSubmodules(input_layernorm=<...>, self_attention=ModuleSpec(module=<...>, params={'attn_mask_type': <...>}, submodules=SelfAttentionSubmodules(linear_qkv=<...>, core_attention=<...>, linear_proj=<...>, q_layernorm=<...>, k_layernorm=<...>)), post_attn_norm=<...>, self_attn_bda=<...>, pre_cross_attn_layernorm=<...>, cross_attention=<...>, cross_attn_bda=<...>, pre_mlp_layernorm=<...>, mlp=ModuleSpec(module=<...>, params={}, submodules=MLPSubmodules(linear_fc1=<...>, linear_fc2=<...>)), post_mlp_layernorm=<...>, mlp_bda=<...>, sharded_state_dict_keys_map={'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_'}))
Building intern_300m model ...
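With num_attention_heads=40, num_query_groups=8, and kv_channels=128 sharded over tensor_model_parallel_size=8, the fused QKV projection of the language model divides evenly across ranks. The arithmetic, as an illustrative sketch (names are not Megatron's):

    heads, groups, head_dim, tp = 40, 8, 128, 8
    q_rows  = (heads // tp) * head_dim        # 5 query heads per rank -> 640 rows
    kv_rows = 2 * (groups // tp) * head_dim   # 1 K and 1 V group per rank -> 256 rows
    assert q_rows + kv_rows == 896            # per-rank fused-QKV output rows
    assert (q_rows + kv_rows) * tp == (40 + 2 * 8) * 128   # 7168 rows in total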
------------------------ vit_args ------------------------
accumulate_allreduce_grads_in_fp32 ........ True
activation_func ........ <...>
adam_beta1 ........ 0.9
adam_beta2 ........ 0.999
adam_eps ........ 1e-08
adaptive_cp_dynamic_attn_mask ........ False
adaptive_cp_manually_set_mask_list ........ False
adaptive_cp_only_reschedule ........ False
adaptive_cp_without_coarse ........ False
adaptive_recompute_device_size ........ -1
adaptive_recompute_device_swap ........ False
adaptive_recompute_profiling_step ........ 10
add_bias_linear ........ True
add_class_token ........ True
add_dense_bias ........ False
add_position_embedding ........ True
add_qkv_bias ........ True
add_rmsnorm_offset ........ False
adlr_autoresume ........ False
adlr_autoresume_interval ........ 1000
apply_layernorm_1p ........ False
apply_query_key_layer_scaling ........ False
apply_residual_connection_post_layernorm ........ False
apply_rope_fusion ........ False
async_tensor_model_parallel_allreduce ........ False
attention_dropout ........ 0.0
attention_mask_on_cpu ........ False
attention_softmax_in_fp32 ........ True
attn_logit_softcapping ........ None
auto_detect_ckpt_format ........ False
barrier_with_L1_time ........ True
bert_binary_head ........ True
bert_embedder_type ........ megatron
bert_load ........ None
bf16 ........ True
bias_activation_fusion ........ False
bias_dropout_fusion ........ False
bias_gelu_fusion ........ False
bias_swiglu_fusion ........ True
biencoder_projection_dim ........ 0
biencoder_shared_query_context_model ........ False
block_data_path ........ None
check_for_nan_in_loss_and_grad ........ True
chunk_size ........ 4096
ckpt_fully_parallel_save ........ False
ckpt_step ........ None
classes_fraction ........ 1.0
clip_grad ........ 1.0
clone_scatter_output_in_embedding ........ True
consumed_train_samples ........ 0
consumed_valid_samples ........ 0
context_parallel_algo ........ megatron_cp_algo
context_parallel_size ........ 1
cp_attention_mask_type ........ causal
cp_window_size ........ 1
create_attention_mask_in_dataloader ........ False
cross_dataset_joint ........ True
data_cache_path ........ None
data_parallel_random_init ........ False
data_parallel_size ........ 32
data_path ........ ['/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml']
data_per_class_fraction ........ 1.0
data_seq_length ........ 131072
data_sharding ........ True
dataloader_type ........ single
decoder_num_layers ........ None
decoder_seq_length ........ 1025
decoupled_lr ........ None
decoupled_min_lr ........ None
delay_grad_reduce ........ True
delay_param_gather ........ False
dim_model_base ........ None
dino_bottleneck_size ........ 256
dino_freeze_last_layer ........ 1
dino_head_hidden_size ........ 2048
dino_local_crops_number ........ 10
dino_local_img_size ........ 96
dino_norm_last_layer ........ False
dino_teacher_temp ........ 0.07
dino_warmup_teacher_temp ........ 0.04
dino_warmup_teacher_temp_epochs ........ 30
dist_ckpt_format ........ torch_dist
distribute_saved_activations ........ False
distributed_backend ........ nccl
distributed_timeout_minutes ........ 120
dpo_beta ........ 0.1
dpo_ftx ........ 0.0
dpo_label_smoothing ........ 0.0
dpo_loss_type ........ sigmoid
embed_layernorm ........ False
embedding_multiplier_scale ........ 1.0
embedding_path ........ None
empty_unused_memory_level ........ 0
enable_chunk_memory ........ False
enable_chunk_sequence ........ False
enable_hbmfault_repair ........ False
enable_high_availability ........ False
enable_one_logger ........ False
enable_optimizer_state_local_copy ........ False
enable_recompute_layers_per_pp_rank ........ False
enable_token_rearrange_opt ........ False
encoder_num_layers ........ 48
encoder_seq_length ........ 1025
end_weight_decay ........ 0.0
eod_mask_loss ........ False
eval_interval ........ 100
eval_iters ........ 0
evidence_data_path ........ None
exit_duration_in_mins ........ None
exit_interval ........ None
exit_on_missing_checkpoint ........ False
exit_signal_handler ........ False
expert_interval ........ 1
expert_model_parallel_size ........ 1
ffn_hidden_size ........ 4096
fill_neg_inf ........ False
finetune ........ True
first_k_dense_replace ........ None
first_pipeline_num_layers ........ 0
fp16 ........ False
fp16_lm_cross_entropy ........ False
fp32_residual_connection ........ False
fp8 ........ None
fp8_amax_compute_algo ........ most_recent
fp8_amax_history_len ........ 1
fp8_interval ........ 1
fp8_margin ........ 0
fp8_wgrad ........ True
full_shuffle_instruction_dataset ........ False
gated_linear_unit ........ False
geglu ........ False
gelu_tanh ........ False
global_batch_size ........ 64
gradient_accumulation_fusion ........ False
group_query_attention ........ False
head_lr_mult ........ 1.0
hidden_dropout ........ 0.0
hidden_size ........ 1024
high_freq_factor ........ None
hysteresis ........ 2
ict_head_size ........ None
ict_load ........ None
image_size ........ 448
image_token_length ........ 256
img_h ........ 448
img_w ........ 448
independent_parallel ........ True
indexer_batch_size ........ 128
indexer_log_interval ........ 1000
inference_batch_times_seqlen_threshold ........ 512
init_method_std ........ 0.01
init_method_xavier_uniform ........ False
initial_loss_scale ........ 4096.0
input_embeds_norm ........ False
input_jitter ........ True
input_layernorm_in_fp32 ........ False
interleave_sliding_window ........ None
is_instruction_dataset ........ True
is_pairwise_dataset ........ False
iter_per_epoch ........ 1250
jit_compile ........ False
kv_channels ........ 64
kv_head_repeat_before_uly_alltoall ........ True
kv_lora_rank ........ None
language_model_freeze ........ False
layernorm_zero_centered_gamma ........ False
lazy_mpu_init ........ None
load ........ /
load_checkpoint_loosely ........ False
local_rank ........ 0
log_batch_size_to_tensorboard ........ False
log_interval ........ 1
log_learning_rate_to_tensorboard ........ True
log_loss_scale_to_tensorboard ........ True
log_memory_to_tensorboard ........ False
log_num_zeros_in_grad ........ False
log_params_norm ........ False
log_progress ........ False
log_throughput ........ True
log_timers_to_tensorboard ........ False
log_validation_ppl_to_tensorboard ........ False
log_world_size_to_tensorboard ........ False
logit_mask ........ False
lora_alpha ........ 32
lora_fusion ........ False
lora_load ........ None
lora_modules_to_save ........ None
lora_r ........ 16
lora_register_forward_hook ........ ['word_embeddings', 'input_layernorm']
lora_target_modules ........ []
loss_scale ........ None
loss_scale_window ........ 1000
low_freq_factor ........ None
lr ........ 5e-06
lr_decay_iters ........ None
lr_decay_samples ........ None
lr_decay_style ........ cosine
lr_warmup_fraction ........ 0.03
lr_warmup_init ........ 0.0
lr_warmup_iters ........ 0
lr_warmup_samples ........ 0
make_vocab_size_divisible_by ........ 1
manual_gc ........ False
manual_gc_eval ........ True
manual_gc_interval ........ 0
mask_factor ........ 1.0
mask_prob ........ 0.15
mask_type ........ random
masked_softmax_fusion ........ False
max_fps ........ 1
max_num_frame ........ 512
max_num_image ........ 8
max_patch_grid ........ 12
max_position_embeddings ........ 1025
max_tokens_to_oom ........ 12000
merge_file ........ None
micro_batch_size ........ 1
min_loss_scale ........ 1.0
min_lr ........ 1e-07
min_patch_grid ........ 1
mmap_bin_files ........ True
mock_data ........ False
model_type ........ ModelType.encoder_or_decoder
moe_allgather_overlap_comm ........ False
moe_alltoall_overlap_comm ........ False
moe_aux_loss_coeff ........ 0.0
moe_comm_aux_loss_coeff ........ 0.0
moe_device_level_aux_loss_coeff ........ 0.0
moe_expert_capacity_factor ........ None
moe_grouped_gemm ........ False
moe_input_jitter_eps ........ None
moe_intermediate_size ........ None
moe_layer_freq ........ None
moe_pad_expert_input_to_capacity ........ False
moe_per_layer_logging ........ False
moe_permutation_async_comm ........ False
moe_router_load_balancing_type ........ aux_loss
moe_router_topk ........ 2
moe_token_dispatcher_type ........ allgather
moe_token_drop_policy ........ probs
moe_token_dropping ........ False
moe_tp_extend_ep ........ False
moe_train_capacity_factor ........ 1.0
moe_without_activation ........ False
moe_z_loss_coeff ........ 0.0
moe_zero_memory ........ disable
multi_head_latent_attention ........ False
n_shared_experts ........ None
nccl_communicator_config_path ........ None
next_tockens ........ 0
no_load_optim ........ True
no_load_rng ........ True
no_persist_layer_norm ........ False
no_post_layer_norm ........ False
no_save_optim ........ None
no_save_rng ........ None
no_shared_storage ........ False
no_shuffle ........ False
noisy_gate_policy ........ None
noop_layers ........ None
norm_epsilon ........ 1e-06
norm_topk_prob ........ False
normalization ........ LayerNorm
num_attention_heads ........ 16
num_channels ........ 3
num_classes ........ 1000
num_experts ........ None
num_layer_list ........
num_layers ........ 24
num_layers_per_virtual_pipeline_stage ........ None
num_query_groups ........ 16
num_workers ........ 8
one_logger_entity ........ hwinf_dcm
one_logger_project ........ e2e-tracking
one_logger_run_name ........ None
onnx_safe ........ None
openai_gelu ........ False
optimizer ........ adam
original_max_position_embeddings ........ None
output_bert_embeddings ........ False
output_layer_slice_num ........ 1
output_logit_softcapping ........ None
output_multiplier_scale ........ None
overlap_grad_reduce ........ False
overlap_p2p_comm ........ False
overlap_param_gather ........ False
override_opt_param_scheduler ........ False
pad_to_multiple_of ........ 8
padded_vocab_size ........ 152064
params_dtype ........ torch.bfloat16
patch_dim ........ 14
perform_initialization ........ True
pipeline_model_parallel_size ........ 1
pipeline_model_parallel_split_rank ........ None
position_embedding_type ........ rope
post_norm ........ False
pre_tockens ........ 65536
pref_ftx ........ 0.0
pretrained_checkpoint ........ None
profile ........ False
profile_level ........ level0
profile_ranks ........ [-1]
profile_record_shapes ........ False
profile_save_path ........ ./profile_dir
profile_step_end ........ 12
profile_step_start ........ 10
profile_with_cpu ........ False
profile_with_memory ........ False
profile_with_stack ........ False
prompt_format ........ qwen2
prompt_type ........ None
q_lora_rank ........ None
qk_layernorm ........ False
qk_nope_head_dim ........ None
qk_rope_head_dim ........ None
query_in_block_prob ........ 0.1
query_pre_attn_scalar ........ None
rampup_batch_size ........ None
rank ........ 0
recompute_activation_function ........ False
recompute_activation_function_num_layers ........ None
recompute_granularity ........ None
recompute_in_advance ........ False
recompute_in_bubble ........ False
recompute_method ........ None
recompute_num_layers ........ None
reduce_recompute_for_last_chunk ........ False
ref_model ........ None
reset_attention_mask ........ False
reset_position_ids ........ False
retriever_report_topk_accuracies ........ []
retriever_score_scaling ........ False
retriever_seq_length ........ 256
retro_add_retriever ........ False
retro_attention_gate ........ 1
retro_cyclic_train_iters ........ None
retro_encoder_attention_dropout ........ 0.1
retro_encoder_hidden_dropout ........ 0.1
retro_encoder_layers ........ 2
retro_num_neighbors ........ 2
retro_num_retrieved_chunks ........ 2
retro_project_dir ........ None
retro_verify_neighbor_count ........ True
reuse_fp32_param ........ False
rope_scaling_beta_fast ........ 32
rope_scaling_beta_slow ........ 1
rope_scaling_factor ........ 1.0
rope_scaling_mscale ........ 1.0
rope_scaling_mscale_all_dim ........ 0.0
rope_scaling_original_max_position_embeddings ........ None
rope_scaling_type ........ None
rotary_base ........ 1000000.0
rotary_interleaved ........ False
rotary_percent ........ 1.0
rotary_seq_len_interpolation_factor ........ None
routed_scaling_factor ........ None
sample_rate ........ 1.0
save ........ /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
save_interval ........ 100
scale_depth ........ None
scale_emb ........ None
scatter_gather_tensors_in_pipeline ........ True
seed ........ 424242
seq_aux ........ False
seq_length ........ 1025
sequence_parallel ........ False
sgd_momentum ........ 0.9
shape_order ........ SBH
shared_expert_gate ........ False
shared_expert_gate_output_dimension ........ 1
short_seq_prob ........ 0.1
skip_bias_add ........ True
skip_train ........ False
sliding_window ........ None
sparse_mode ........ 4
spec ........ None
split ........ 100,0,0
square_alibi_mask ........ False
squared_relu ........ False
stage ........ None
standalone_embedding_stage ........ False
start_weight_decay ........ 0.0
swap_attention ........ False
swap_modules ........ None
swiglu ........ False
swin_backbone_type ........ tiny
tensor_model_parallel_size ........ 8
tensorboard_dir ........ None
tensorboard_log_interval ........ 1
tensorboard_queue_size ........ 1000
test_data_path ........ None
test_mode ........ False
timing_log_level ........ 0
timing_log_option ........ minmax
titles_data_path ........ None
tokenizer_kwargs ........ None
tokenizer_model ........ None
tokenizer_name_or_path ........ /data_4/models/Qwen/Qwen2.5-14B-Instruct/
tokenizer_not_use_fast ........ True
tokenizer_padding_side ........ right
tokenizer_type ........ PretrainedFromHF
topk_group ........ None
tp_2d ........ False
tp_comm_bulk_dgrad ........ True
tp_comm_bulk_wgrad ........ True
tp_comm_overlap ........ False
tp_comm_overlap_ag ........ True
tp_comm_overlap_cfg ........ None
tp_comm_overlap_rs ........ True
tp_comm_split_ag ........ True
tp_comm_split_rs ........ True
tp_x ........ 1
tp_y ........ 1
train_data_path ........ None
train_iters ........ 1000
train_samples ........ None
transformer_impl ........ local
transformer_pipeline_model_parallel_size ........ 1
ulysses_degree_in_cp ........ None
untie_embeddings_and_output_weights ........ True
use_checkpoint_args ........ False
use_checkpoint_opt_param_scheduler ........ False
use_cp_send_recv_overlap ........ True
use_cpu_initialization ........ None
use_deter_comp ........ False
use_dist_ckpt ........ False
use_distributed_optimizer ........ True
use_flash_attn ........ True
use_fused_moe_token_permute_and_unpermute ........ False
use_fused_ring_attention_update ........ False
use_fused_rmsnorm ........ True
use_fused_rotary_pos_emb ........ True
use_fused_swiglu ........ True
use_glm_rope ........ False
use_mc2 ........ False
use_mcore_models ........ True
use_one_sent_docs ........ False
use_ring_exchange_p2p ........ False
use_rotary_position_embeddings ........ True
v_head_dim ........ None
valid_data_path ........ None
variable_seq_lengths ........ False
virtual_pipeline_model_parallel_size ........ None
vision_backbone_type ........ vit
vision_context_parallel ........ False
vision_downsample_ratio ........ 0.5
vision_downsample_stride ........ 1.0
vision_model_freeze ........ True
vision_model_lr_decay_rate ........ 1.0
vision_model_lr_mult ........ 1.0
vision_model_recompute ........ False
vision_model_type ........ intern_300m
vision_normalize_type ........ imagenet
vision_pretraining ........ False
vision_pretraining_type ........ classify
vision_process_type ........ dynamic
vision_projector_freeze ........ False
vision_projector_pre_norm ........ True
vision_projector_recompute ........ False
vision_projector_type ........ mlp
vision_seq_length ........ 1025
vit_load ........ /
vocab_extra_ids ........ 0
vocab_file ........ None
vocab_size ........ None
wandb_exp_name ........
wandb_project ........
wandb_save_dir ........
weight_decay ........ 0.0
weight_decay_incr_style ........ constant
world_size ........ 512
yaml_cfg ........ None
-------------------- end of vit_args ---------------------
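The vit_args hang together: a 448x448 image with a 14x14 patch stem gives 32x32 = 1024 patches, plus one class token = 1025, which is exactly the vision_seq_length, encoder_seq_length, and max_position_embeddings above; vision_downsample_ratio=0.5 then reduces the 32x32 grid to 16x16 = 256 tokens, matching image_token_length. As an illustrative check:

    img, patch = 448, 14
    grid = img // patch                  # 32 patches per side
    assert grid * grid + 1 == 1025       # + class token = vision_seq_length

    downsample = 0.5                     # vision_downsample_ratio
    assert int(grid * downsample) ** 2 == 256   # image_token_length fed to the LM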
MegatronVisionModel vision_model_config VisionTransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=24, first_pipeline_num_layers=0, independent_parallel=True, hidden_size=1024, num_attention_heads=16, num_query_groups=16, ffn_hidden_size=4096, kv_channels=64, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=True, add_qkv_bias=True, gated_linear_unit=False, activation_func=<...>, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='LayerNorm', qk_layernorm=False, test_mode=False, init_method=<function init_method_normal.<locals>.init_ at 0x7fb23001b3a0>, output_layer_init_method=<function scaled_init_method_normal.<locals>.init_ at 0x7fb23001be50>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=False, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=1025, rotary_percent=1.0, img_w=448, img_h=448, patch_dim=14)
MegatronVisionModel vision_model_layer_spec ModuleSpec(module=<...>, params={}, submodules=TransformerLayerSubmodules(input_layernorm=<...>, self_attention=ModuleSpec(module=<...>, params={'attn_mask_type': <...>}, submodules=SelfAttentionSubmodules(linear_qkv=<...>, core_attention=<...>, linear_proj=<...>, q_layernorm=None, k_layernorm=None)), post_attn_norm=<...>, self_attn_bda=<...>, pre_cross_attn_layernorm=<...>, cross_attention=<...>, cross_attn_bda=<...>, pre_mlp_layernorm=<...>, mlp=ModuleSpec(module=<...>, params={}, submodules=MLPSubmodules(linear_fc1=<...>, linear_fc2=<...>)), post_mlp_layernorm=<...>, mlp_bda=<...>, sharded_state_dict_keys_map={}))
WARNING: could not find the metadata file /latest_checkpointed_iteration.txt
    will not load any checkpoints and will start from random
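This warning is expected rather than an error: vit_load is '/', so no checkpoint tracker exists there and the vision tower starts from random init (its weights then arrive with the stage-2 checkpoint in CKPT_LOAD_DIR, which is loaded separately). Megatron-style loaders gate this on a tracker file; a minimal sketch of that check, assuming Megatron's checkpoint layout (not the actual ModelLink code):

    import os

    def latest_checkpoint_tag(load_dir: str):
        # Return the checkpointed iteration tag, or None to start from random.
        tracker = os.path.join(load_dir, "latest_checkpointed_iteration.txt")
        if not os.path.isfile(tracker):
            return None                # -> produces the WARNING printed above
        with open(tracker) as f:
            return f.read().strip()    # usually an iteration number or 'release'

    print(latest_checkpoint_tag("/"))  # None for vit_load='/'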
vision_projector_config TransformerConfig(tensor_model_parallel_size=8, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=2, expert_model_parallel_size=1, perform_initialization=True, use_cpu_initialization=None, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, pipeline_dtype=torch.bfloat16, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=True, defer_embedding_wgrad_compute=False, pipeline_model_parallel_split_rank=None, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=48, first_pipeline_num_layers=0, independent_parallel=False, hidden_size=5120, num_attention_heads=40, num_query_groups=8, ffn_hidden_size=1024, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=True, gated_linear_unit=False, activation_func=<...>, activation_func_fp8_input_store=False, num_moe_experts=None, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=False, test_mode=False, init_method=<function init_method_normal.<locals>.init_ at 0x7fb23001bee0>, output_layer_init_method=<function scaled_init_method_normal.<locals>.init_ at 0x7fb23001bf70>, init_method_std=0.01, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, bias_activation_fusion=False, masked_softmax_fusion=False, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=True, recompute_granularity='full', recompute_method='block', recompute_num_layers=48, distribute_saved_activations=False, fp8=None, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, moe_router_load_balancing_type='aux_loss', moe_router_topk=2, moe_grouped_gemm=False, moe_aux_loss_coeff=0.0, moe_z_loss_coeff=0.0, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='allgather', moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, enable_cuda_graph=False, max_position_embeddings=131072, rotary_percent=1.0)
vision_projector_layer_spec MLPSubmodules(linear_fc1=<...>, linear_fc2=<...>)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
[... => set param lines for external_feature_model.vit.decoder.layers.1 through layers.12 repeat the layer-0 pattern above with identical shapes per layer: ls1/ls2 [1024]; input_layernorm and pre_mlp_layernorm weight/bias [1024]; self_attention.linear_qkv weight [384, 1024], bias [384]; self_attention.linear_proj weight [1024, 128], bias [1024]; mlp.linear_fc1 weight [512, 1024], bias [512]; mlp.linear_fc2 weight [1024, 512], bias [1024]; interleaved duplicates of the same prints from other ranks omitted ...]
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. 
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
[The following 14 freeze messages repeat identically for every ViT decoder layer N = 0..22 visible in this excerpt (layer 22 is cut off mid-stream); verbatim duplicates from concurrently logging ranks have been collapsed into one ordered listing.]
=> set param external_feature_model.vit.decoder.layers.N.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.N.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. vision_model_freeze => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.{0-23}.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.{0-23}.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
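The messages above are the per-parameter log of switching off gradients for the InternViT tower. A minimal sketch of the pattern, assuming a plain PyTorch module and a hypothetical freeze_vision_tower helper (the actual ModelLink hook and call site may differ):

    import torch

    def freeze_vision_tower(model: torch.nn.Module,
                            prefix: str = "external_feature_model.vit.") -> None:
        # Disable gradients for every parameter under the vision tower and
        # echo the same "=> set param ..." message this training log shows.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")

A quick check afterwards, e.g. sum(p.numel() for p in model.parameters() if not p.requires_grad), confirms that only the vision tower is excluded from optimization.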
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
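The shard shapes in the freeze log are consistent with the tensor-parallel degree in the run name (tp8pp1cp2). A small arithmetic check, assuming TP=8 and the usual Megatron convention that ColumnParallelLinear splits the output dimension while RowParallelLinear splits the input dimension (hidden size 1024 comes from the Conv2d/LayerNorm dump above; the 4096 MLP width is inferred from the 512-wide fc1 shard):

    TP = 8
    hidden = 1024   # InternViT hidden size, per the module dump
    ffn = 4096      # inferred: 512 * TP

    assert 3 * hidden // TP == 384   # linear_qkv weight shard [384, 1024], bias [384]
    assert hidden // TP == 128       # linear_proj weight shard [1024, 128]
    assert ffn // TP == 512          # linear_fc1 shard [512, 1024]; fc2 mirrors it as [1024, 512]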
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. 
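The block above records the vision-tower freeze: every parameter of the 24 InternViT layers, plus the patch-embedding conv, class token, and position embeddings, gets requires_grad set to False (each rank prints its own copy, which is why the raw stream interleaves the same lines many times). A minimal sketch of what such a freeze routine typically looks like; the helper name freeze_vit and the prefix match are assumptions, not code from this repository:

```python
import torch

def freeze_vit(model: torch.nn.Module) -> None:
    """Freeze the vision tower (hypothetical helper).

    Mirrors the "=> set param ... requires grad to False." lines
    in the log above.
    """
    for name, param in model.named_parameters():
        if name.startswith("external_feature_model.vit."):
            param.requires_grad = False
            # param.shape prints as torch.Size([...]), as in the log
            print(f"=> set param {name} {param.shape} requires grad to False.")
```

Every rank executes this loop on its own model replica, so in a multi-process launch the same message appears once per process.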
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. 
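For reference, the freeze pass that emits the messages above has roughly this shape in PyTorch. This is a minimal sketch, assuming the model is a torch.nn.Module and the vision tower lives under the external_feature_model.vit prefix; the helper name freeze_vision_model is hypothetical, not the actual ModelLink code.

import torch.nn as nn

def freeze_vision_model(model: nn.Module, prefix: str = "external_feature_model.vit") -> None:
    # Hypothetical reconstruction: freeze every parameter under the vision
    # tower and log it in the same format as the training log above.
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            print(f"=> set param {name} {param.shape} requires grad to False.")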
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
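The per-rank shapes in the freeze messages line up with the tensor-parallel degree 8 implied by the tp8 in the script name: ColumnParallelLinear shards its output dimension across ranks and RowParallelLinear shards its input dimension. A quick arithmetic check, assuming hidden size 1024 and a 4x MLP expansion for the InternViT tower (values read off the printout, not from a config):

# Per-rank parameter shapes under tensor parallelism, assuming tp = 8 (from
# the script name) and hidden_size = 1024 (from the module printout above).
hidden_size, tp = 1024, 8

qkv_per_rank = (3 * hidden_size // tp, hidden_size)  # (384, 1024): fused QKV weight, output dim sharded
proj_per_rank = (hidden_size, hidden_size // tp)     # (1024, 128): attention output proj, input dim sharded
fc1_per_rank = (4 * hidden_size // tp, hidden_size)  # (512, 1024): MLP fc1, output dim sharded
fc2_per_rank = (hidden_size, 4 * hidden_size // tp)  # (1024, 512): MLP fc2, input dim sharded

# Embedding(1025, 1024): one class token plus (448 // 14) ** 2 = 1024 patch
# positions, assuming a 448x448 base resolution (an assumption, not logged).
num_positions = 1 + (448 // 14) ** 2

print(qkv_per_rank, proj_per_rank, fc1_per_rank, fc2_per_rank, num_positions)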
model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
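The parameter count above is per model-parallel rank. A sketch of how such a number is typically obtained, and what it implies at tp8; the extrapolation is approximate, since tensors that are replicated rather than sharded (layernorms, for instance) are counted once on every rank:

```python
import torch

def count_params(model: torch.nn.Module) -> int:
    # Parameters owned by this (tensor, pipeline) rank only.
    return sum(p.numel() for p in model.parameters())

per_rank = 1_887_497_216                           # logged for rank (6, 0)
print(f"~{8 * per_rank / 1e9:.1f}B total at tp8")  # ~15.1B: a 14B LLM plus
# the ~300M ViT and its projector, modulo replicated tensors.
```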
model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.vision_model_freeze => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
[... identical "requires grad to False" messages for decoder.layers.1 through decoder.layers.23 ...]
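The "=> set param ... requires grad to False." lines are emitted once per rank, which is why the raw stream interleaves several copies of the same message. A minimal sketch of the freeze step, assuming it is a plain requires_grad loop over the ViT prefix (the function name is hypothetical, not the project's actual helper):

    import torch

    def freeze_vision_tower(model: torch.nn.Module,
                            prefix: str = "external_feature_model.vit") -> None:
        # Disable gradients for every parameter under the ViT submodule and
        # echo the same "=> set param ..." message seen in the log above.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")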
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
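The weight shapes in the freeze log are per-tensor-parallel-rank shards (the script name indicates TP=8): ColumnParallelLinear splits its output dimension across ranks and RowParallelLinear splits its input dimension. A quick arithmetic check with the hidden/FFN sizes implied by the dump (the helper name is illustrative):

    def vit_shard_shapes(hidden_size: int = 1024, ffn_hidden: int = 4096,
                         tp: int = 8) -> dict:
        # ColumnParallelLinear shards the output dim; RowParallelLinear
        # shards the input dim. Reproduces the per-rank shapes in the log.
        return {
            "self_attention.linear_qkv.weight": (3 * hidden_size // tp, hidden_size),  # (384, 1024)
            "self_attention.linear_proj.weight": (hidden_size, hidden_size // tp),     # (1024, 128)
            "mlp.linear_fc1.weight": (ffn_hidden // tp, hidden_size),                  # (512, 1024)
            "mlp.linear_fc2.weight": (hidden_size, ffn_hidden // tp),                  # (1024, 512)
        }

    assert vit_shard_shapes()["self_attention.linear_qkv.weight"] == (384, 1024)

With tp=1 the same arithmetic gives the unsharded (3072, 1024) and (1024, 1024) shapes, which is why single-device logs of the same model show different sizes.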
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216
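Note this count is per (tensor, pipeline) shard, not the whole model: with TP=8, the eight ~1.89B shards together cover Qwen2.5-14B plus the ~300M InternViT and projector, with non-sharded tensors (layernorms, layer scales) replicated on every rank. A sketch of how such a number is typically computed (the actual Megatron helper is not shown in this log):

    import torch

    def count_params(model: torch.nn.Module) -> int:
        # Parameters held by this rank's shard; frozen ViT weights are
        # included in the total even though they receive no gradients.
        return sum(p.numel() for p in model.parameters())

    # Would print a line in the same style as the log above:
    # print(f"> number of parameters on (tensor, pipeline) model parallel "
    #       f"rank (0, 0): {count_params(model)}")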
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. 
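The freeze messages above amount to one loop over the named parameters of the vision tower, turning off autograd for each. A minimal sketch of such a pass, assuming a hypothetical helper name (freeze_vision_tower); the print format is taken from the log, but the training script's actual implementation may differ:

    import torch

    def freeze_vision_tower(model: torch.nn.Module,
                            prefix: str = "external_feature_model.vit") -> None:
        # Hypothetical helper: disable gradients for every parameter of the
        # ViT sub-module so only the projector and language model train.
        for name, param in model.named_parameters():
            if name.startswith(prefix):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")

Frozen parameters still take part in the forward pass; they simply receive no gradient and no optimizer state.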
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
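The ViT head of this printout pins down the patch geometry: conv1 patchifies with a 14x14 kernel at stride 14, and the position-embedding table has 1025 rows, consistent with 1024 patch tokens plus one class token, i.e. a 448x448 input (448 / 14 = 32, and 32 * 32 = 1024). A short check of that arithmetic; the 448x448 input size is inferred from the embedding table, not stated in the log:

    import torch

    # Patchify exactly as printed: Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)).
    conv1 = torch.nn.Conv2d(3, 1024, kernel_size=14, stride=14)

    # 448x448 is an inference from Embedding(1025, 1024):
    # (448 // 14) ** 2 = 1024 patch tokens, plus one class token = 1025 positions.
    x = torch.randn(1, 3, 448, 448)
    patches = conv1(x)                           # (1, 1024, 32, 32)
    tokens = patches.flatten(2).transpose(1, 2)  # (1, 1024, 1024): batch, seq, hidden
    assert tokens.shape[1] + 1 == 1025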
(the same vit parameter-freeze messages repeat, interleaved, from the remaining model-parallel ranks)
> number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216
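The per-rank parameter counts (about 1.89e9 on ranks (3, 0) and (4, 0)) are shard sizes, not the full model: the sharded shapes in the freeze messages, e.g. linear_qkv.weight of [384, 1024] where the unsharded projection would be [3072, 1024], indicate an 8-way tensor-parallel split. A sketch of the kind of count being reported, using plain nn.Module accounting and leaving out Megatron's rank bookkeeping:

    import torch

    def local_param_count(model: torch.nn.Module) -> int:
        # Parameter elements materialized on this rank's shard. Column- and
        # row-parallel weights contribute 1/8 of their full size under an
        # 8-way tensor-parallel split, while layernorms and other small
        # tensors are replicated on every rank.
        return sum(p.numel() for p in model.parameters())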
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. 
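[editor's note: the freeze pass logged above amounts to a loop over the vision tower's parameters that flips requires_grad and prints each one. A minimal sketch of the idea; the helper name vision_model_freeze and the name prefix are taken from this log, but the actual ModelLink/Megatron implementation may differ:

    import torch

    def vision_model_freeze(model: torch.nn.Module) -> None:
        # Freeze the ViT so only the language model and projector train.
        for name, param in model.named_parameters():
            if name.startswith("external_feature_model.vit."):
                param.requires_grad = False
                print(f"=> set param {name} {param.shape} requires grad to False.")
]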
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (1, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (3, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (5, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216
vision_model_freeze
[a second, identical per-parameter freeze listing (class_token, conv1, position_embeddings, decoder layers 0-23) and additional copies of the model repr are emitted by the other tensor-parallel ranks; these verbatim repeats are elided, and the rank parameter counts scattered through them are consolidated above]
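[editor's note: the shapes in the freeze listing are per-rank shards, not full tensors. The run uses tensor parallel size 8 (the tp8 in the script name); Megatron's ColumnParallelLinear splits the output dimension and RowParallelLinear splits the input dimension. A sketch of the arithmetic, assuming InternViT hidden size 1024 and MLP width 4096, which are consistent with the shard shapes logged above:

    TP = 8            # tensor-parallel world size
    hidden = 1024     # InternViT hidden size

    # ColumnParallelLinear shards the output dim:
    qkv_out = 3 * hidden // TP   # 384 -> linear_qkv.weight [384, 1024]
    fc1_out = 4096 // TP         # 512 -> mlp.linear_fc1.weight [512, 1024]

    # RowParallelLinear shards the input dim:
    proj_in = hidden // TP       # 128 -> linear_proj.weight [1024, 128]
    fc2_in = 4096 // TP          # 512 -> mlp.linear_fc2.weight [1024, 512]
]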
model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. 
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
> number of parameters on (tensor, pipeline) model parallel rank (6, 0): 1887497216
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
> number of parameters on (tensor, pipeline) model parallel rank (7, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (2, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1887497216
> number of parameters on (tensor, pipeline) model parallel rank (4, 0): 1887497216
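The "=> set param ... requires grad to False." lines above are the vision_model_freeze step: before stage-3 finetuning, every InternViT parameter (class token, patch conv, position embeddings, and all 24 transformer layers) is taken out of the trainable set, while the Qwen2.5 decoder and the projector stay trainable. A minimal sketch of such a freeze helper (an illustration of the pattern, not the actual ModelLink source) is:

```python
import torch.nn as nn

def freeze_vision_model(model: nn.Module,
                        prefix: str = "external_feature_model.vit") -> None:
    """Disable gradients for every parameter under `prefix`, logging each one
    in the same format as the lines in this log."""
    print("vision_model_freeze")
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            print(f"=> set param {name} {param.shape} requires grad to False.")
```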
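Each "> number of parameters on (tensor, pipeline) model parallel rank (r, p): N" line is printed once per model-parallel rank. With TP=8 and PP=1 the weights are sharded roughly evenly across the eight tensor-parallel ranks, so every rank reports the same 1,887,497,216 local elements (8 x 1.89B, about 15.1B in total: the 14B decoder plus the ~300M ViT and the projector). A sketch of the counting, assuming the standard megatron.core.parallel_state rank accessors (the exact print site in ModelLink is an assumption):

```python
from megatron.core import parallel_state

def report_param_count(model) -> None:
    # Count only this rank's shard of the weights; tensor- and
    # vocab-parallel layers contribute 1/TP of their full size.
    n = sum(p.numel() for p in model.parameters())
    tp = parallel_state.get_tensor_model_parallel_rank()
    pp = parallel_state.get_pipeline_model_parallel_rank()
    print(f"> number of parameters on (tensor, pipeline) "
          f"model parallel rank ({tp}, {pp}): {n}")
```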
INFO:megatron.core.distributed.param_and_grad_buffer:Number of buckets for gradient all-reduce / reduce-scatter: 39
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 1 (97320960 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.output_layer.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 2 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.final_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.47.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 3 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.46.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 4 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.45.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 5 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.44.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.43.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 6 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.42.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 7 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.41.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 8 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.40.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.39.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 9 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.38.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 10 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.37.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 11 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.36.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.35.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 12 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.34.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 13 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.33.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 14 (43270016 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.32.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.31.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 15 (52112256 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.30.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 16 (42287872 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.pre_mlp_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.29.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 17 (43270016 elements): INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.28.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.27.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 18 (52112256 elements): INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_proj.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.input_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.26.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc2.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.mlp.linear_fc1.weight INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 19 (42287872 elements): INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.pre_mlp_layernorm.weight INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.25.self_attention.linear_qkv.bias INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.24.mlp.linear_fc2.weight _get_param_groups name 
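The bucket listing above is emitted while megatron.core.distributed.param_and_grad_buffer builds the data-parallel gradient buffers: parameters are packed, in roughly reverse registration order (the order their gradients become ready during the backward pass), into fixed-size buckets so each bucket's gradient reduction can overlap with the rest of the backward pass. Below is a minimal sketch of that packing, not the Megatron implementation; the 40M-element threshold and the intra-layer parameter order are assumptions, so the exact bucket boundaries only approximate the listing, but the per-rank shard sizes are consistent with the bucket totals printed above (Qwen2.5-14B, hidden 5120, FFN 13824, 40 Q + 8 KV heads, TP=8).

```python
from typing import List, Tuple

BUCKET_SIZE = 40_000_000  # element threshold; assumed, not read from this log

def build_buckets(params: List[Tuple[str, int]]) -> List[List[str]]:
    """Pack params into buckets, walking in reverse registration order."""
    buckets, current, elements = [], [], 0
    for name, numel in reversed(params):
        current.append(name)
        elements += numel
        if elements >= BUCKET_SIZE:  # cut a bucket once the threshold is crossed
            buckets.append(current)
            current, elements = [], 0
    if current:
        buckets.append(current)
    return buckets

# Per-rank shard sizes consistent with the totals above (Qwen2.5-14B under TP=8).
LAYER = [
    ("input_layernorm.weight", 5_120),
    ("self_attention.linear_qkv.weight", 4_587_520),   # (40+8+8)*128 x 5120 / 8
    ("self_attention.linear_qkv.bias", 896),
    ("self_attention.linear_proj.weight", 3_276_800),  # 5120 x 5120 / 8
    ("pre_mlp_layernorm.weight", 5_120),
    ("mlp.linear_fc1.weight", 17_694_720),             # gate+up: 2*13824 x 5120 / 8
    ("mlp.linear_fc2.weight", 8_847_360),              # 13824 x 5120 / 8
]
params = [(f"module.decoder.layers.{i}.{n}", k) for i in range(48) for n, k in LAYER]
for i, bucket in enumerate(build_buckets(params)):
    print(f"bucket {i}: {len(bucket)} params")
```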
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 20 (43270016 elements): module.decoder.layers.24.input_layernorm.weight, module.decoder.layers.23.pre_mlp_layernorm.weight, module.decoder.layers.23.self_attention.linear_qkv.bias, module.decoder.layers.22.mlp.linear_fc2.weight, module.decoder.layers.23.mlp.linear_fc2.weight, module.decoder.layers.23.mlp.linear_fc1.weight, module.decoder.layers.23.self_attention.linear_qkv.weight, module.decoder.layers.23.self_attention.linear_proj.weight, module.decoder.layers.23.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 21 (52112256 elements): module.decoder.layers.22.mlp.linear_fc1.weight, module.decoder.layers.22.self_attention.linear_qkv.weight, module.decoder.layers.22.self_attention.linear_proj.weight, module.decoder.layers.22.input_layernorm.weight, module.decoder.layers.22.self_attention.linear_qkv.bias, module.decoder.layers.22.pre_mlp_layernorm.weight, module.decoder.layers.21.mlp.linear_fc2.weight, module.decoder.layers.21.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 22 (42287872 elements): module.decoder.layers.21.pre_mlp_layernorm.weight, module.decoder.layers.21.self_attention.linear_qkv.bias, module.decoder.layers.20.mlp.linear_fc2.weight, module.decoder.layers.20.mlp.linear_fc1.weight, module.decoder.layers.20.self_attention.linear_qkv.weight, module.decoder.layers.20.self_attention.linear_proj.weight, module.decoder.layers.21.input_layernorm.weight, module.decoder.layers.21.self_attention.linear_proj.weight, module.decoder.layers.21.self_attention.linear_qkv.weight, module.decoder.layers.20.pre_mlp_layernorm.weight, module.decoder.layers.20.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 23 (43270016 elements): module.decoder.layers.20.input_layernorm.weight, module.decoder.layers.19.pre_mlp_layernorm.weight, module.decoder.layers.19.self_attention.linear_qkv.bias, module.decoder.layers.18.mlp.linear_fc2.weight, module.decoder.layers.19.mlp.linear_fc1.weight, module.decoder.layers.19.mlp.linear_fc2.weight, module.decoder.layers.19.self_attention.linear_qkv.weight, module.decoder.layers.19.self_attention.linear_proj.weight, module.decoder.layers.19.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 24 (52112256 elements): module.decoder.layers.18.mlp.linear_fc1.weight, module.decoder.layers.18.self_attention.linear_qkv.weight, module.decoder.layers.18.self_attention.linear_proj.weight, module.decoder.layers.18.input_layernorm.weight, module.decoder.layers.18.self_attention.linear_qkv.bias, module.decoder.layers.18.pre_mlp_layernorm.weight, module.decoder.layers.17.mlp.linear_fc2.weight, module.decoder.layers.17.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 25 (42287872 elements): module.decoder.layers.17.self_attention.linear_qkv.bias, module.decoder.layers.16.mlp.linear_fc2.weight, module.decoder.layers.16.mlp.linear_fc1.weight, module.decoder.layers.16.self_attention.linear_qkv.weight, module.decoder.layers.16.self_attention.linear_proj.weight, module.decoder.layers.17.pre_mlp_layernorm.weight, module.decoder.layers.17.self_attention.linear_qkv.weight, module.decoder.layers.17.input_layernorm.weight, module.decoder.layers.17.self_attention.linear_proj.weight, module.decoder.layers.16.pre_mlp_layernorm.weight, module.decoder.layers.16.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 26 (43270016 elements): module.decoder.layers.16.input_layernorm.weight, module.decoder.layers.15.pre_mlp_layernorm.weight, module.decoder.layers.15.self_attention.linear_qkv.bias, module.decoder.layers.14.mlp.linear_fc2.weight, module.decoder.layers.15.mlp.linear_fc1.weight, module.decoder.layers.15.mlp.linear_fc2.weight, module.decoder.layers.15.self_attention.linear_qkv.weight, module.decoder.layers.15.self_attention.linear_proj.weight, module.decoder.layers.15.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 27 (52112256 elements): module.decoder.layers.14.mlp.linear_fc1.weight, module.decoder.layers.14.self_attention.linear_proj.weight, module.decoder.layers.14.self_attention.linear_qkv.weight, module.decoder.layers.14.input_layernorm.weight, module.decoder.layers.14.self_attention.linear_qkv.bias, module.decoder.layers.14.pre_mlp_layernorm.weight, module.decoder.layers.13.mlp.linear_fc2.weight, module.decoder.layers.13.mlp.linear_fc1.weight
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..12}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
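The _get_param_groups lines interleaved with the bucket listing come from optimizer setup: each parameter is mapped to a key that decides which optimizer param group it joins. By the pattern above, layernorm weights and biases get a 0.0 first field (no weight decay) while matmul weights and the word embeddings get 1.0. A toy sketch of that grouping follows; the tuple layout (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) matches Megatron-core's optimizer setup but the field order is inferred, not confirmed by this log, and the helper name is illustrative only.

```python
from collections import defaultdict

def build_param_groups(param_names):
    """Toy grouping: no weight decay for biases and norm weights."""
    groups = defaultdict(list)
    for name in param_names:
        no_wd = name.endswith(".bias") or "layernorm" in name
        key = (0.0 if no_wd else 1.0, 1.0, False, False)
        print("_get_param_groups name", name, "key", key)
        groups[key].append(name)
    return groups

groups = build_param_groups([
    "module.module.embedding.word_embeddings.weight",
    "module.module.decoder.layers.0.input_layernorm.weight",
    "module.module.decoder.layers.0.self_attention.linear_qkv.weight",
    "module.module.decoder.layers.0.self_attention.linear_qkv.bias",
    "module.module.decoder.layers.0.mlp.linear_fc1.weight",
])
# -> two optimizer groups: weight-decayed matmul/embedding weights, and a
#    no-decay group holding the layernorm weights and the qkv bias.
```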
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 28 (42287872 elements): module.decoder.layers.13.pre_mlp_layernorm.weight, module.decoder.layers.13.self_attention.linear_qkv.bias, module.decoder.layers.12.mlp.linear_fc2.weight, module.decoder.layers.12.mlp.linear_fc1.weight, module.decoder.layers.12.self_attention.linear_qkv.weight, module.decoder.layers.12.self_attention.linear_proj.weight, module.decoder.layers.13.self_attention.linear_qkv.weight, module.decoder.layers.13.self_attention.linear_proj.weight, module.decoder.layers.13.input_layernorm.weight, module.decoder.layers.12.pre_mlp_layernorm.weight, module.decoder.layers.12.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 29 (43270016 elements): module.decoder.layers.12.input_layernorm.weight, module.decoder.layers.11.pre_mlp_layernorm.weight, module.decoder.layers.11.self_attention.linear_qkv.bias, module.decoder.layers.10.mlp.linear_fc2.weight, module.decoder.layers.11.mlp.linear_fc1.weight, module.decoder.layers.11.mlp.linear_fc2.weight, module.decoder.layers.11.self_attention.linear_qkv.weight, module.decoder.layers.11.self_attention.linear_proj.weight, module.decoder.layers.11.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 30 (52112256 elements): module.decoder.layers.10.mlp.linear_fc1.weight, module.decoder.layers.10.self_attention.linear_proj.weight, module.decoder.layers.10.self_attention.linear_qkv.weight, module.decoder.layers.10.input_layernorm.weight, module.decoder.layers.10.self_attention.linear_qkv.bias, module.decoder.layers.10.pre_mlp_layernorm.weight, module.decoder.layers.9.mlp.linear_fc2.weight, module.decoder.layers.9.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 31 (42287872 elements): module.decoder.layers.9.pre_mlp_layernorm.weight, module.decoder.layers.9.self_attention.linear_qkv.bias, module.decoder.layers.8.mlp.linear_fc2.weight, module.decoder.layers.8.mlp.linear_fc1.weight, module.decoder.layers.8.self_attention.linear_qkv.weight, module.decoder.layers.8.self_attention.linear_proj.weight, module.decoder.layers.9.self_attention.linear_qkv.weight, module.decoder.layers.9.self_attention.linear_proj.weight, module.decoder.layers.9.input_layernorm.weight, module.decoder.layers.8.pre_mlp_layernorm.weight, module.decoder.layers.8.self_attention.linear_qkv.bias
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 32 (43270016 elements): module.decoder.layers.8.input_layernorm.weight, module.decoder.layers.7.self_attention.linear_proj.weight
_get_param_groups name module.module.decoder.layers.{13..20}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{13..20}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.input_layernorm.weight _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.mlp.linear_fc1.weight_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.7.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 33 (52112256 elements): _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.mlp.linear_fc1.weight_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight 
key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.6.input_layernorm.weight _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 34 (42287872 elements):_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) 
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.input_layernorm.weight_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.self_attention.linear_proj.weight _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.5.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key 
(0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 35 (43270016 elements): _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.mlp.linear_fc1.weight_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.self_attention.linear_proj.weight _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.input_layernorm.weight _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.4.input_layernorm.weight _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.pre_mlp_layernorm.weight_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.3.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.mlp.linear_fc2.weight _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 36 (52112256 elements):_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.mlp.linear_fc2.weight_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.mlp.linear_fc1.weight _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.self_attention.linear_proj.weight_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.self_attention.linear_qkv.weight_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.2.input_layernorm.weight _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 37 (42287872 elements): _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_qkv.weight _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.input_layernorm.weight_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias 
key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_proj.weight _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.pre_mlp_layernorm.weight _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_qkv.bias_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.pre_mlp_layernorm.weight_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.1.self_attention.linear_qkv.bias _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) 
[_get_param_groups records, deduplicated: the raw stream interleaves identical copies of each record from every rank; decoder layers 0-41 share the same key per parameter type]
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..41}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
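Note: the _get_param_groups records above are Megatron-style debug prints emitted while the optimizer partitions parameters into groups. The 4-tuple printed after "key" appears to be (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr): layernorm weights and all biases get wd_mult 0.0 (no weight decay), while matmul weights get 1.0. Below is a minimal sketch of that grouping step, assuming a plain torch.nn.Module; it reproduces the logged behavior but is not the exact ModelLink/Megatron source.

from collections import defaultdict

import torch


def get_param_groups(model: torch.nn.Module,
                     no_weight_decay_cond=None,
                     lr_mult: float = 1.0):
    """Group parameters by a (wd_mult, lr_mult, ...) key tuple (sketch)."""
    params_map = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # No weight decay for biases and 1-D (norm) parameters; this matches
        # the (0.0, ...) keys logged for *_layernorm.* and *.bias above.
        if no_weight_decay_cond is not None:
            no_wd = no_weight_decay_cond(name, param)
        else:
            no_wd = name.endswith(".bias") or param.ndim == 1
        wd_mult = 0.0 if no_wd else 1.0
        # The last two flags mirror the logged tuple (both False here);
        # they look like is_expert_parallel / is_decoupled_lr.
        key = (wd_mult, lr_mult, False, False)
        print(f"_get_param_groups name {name} key {key}")
        params_map[key].append(param)
    return [
        {"params": ps, "wd_mult": k[0], "lr_mult": k[1]}
        for k, ps in params_map.items()
    ]

Grouping on the full key tuple rather than on wd_mult alone means parameters that later need distinct learning-rate scaling or expert-parallel handling already sit in separate optimizer groups.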
[param_and_grad_buffer records, de-interleaved; the four layer-0 weights at the top belong to a bucket whose header scrolled by earlier in the log]
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc2.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.mlp.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_qkv.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.self_attention.linear_proj.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 38 (97326080 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.decoder.layers.0.input_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.embedding.word_embeddings.weight
INFO:megatron.core.distributed.param_and_grad_buffer:Params for bucket 39 (1187840 elements):
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.bias
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.pre_proj_layernorm.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc1.weight
INFO:megatron.core.distributed.param_and_grad_buffer: module.external_feature_model.vision_projection.encoder.linear_fc2.weight
module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
[_get_param_groups output, consolidated: each rank prints one "name <param> key <tuple>" line per parameter, and the per-rank streams were interleaved in the raw log (often with the separating space lost, e.g. "False)_get_param_groups"). Deduplicated, this stretch of the log covers decoder layers 18-47 plus the final layernorm and the output layer; earlier layers appear in the preceding part of the log. Only two key tuples occur:]

_get_param_groups name module.module.decoder.layers.<N>.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.<N>.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
    [repeated for N = 18 .. 47]
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

[Pattern: every layernorm weight and every linear_qkv bias carries key (0.0, 1.0, False, False); every linear weight, including the output layer, carries key (1.0, 1.0, False, False).]
module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, 
False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
_get_param_groups: per-parameter key = (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr). The raw dump interleaves the same entries from several ranks and drops spaces before some "_get_param_groups" markers; deduplicated, the assignments in this excerpt are:
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{N}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
(N ranges over decoder layers 0 through 45 in this excerpt.) In short: embedding and linear weights get wd_mult 1.0, while layernorm weights, layernorm biases, and QKV biases get wd_mult 0.0; lr_mult is 1.0 everywhere, and no parameter is expert-parallel or uses a decoupled learning rate.
module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight 
key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name 
module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups 
_get_param_groups logs one line per parameter per rank, keyed by a (wd_mult, lr_mult, is_expert_parallel, is_decoupled) tuple. The interleaved per-rank output is deduplicated below; the per-layer pattern repeats identically for decoder layers 0-47 (N = 0..47):

_get_param_groups name module.module.decoder.layers.N.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.N.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

Parameters outside the decoder layers:

_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

In short: weight matrices get wd_mult 1.0, while layernorm weights and all biases get wd_mult 0.0; lr_mult is 1.0 for every parameter, and nothing is expert-parallel or on a decoupled learning rate.
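The key tuples above follow the usual Megatron-style grouping rule: weight decay is disabled for biases and 1-D tensors (layernorms) and kept for weight matrices. The following is a minimal, self-contained sketch of that rule, not the project's actual _get_param_groups; the function name and toy model are illustrative only.

```python
# Hedged sketch: reproduce the (wd_mult, lr_mult, is_expert_parallel,
# is_decoupled) keys seen in the log above. Illustrative, not lcvlm_modellink's
# real implementation.
from collections import defaultdict

import torch.nn as nn


def get_param_groups(model, lr_mult=1.0):
    """Bucket parameters by (wd_mult, lr_mult, is_expert_parallel, is_decoupled)."""
    groups = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Biases and 1-D tensors (e.g. *_layernorm.weight) get no weight decay,
        # matching the (0.0, 1.0, False, False) entries in the log.
        wd_mult = 0.0 if name.endswith("bias") or param.ndim == 1 else 1.0
        key = (wd_mult, lr_mult, False, False)  # expert-parallel / decoupled LR unused here
        print(f"_get_param_groups name {name} key {key}")
        groups[key].append(param)
    return [
        {"params": params, "wd_mult": k[0], "lr_mult": k[1]}
        for k, params in groups.items()
    ]


if __name__ == "__main__":
    toy = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
    for group in get_param_groups(toy):
        print(len(group["params"]), group["wd_mult"], group["lr_mult"])
```

Run against the toy model, this yields two groups: one with wd_mult 1.0 (the Linear weight) and one with wd_mult 0.0 (the Linear bias plus both LayerNorm parameters), mirroring the split logged for the full model.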
The same per-parameter listing is then re-emitted through the project logger (INFO:lcvlm_modellink.core.optimizer:_get_param_groups name ... key ...) as the remaining ranks reach optimizer setup; the entries and key tuples are identical to those deduplicated above. Interleaved with that output, the optimizer itself is configured:

INFO:megatron.core.optimizer:Setting up optimizer with OptimizerConfig(optimizer='adam', lr=5e-06, min_lr=1e-07, decoupled_lr=None, decoupled_min_lr=None, weight_decay=0.0, fp16=False, bf16=True, params_dtype=torch.bfloat16, loss_scale=None, initial_loss_scale=4096.0, min_loss_scale=1.0, loss_scale_window=1000, hysteresis=2, adam_beta1=0.9, adam_beta2=0.999, adam_eps=1e-08, sgd_momentum=0.9, use_distributed_optimizer=True, overlap_grad_reduce=True, overlap_param_gather=False, clip_grad=1.0, log_num_zeros_in_grad=False, barrier_with_L1_time=True, timers=)
module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, 
False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, 
False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) 
INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 
1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups 
name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, 
False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, 
False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, 
False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name 
INFO:lcvlm_modellink.core.optimizer:_get_param_groups parameter keys ({0..44} denotes one entry per decoder layer):
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.{0..44}.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) 
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, 
False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) 
INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) INFO:lcvlm_modellink.core.optimizer:_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight 
key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name 
module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name 
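Each _get_param_groups line reports the bucketing key (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) for one parameter: biases and 1-D layernorm weights land in the no-weight-decay bucket (wd_mult 0.0), while the 2-D attention and MLP weights keep wd_mult 1.0. A minimal sketch of that bucketing, assuming Megatron-LM's usual conditions (the helper below is illustrative, not the exact lcvlm_modellink.core.optimizer code):

```python
# Sketch: bucket parameters by (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr),
# mirroring the _get_param_groups lines in the log above. Assumed for this run:
# no scale_lr_cond and no decoupled learning rate are configured.
from collections import defaultdict

def get_param_groups(model):
    buckets = defaultdict(list)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Biases and 1-D tensors (layernorm weights) are excluded from weight decay.
        no_wd = name.endswith(".bias") or param.ndim == 1
        wd_mult = 0.0 if no_wd else 1.0
        lr_mult = 1.0                                    # no scale_lr_cond in this run
        is_expert_parallel = not getattr(param, "allreduce", True)
        is_decoupled_lr = False                          # decoupled lr not enabled
        key = (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr)
        print("_get_param_groups name", name, "key", key)
        buckets[key].append(param)
    return [
        {"params": params, "wd_mult": wd, "lr_mult": lr}
        for (wd, lr, _, _), params in buckets.items()
    ]
```

Grouping by this key is what lets a single optimizer instance apply a different weight-decay and learning-rate multiplier to each bucket.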
> learning rate decay style: cosine
_load_base_checkpoint iteration 5000
_load_base_checkpoint release False
[... the iteration/release lines repeat once per rank ...]
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt
loading checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ at iteration 5000
load_checkpoint iteration 0
load_checkpoint release False strict True
[... the load_checkpoint lines repeat on each of the 16 ranks ...]
could not find arguments in the checkpoint ...
checkpoint version 3.0
successfully loaded checkpoint from /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ [ t 0, p 0 ] at iteration 0
[after model, optimizer, and learning rate scheduler are built] datetime: 2024-11-27 12:45:29
> building train, validation, and test datasets ...
> datasets target sizes (minimum size):
    train:      64000
    validation: 0
    test:       0
INFO:megatron.core.datasets.blended_megatron_dataset_config:mock = False
INFO:megatron.core.datasets.blended_megatron_dataset_config:Let split_matrix = [(0, 1.0), None, None]
> building train, validation, and test datasets for GPT ...
> rank 2 does not create GPT datasets ...
> rank 3 does not create GPT datasets ...
> rank 4 does not create GPT datasets ...
> rank 5 does not create GPT datasets ...
> rank 6 does not create GPT datasets ...
> rank 7 does not create GPT datasets ...
> rank 8 is creating GPT datasets ...
> rank 9 does not create GPT datasets ...
> rank 10 does not create GPT datasets ...
> rank 11 does not create GPT datasets ...
> rank 12 does not create GPT datasets ...
> rank 13 does not create GPT datasets ...
> rank 14 does not create GPT datasets ...
> rank 15 does not create GPT datasets ...
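Each of the eight tensor-parallel ranks resolves its own shard under iter_0005000/mp_rank_0X/, and the subsequent "load_checkpoint iteration 0" lines are consistent with a finetune-style load that resets the step counter rather than resuming at 5000. A sketch of the path layout, assuming the Megatron naming convention visible in the log (checkpoint_path is a hypothetical helper, not the actual ModelLink API):

```python
# Sketch: shard path resolution behind the _load_base_checkpoint lines above.
# The iter_<NNNNNNN>/mp_rank_<NN>/model_optim_rng.pt layout is taken from the log.
import os

def checkpoint_path(load_dir, iteration, tp_rank, release=False):
    directory = "release" if release else f"iter_{iteration:07d}"
    return os.path.join(load_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt")

load_dir = "/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/"
for tp_rank in range(8):  # TP=8 in this run, hence shards mp_rank_00 .. mp_rank_07
    print("_load_base_checkpoint", checkpoint_path(load_dir, 5000, tp_rank))
```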
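The "Let split_matrix = [(0, 1.0), None, None]" line says the entire blend is assigned to the train split, matching the target sizes (train 64000, validation 0, test 0). A sketch of how a Megatron-style split string could produce that matrix, assuming a "100,0,0" split for this run (an illustrative re-implementation, not the exact megatron.core helper):

```python
# Sketch: convert a "train,valid,test" weight string into interval form.
# Zero-weight splits map to None, as in the split_matrix logged above.
def split_to_matrix(split):
    weights = [float(w) for w in split.split(",")]
    total = sum(weights)
    intervals, acc = [], 0.0
    for w in weights:
        lo = acc / total
        acc += w
        hi = acc / total
        intervals.append((lo, hi) if w > 0 else None)
    return intervals

print(split_to_matrix("100,0,0"))  # [(0.0, 1.0), None, None]
```

Only one rank per data-parallel group materializes the dataset index (rank 8 above); the others reuse the cached result.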
tokenizer Qwen2TokenizerFast(name_or_path='/data_4/models/Qwen/Qwen2.5-14B-Instruct/', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={
    151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151657: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151658: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
    151665: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151666: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151667: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151668: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151669: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151670: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151671: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151672: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151673: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151674: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151675: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151676: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151677: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151678: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151679: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151680: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    151681: AddedToken("