ayh015 committed on
Commit
73df34b
·
1 Parent(s): 6d4aa31

Update modified code

Browse files
data/convsersation.py CHANGED
@@ -37,23 +37,27 @@ class Conversation:
37
  super().__init__()
38
  if system == '':
39
  self.system = f"""
40
- You are an AI assistant. You will be given an image that contains a main human subject.
41
  Task:
42
- Describe the visual evidence in the image that supports the subjects action, with an emphasis on human body parts and their interactions with objects.
43
 
44
  Hints:
45
- You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.
46
 
47
  Required Constraints:
48
  - Start with ONE sentence that summarizes the main action in natural language.
49
  - When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
50
  - Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
51
  - If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
52
- - Write your description in clear, concise sentences grounded in visible evidence.
 
 
53
 
54
  Optional Constraints :
 
 
55
  - Write naturally. Avoid repeating the same sentence pattern.
56
- - Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
57
  """
58
  else:
59
  self.system = system
@@ -284,6 +288,134 @@ class Conversation_For_Clean_Evidence:
284
  """
285
  return prompt
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  @dataclasses.dataclass
288
  class Conversation_For_Action_Pharse:
289
  def __init__(self, system='', data_path=''):
@@ -416,4 +548,4 @@ class Conversation_For_COCO_Long_Description:
416
 
417
 
418
  if __name__ == "__main__":
419
- pass
 
37
  super().__init__()
38
  if system == '':
39
  self.system = f"""
40
+ You are an AI assistant for first-pass long-form HICO annotation. You will be given an image that contains a main human subject.
41
  Task:
42
+ Write a detailed long description of the visual evidence in the image that supports the subject's action, with an emphasis on human body parts, posture, spatial configuration, and interactions with objects.
43
 
44
  Hints:
45
+ You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You should use these hints as anchors, and you may add other relevant visible evidence you observe.
46
 
47
  Required Constraints:
48
  - Start with ONE sentence that summarizes the main action in natural language.
49
  - When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
50
  - Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
51
  - If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
52
+ - The description must be long and detailed enough to serve as a first-pass annotation for later refinement.
53
+ - Include as many relevant supporting details as are visibly justified, especially about contact, pose, orientation, support, and object interaction.
54
+ - Write your description in clear, natural sentences grounded in visible evidence.
55
 
56
  Optional Constraints :
57
+ - Prefer a rich multi-sentence paragraph rather than a short caption.
58
+ - Cover multiple cues when available, such as limb placement, body balance, joint bending, contact points, and relative position to the object.
59
  - Write naturally. Avoid repeating the same sentence pattern.
60
+ - If both sides contribute differently, describe them separately.
61
  """
62
  else:
63
  self.system = system
 
288
  """
289
  return prompt
290
 
291
@dataclasses.dataclass
class Conversation_examiner:
    """Examiner conversation for HICO action annotations.

    Builds a system prompt that makes the model act as a strict checker and
    final editor of candidate action descriptions, and assembles per-sample
    user prompts from the HOI hint, part-state hints, and candidate texts.

    Fix over the original: ``_humanpart2word`` now resets ``part_state`` for
    every label, so a part with no matching part-state key yields ``None``
    instead of raising ``UnboundLocalError`` (first label) or silently
    reusing the previous label's state (later labels).
    """

    def __init__(self, system='', data_path=''):
        super().__init__()
        if system == '':
            self.system = f"""
You are a strict checker and final editor for HICO action annotations.

You will be given:
- The ground-truth HICO action hint as [VERB, OBJECT].
- Part-state hints derived from annotation labels.
- One or more candidate texts, such as a long description, a short description, or an evidence-only description.

Your task:
- Judge whether the candidate texts are consistent with the target action.
- Check whether the descriptions are grounded in plausible visible body-part evidence.
- Check whether any mentioned body parts use valid COCO keypoint names only: {COCO_KEYPOINT_NAME}.
- Detect unsupported claims, contradictions, object/action mismatches, left/right mistakes, and hallucinated joints or interactions.
- Produce a final checked description after resolving any issues you can fix from the provided candidates and hints.

Important checking rules:
- The target action is defined by the provided HICO hint, not by the candidate text.
- If a candidate text conflicts with the target action, fix the final checked description so it aligns with the target action.
- If a candidate text includes body-part terms outside the allowed keypoint list, replace them with valid names when possible and record the issue.
- If evidence is too vague, missing, or unrelated to the target action, remove unsupported content from the final checked description and record the issue.
- Pay special attention to left/right consistency. If the candidate confuses left and right, or assigns evidence to the wrong side, correct it when the correct side is supported by the provided candidates and hints; otherwise remove the uncertain side-specific claim and record the issue.
- Do not keep any joint claim that is not visible, not inferable from the provided evidence, or appears hallucinated. If a joint or body-part interaction cannot be supported, remove it and record the issue.
- Do not invent new visual evidence that is not supported by the provided candidates and hints.
- The final checked description should be concise, natural, and reliable.
- Prefer the strongest grounded evidence among the provided candidates.
- When side-specific evidence is uncertain, prefer a conservative description over a risky one.

Output format:
Return plain text in exactly this structure.

Verdict: PASS or REVISED
Action alignment: one short sentence
Evidence grounding: one short sentence
Keypoint-name validity: one short sentence
Checked description:
<final checked description>
Issues:
- item 1
- item 2

If there are no issues, write:
Issues:
- None
"""
        else:
            self.system = system

        # Lookup tables: hoi_id -> (object, verb), and part-state key -> list of state phrases.
        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Replace colloquial part names (hand/foot) with COCO keypoint names
        (wrist/ankle), preserving plurality and leading capitalization."""
        REPL = {
            "hand": "wrist",
            "hands": "wrists",
            "foot": "ankle",
            "feet": "ankles",
        }
        # \b anchors keep substrings like "football" or "handle" untouched.
        pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)

        def _sub(m):
            w = m.group(0)
            out = REPL[w.lower()]
            if w[0].isupper():
                out = out.capitalize()
            return out

        return pattern.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Convert raw {'human_part': id, 'partstate': id} dicts into
        [part name, state phrase] pairs using the part-state reference.

        Returns ``None`` as the state when no part-state key matches the
        part name (previously this raised or reused a stale value).
        """
        action_labels_in_words = []
        part_state_keys = list(self.part_state_reference.keys())
        for d in action_labels:
            human_part_id = d['human_part']
            part_state_id = d['partstate']

            part_name = PART_ORDER[human_part_id]
            # Reset per label so an unmatched part cannot inherit the
            # previous label's state or trigger UnboundLocalError.
            part_state = None
            for key in part_state_keys:
                if key in part_name:
                    states = self.part_state_reference[key]
                    part_state = states[part_state_id]

            part_name = self._replace_part_names(part_name)
            action_labels_in_words.append([part_name, part_state])
        return action_labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the (object, verb) pair recorded for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the examiner user prompt for one annotation record.

        Candidate texts are normalized with ``_replace_part_names`` before
        being shown; absent candidates are rendered as ``[Missing]``.
        """
        hoi_id = meta['hoi_id']
        obj_in_word, act_in_word = self._actionid2word(hoi_id)
        action_labels = meta['action_labels']
        action_labels_in_words = self._humanpart2word(action_labels)

        long_description = self._replace_part_names(meta.get('description', ''))
        refined_description = self._replace_part_names(meta.get('refined_description', ''))
        short_description = self._replace_part_names(meta.get('short_description', ''))
        action_description = self._replace_part_names(meta.get('action_description', ''))
        evidence_description = self._replace_part_names(meta.get('evidence_description', ''))

        prompt = f"""
Target action hint: [{act_in_word}, {obj_in_word}]
Part-state hints:
{action_labels_in_words}

Candidate long description:
{long_description if long_description else '[Missing]'}

Candidate refined description:
{refined_description if refined_description else '[Missing]'}

Candidate short description:
{short_description if short_description else '[Missing]'}

Candidate action description:
{action_description if action_description else '[Missing]'}

Candidate evidence description:
{evidence_description if evidence_description else '[Missing]'}

Check the candidates against the target action and part-state hints, produce the final checked description, and then follow the required output format exactly.
"""
        return prompt
418
+
419
  @dataclasses.dataclass
420
  class Conversation_For_Action_Pharse:
421
  def __init__(self, system='', data_path=''):
 
548
 
549
 
550
  if __name__ == "__main__":
551
+ pass
data/dataset_for_clean_descrip.py CHANGED
@@ -34,6 +34,8 @@ class PoseHICODetDataset(Dataset):
34
  """Dataset for supervised fine-tuning."""
35
  def __init__(self, data_path: str,
36
  multimodal_cfg: dict,
 
 
37
  ):
38
  super(PoseHICODetDataset, self).__init__()
39
  logging.warning("Loading data...")
@@ -43,7 +45,9 @@ class PoseHICODetDataset(Dataset):
43
  self.pixel_std = 200
44
  self.num_joints = 17
45
  self.num_joints_full_body = 136
46
- self.list_data_dict = self._load_json('./outputs/merged_labels.json')
 
 
47
 
48
  json_path = os.path.join(data_path, "Annotation/hico-det-instance-level/hico-det-training-set-instance-level.json")
49
  with open(json_path, "r", encoding="utf-8") as f:
 
34
  """Dataset for supervised fine-tuning."""
35
  def __init__(self, data_path: str,
36
  multimodal_cfg: dict,
37
+ annotation_path: str = './outputs/merged_labels.json',
38
+ max_samples: int = 0,
39
  ):
40
  super(PoseHICODetDataset, self).__init__()
41
  logging.warning("Loading data...")
 
45
  self.pixel_std = 200
46
  self.num_joints = 17
47
  self.num_joints_full_body = 136
48
+ self.list_data_dict = self._load_json(annotation_path)
49
+ if max_samples > 0:
50
+ self.list_data_dict = self.list_data_dict[:max_samples]
51
 
52
  json_path = os.path.join(data_path, "Annotation/hico-det-instance-level/hico-det-training-set-instance-level.json")
53
  with open(json_path, "r", encoding="utf-8") as f:
data/pose_hicodet.py CHANGED
@@ -34,6 +34,7 @@ class PoseHICODetDataset(Dataset):
34
  """Dataset for supervised fine-tuning."""
35
  def __init__(self, data_path: str,
36
  multimodal_cfg: dict,
 
37
  ):
38
  super(PoseHICODetDataset, self).__init__()
39
  logging.warning("Loading data...")
@@ -43,6 +44,7 @@ class PoseHICODetDataset(Dataset):
43
  self.pixel_std = 200
44
  self.num_joints = 17
45
  self.num_joints_full_body = 136
 
46
  self.list_data_dict = self._load_data(data_path)
47
 
48
 
@@ -134,6 +136,11 @@ class PoseHICODetDataset(Dataset):
134
  'hoi_obj': hoi_obj,
135
  })
136
  instance_id += 1
 
 
 
 
 
137
 
138
  logging.warning("The number of training samples is {}".format(len(list_data_dict)))
139
  logging.warning("Formatting inputs...Skip in lazy mode")
 
34
  """Dataset for supervised fine-tuning."""
35
  def __init__(self, data_path: str,
36
  multimodal_cfg: dict,
37
+ max_samples: int = 0,
38
  ):
39
  super(PoseHICODetDataset, self).__init__()
40
  logging.warning("Loading data...")
 
44
  self.pixel_std = 200
45
  self.num_joints = 17
46
  self.num_joints_full_body = 136
47
+ self.max_samples = max_samples
48
  self.list_data_dict = self._load_data(data_path)
49
 
50
 
 
136
  'hoi_obj': hoi_obj,
137
  })
138
  instance_id += 1
139
+ if self.max_samples > 0 and len(list_data_dict) >= self.max_samples:
140
+ logging.warning("Reached max_samples={}, stopping early.".format(self.max_samples))
141
+ logging.warning("The number of training samples is {}".format(len(list_data_dict)))
142
+ logging.warning("Formatting inputs...Skip in lazy mode")
143
+ return list_data_dict
144
 
145
  logging.warning("The number of training samples is {}".format(len(list_data_dict)))
146
  logging.warning("Formatting inputs...Skip in lazy mode")
scripts/examine_hico.sh ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ IDX=1
2
+ export PYTHONPATH=$PYTHONPATH:./
3
+
4
+ data_path=../datasets/HICO-Det
5
+ model_path=./model_weights/qwen3_8b_vl_instruct
6
+ annotation_path=./outputs/merged_labels.json
7
+ output_dir=outputs/examiner
8
+
9
+ if [ -d ${output_dir} ]; then
10
+ echo "dir already exists"
11
+ else
12
+ mkdir -p ${output_dir}
13
+ fi
14
+
15
+ CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=1 --master_port=25007 \
16
+ tools/examine_hico.py \
17
+ --model-path ${model_path} \
18
+ --data-path ${data_path} \
19
+ --annotation-path ${annotation_path} \
20
+ --output-dir ${output_dir} \
scripts/pipeline_hico.sh ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Three-stage HICO annotation pipeline:
#   1) first-pass long descriptions -> 2) refinement -> 3) examiner check.
# Each stage writes per-rank shards which are then merged into one JSON.
set -euo pipefail

export PYTHONPATH="${PYTHONPATH:-}:./"

# --- Shared configuration --------------------------------------------------
DATA_PATH=../datasets/HICO-Det

LONG_MODEL_PATH=./model_weights/qwen3_8b_vl_instruct
REFINE_MODEL_PATH=./model_weights/qwen3_8b_vl_instruct
EXAMINE_MODEL_PATH=./model_weights/qwen3_8b_vl_instruct

LONG_GPU_IDS=0
REFINE_GPU_IDS=0
EXAMINE_GPU_IDS=0

LONG_NPROC=1
REFINE_NPROC=1
EXAMINE_NPROC=1

LONG_OUT_DIR=outputs/pipeline/long
REFINE_OUT_DIR=outputs/pipeline/refine
EXAMINE_OUT_DIR=outputs/pipeline/examine

MERGED_LONG_JSON=outputs/pipeline/merged_long.json
MERGED_REFINE_JSON=outputs/pipeline/merged_refine.json
MERGED_EXAMINE_JSON=outputs/pipeline/merged_examine.json

mkdir -p "${LONG_OUT_DIR}" "${REFINE_OUT_DIR}" "${EXAMINE_OUT_DIR}"

# --- Stage 1: first-pass long descriptions ---------------------------------
CUDA_VISIBLE_DEVICES=${LONG_GPU_IDS} OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=${LONG_NPROC} --master_port=25011 \
    tools/annotate_hico.py \
    --model-path "${LONG_MODEL_PATH}" \
    --data-path "${DATA_PATH}" \
    --output-dir "${LONG_OUT_DIR}"

python3 tools/merge_json_outputs.py \
    --input-dir "${LONG_OUT_DIR}" \
    --pattern "labels_*.json" \
    --output-path "${MERGED_LONG_JSON}"

# --- Stage 2: refinement ---------------------------------------------------
CUDA_VISIBLE_DEVICES=${REFINE_GPU_IDS} OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=${REFINE_NPROC} --master_port=25012 \
    tools/refine_hico.py \
    --model-path "${REFINE_MODEL_PATH}" \
    --data-path "${DATA_PATH}" \
    --annotation-path "${MERGED_LONG_JSON}" \
    --output-dir "${REFINE_OUT_DIR}"

python3 tools/merge_json_outputs.py \
    --input-dir "${REFINE_OUT_DIR}" \
    --pattern "refine_labels_*.json" \
    --output-path "${MERGED_REFINE_JSON}"

# --- Stage 3: examiner check ------------------------------------------------
CUDA_VISIBLE_DEVICES=${EXAMINE_GPU_IDS} OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=${EXAMINE_NPROC} --master_port=25013 \
    tools/examine_hico.py \
    --model-path "${EXAMINE_MODEL_PATH}" \
    --data-path "${DATA_PATH}" \
    --annotation-path "${MERGED_REFINE_JSON}" \
    --output-dir "${EXAMINE_OUT_DIR}"

python3 tools/merge_json_outputs.py \
    --input-dir "${EXAMINE_OUT_DIR}" \
    --pattern "examiner_labels_*.json" \
    --output-path "${MERGED_EXAMINE_JSON}"

echo "Pipeline complete."
echo "Long descriptions: ${MERGED_LONG_JSON}"
echo "Refined descriptions: ${MERGED_REFINE_JSON}"
echo "Examiner results: ${MERGED_EXAMINE_JSON}"
scripts/refine_hico.sh ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Run the HICO description refinement stage on a single GPU.
IDX=1
export PYTHONPATH=$PYTHONPATH:./

data_path=../datasets/HICO-Det
model_path=./model_weights/qwen3_8b_vl_instruct
annotation_path=./outputs/merged_labels.json
output_dir=outputs/refine

# Quote expansions so paths with spaces don't word-split.
if [ -d "${output_dir}" ]; then
    echo "dir already exists"
else
    mkdir -p "${output_dir}"
fi

# Dangling trailing backslash after the last argument removed; it made the
# command silently consume whatever line followed it.
CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=1 --master_port=25008 \
    tools/refine_hico.py \
    --model-path "${model_path}" \
    --data-path "${data_path}" \
    --annotation-path "${annotation_path}" \
    --output-dir "${output_dir}"
tools/annotate_hico.py CHANGED
@@ -15,8 +15,7 @@ from data.convsersation import Conversation
15
  import re
16
  from dataclasses import dataclass
17
 
18
- from transformers import Qwen3VLForConditionalGeneration
19
- from transformers import AutoTokenizer, AutoConfig, AutoProcessor
20
 
21
  def disable_torch_init():
22
  """
@@ -29,6 +28,30 @@ import os, json
29
  import torch
30
  import torch.distributed as dist
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def gather_labels_and_save(labels, output_path):
33
  # Make sure dist is initialized (torchrun / deepspeed / accelerate usually does this)
34
  world_size = dist.get_world_size()
@@ -85,11 +108,11 @@ class DataCollatorForSupervisedDataset(object):
85
  tokenize=False,
86
  add_generation_prompt=True)
87
  for m in messages]
88
- batch_tensors = self.processor(
89
- text=prompts,
 
90
  images=batch_images,
91
- return_tensors="pt",
92
- padding=True
93
  )
94
  return batch_tensors, result_meta
95
 
@@ -104,48 +127,42 @@ def worker(model, processor, dataset, args, output_dir):
104
  sub_dataset = torch.utils.data.Subset(dataset, indices)
105
  batch_size = 1
106
  data_loader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path))
107
- labels = []
108
-
109
- for batch_tensors, result_meta in tqdm(data_loader):
110
-
111
- input_ids = batch_tensors['input_ids'].cuda()
112
- batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
113
- with torch.inference_mode():
114
- output_dict = model.generate(do_sample=False,
115
- output_scores=True,
116
- return_dict_in_generate=True,
117
- max_new_tokens=1600,
118
- output_logits=True,
119
- **batch_tensors,)
120
-
121
- output_ids = output_dict['sequences']
122
-
123
- for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
124
- input_token_len = input_id.shape[0]
125
- n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
126
- if n_diff_input_output > 0:
127
- print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
128
- output = processor.tokenizer.batch_decode(output_id[input_token_len:].unsqueeze(0), skip_special_tokens=True)[0]
129
- labels.append({
130
- 'file_name': meta['file_name'],
131
- 'image_id': meta['image_id'],
132
- 'instance_id': meta['instance_id'],
133
- 'keypoints': meta['joints_3d'].reshape(-1).tolist(),
134
- 'vis': meta['joints_3d_vis'].reshape(-1).tolist(),
135
- 'im_height': meta['hoi_obj']['height'],
136
- 'im_width': meta['hoi_obj']['width'],
137
- 'hoi_id': meta['hoi_obj']['hoi_id'],
138
- 'human_bbox': meta['hoi_obj']['human_bbox'],
139
- 'object_bbox': meta['hoi_obj']['object_bbox'],
140
- 'action_labels': meta['hoi_obj']['action_labels'],
141
- 'description': output,
142
- })
143
-
144
-
145
- local_rank = int(os.environ.get("LOCAL_RANK", "0"))
146
- output_path = os.path.join(args.output_dir, f'labels_{local_rank}.json')
147
- with open(output_path, "w", encoding="utf-8") as f:
148
- json.dump(labels, f, ensure_ascii=False, indent=2)
149
 
150
  def eval_model(args):
151
  torch.distributed.init_process_group(backend='nccl')
@@ -156,25 +173,22 @@ def eval_model(args):
156
  torch.cuda.set_device(rank)
157
 
158
  disable_torch_init()
159
- model = Qwen3VLForConditionalGeneration.from_pretrained(
160
- args.model_path,
161
- torch_dtype=torch.bfloat16,
162
- trust_remote_code=True
 
163
  )
 
164
  model = model.cuda()
165
  model.eval()
166
-
167
- processor = AutoProcessor.from_pretrained(
168
- args.model_path,
169
- trust_remote_code=True)
170
- processor.tokenizer.padding_side = "left"
171
- processor.tokenizer.pad_token = processor.tokenizer.eos_token
172
 
173
  dataset = PoseHICODetDataset(
174
  data_path=args.data_path,
175
  multimodal_cfg=dict(image_folder=os.path.join(args.data_path, 'Images/images/train2015'),
176
  data_augmentation=False,
177
- image_size=336,),)
 
178
  worker(model, processor, dataset, args, args.output_dir)
179
 
180
  if __name__ == "__main__":
@@ -182,7 +196,10 @@ if __name__ == "__main__":
182
  parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
183
  parser.add_argument("--data-path", type=str, default="")
184
  parser.add_argument("--output-dir", type=str, default="")
 
 
 
185
  args = parser.parse_args()
186
 
187
  eval_model(args)
188
-
 
15
  import re
16
  from dataclasses import dataclass
17
 
18
+ from tools.vlm_backend import build_batch_tensors, decode_generated_text, load_model_and_processor
 
19
 
20
  def disable_torch_init():
21
  """
 
28
  import torch
29
  import torch.distributed as dist
30
 
31
class StreamingJsonArrayWriter:
    """Context manager that streams items to disk as one JSON array.

    Every item is flushed immediately after it is written, so a run that
    dies part-way still leaves readable (and, once closed, valid) output.
    """

    def __init__(self, output_path):
        self.output_path = output_path
        self.file = None
        self.is_first = True

    def __enter__(self):
        self.file = open(self.output_path, "w", encoding="utf-8")
        self.file.write("[\n")
        self.file.flush()
        return self

    def write(self, item):
        # A comma separates this item from the previous one; the very
        # first item is written directly after the opening bracket.
        if not self.is_first:
            self.file.write(",\n")
        json.dump(item, self.file, ensure_ascii=False, indent=2)
        self.file.flush()
        self.is_first = False

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.file is not None:
            # Close the array even on error so the file stays parseable.
            self.file.write("\n]\n")
            self.file.close()
54
+
55
  def gather_labels_and_save(labels, output_path):
56
  # Make sure dist is initialized (torchrun / deepspeed / accelerate usually does this)
57
  world_size = dist.get_world_size()
 
108
  tokenize=False,
109
  add_generation_prompt=True)
110
  for m in messages]
111
+ batch_tensors = build_batch_tensors(
112
+ processor=self.processor,
113
+ prompts=batch_prompts,
114
  images=batch_images,
115
+ system_prompt=self.conv.system,
 
116
  )
117
  return batch_tensors, result_meta
118
 
 
127
  sub_dataset = torch.utils.data.Subset(dataset, indices)
128
  batch_size = 1
129
  data_loader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path))
130
+ output_path = os.path.join(args.output_dir, f'labels_{rank}.json')
131
+
132
+ with StreamingJsonArrayWriter(output_path) as writer:
133
+ for batch_tensors, result_meta in tqdm(data_loader):
134
+ input_ids = batch_tensors['input_ids'].cuda()
135
+ batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
136
+ with torch.inference_mode():
137
+ output_dict = model.generate(do_sample=False,
138
+ output_scores=True,
139
+ return_dict_in_generate=True,
140
+ max_new_tokens=1600,
141
+ output_logits=True,
142
+ **batch_tensors,)
143
+
144
+ output_ids = output_dict['sequences']
145
+
146
+ for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
147
+ input_token_len = input_id.shape[0]
148
+ n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
149
+ if n_diff_input_output > 0:
150
+ print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
151
+ output = decode_generated_text(processor, output_id, input_id)
152
+ writer.write({
153
+ 'file_name': meta['file_name'],
154
+ 'image_id': meta['image_id'],
155
+ 'instance_id': meta['instance_id'],
156
+ 'keypoints': meta['joints_3d'].reshape(-1).tolist(),
157
+ 'vis': meta['joints_3d_vis'].reshape(-1).tolist(),
158
+ 'im_height': meta['hoi_obj']['height'],
159
+ 'im_width': meta['hoi_obj']['width'],
160
+ 'hoi_id': meta['hoi_obj']['hoi_id'],
161
+ 'human_bbox': meta['hoi_obj']['human_bbox'],
162
+ 'object_bbox': meta['hoi_obj']['object_bbox'],
163
+ 'action_labels': meta['hoi_obj']['action_labels'],
164
+ 'description': output,
165
+ })
 
 
 
 
 
 
166
 
167
  def eval_model(args):
168
  torch.distributed.init_process_group(backend='nccl')
 
173
  torch.cuda.set_device(rank)
174
 
175
  disable_torch_init()
176
+ backend_name, model, processor = load_model_and_processor(
177
+ model_path=args.model_path,
178
+ backend=args.model_backend,
179
+ torch_dtype=args.torch_dtype,
180
+ trust_remote_code=True,
181
  )
182
+ print(f'Using model backend: {backend_name}')
183
  model = model.cuda()
184
  model.eval()
 
 
 
 
 
 
185
 
186
  dataset = PoseHICODetDataset(
187
  data_path=args.data_path,
188
  multimodal_cfg=dict(image_folder=os.path.join(args.data_path, 'Images/images/train2015'),
189
  data_augmentation=False,
190
+ image_size=336,),
191
+ max_samples=args.max_samples,)
192
  worker(model, processor, dataset, args, args.output_dir)
193
 
194
  if __name__ == "__main__":
 
196
  parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
197
  parser.add_argument("--data-path", type=str, default="")
198
  parser.add_argument("--output-dir", type=str, default="")
199
+ parser.add_argument("--max-samples", type=int, default=0)
200
+ parser.add_argument("--model-backend", type=str, default="auto")
201
+ parser.add_argument("--torch-dtype", type=str, default="bfloat16")
202
  args = parser.parse_args()
203
 
204
  eval_model(args)
205
+
tools/clean_initial_annotation.py CHANGED
@@ -15,8 +15,7 @@ from data.convsersation import Conversation_For_Action_Pharse as Conversation
15
  import re
16
  from dataclasses import dataclass
17
 
18
- from transformers import Qwen3VLForConditionalGeneration
19
- from transformers import AutoTokenizer, AutoConfig, AutoProcessor
20
 
21
  def disable_torch_init():
22
  """
@@ -28,6 +27,30 @@ def disable_torch_init():
28
  import os, json
29
  import torch
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  @dataclass
32
  class DataCollatorForSupervisedDataset(object):
33
  def __init__(self, processor, data_path):
@@ -62,15 +85,11 @@ class DataCollatorForSupervisedDataset(object):
62
  "text": prompt},]},
63
  ])
64
 
65
- prompts = [self.processor.apply_chat_template(m,
66
- tokenize=False,
67
- add_generation_prompt=True)
68
- for m in messages]
69
- batch_tensors = self.processor(
70
- text=prompts,
71
  images=batch_images,
72
- return_tensors="pt",
73
- padding=True
74
  )
75
  return batch_tensors, result_meta
76
 
@@ -85,39 +104,31 @@ def worker(model, processor, dataset, args, output_dir):
85
  sub_dataset = torch.utils.data.Subset(dataset, indices)
86
  batch_size = 16
87
  data_loader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path))
88
- labels = []
89
 
90
- for batch_tensors, result_meta in tqdm(data_loader):
91
-
92
- input_ids = batch_tensors['input_ids'].cuda()
93
- batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
94
- with torch.inference_mode():
95
- output_dict = model.generate(do_sample=False,
96
- output_scores=True,
97
- return_dict_in_generate=True,
98
- max_new_tokens=1600,
99
- output_logits=True,
100
- **batch_tensors,)
101
 
102
- output_ids = output_dict['sequences']
103
-
104
- for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
105
- input_token_len = input_id.shape[0]
106
- n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
107
- if n_diff_input_output > 0:
108
- print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
109
- #input_text = processor.tokenizer.batch_decode(output_id[:input_token_len].unsqueeze(0), skip_special_tokens=True)[0]
110
- output = processor.tokenizer.batch_decode(output_id[input_token_len:].unsqueeze(0), skip_special_tokens=True)[0]
111
- # print(output)
112
- # import pdb;pdb.set_trace()
113
- meta['action_description'] = output
114
- #import pdb;pdb.set_trace()
115
- labels.append(meta)
116
-
117
- local_rank = int(os.environ.get("LOCAL_RANK", "0"))
118
- output_path = os.path.join(args.output_dir, f'labels_{local_rank}.json')
119
- with open(output_path, "w", encoding="utf-8") as f:
120
- json.dump(labels, f, ensure_ascii=False, indent=2)
 
121
 
122
  def eval_model(args):
123
  torch.distributed.init_process_group(backend='nccl')
@@ -128,19 +139,15 @@ def eval_model(args):
128
  torch.cuda.set_device(rank)
129
 
130
  disable_torch_init()
131
- model = Qwen3VLForConditionalGeneration.from_pretrained(
132
- args.model_path,
133
- torch_dtype=torch.bfloat16,
134
- trust_remote_code=True
 
135
  )
 
136
  model = model.cuda()
137
  model.eval()
138
-
139
- processor = AutoProcessor.from_pretrained(
140
- args.model_path,
141
- trust_remote_code=True)
142
- processor.tokenizer.padding_side = "left"
143
- processor.tokenizer.pad_token = processor.tokenizer.eos_token
144
 
145
  dataset = PoseHICODetDataset(
146
  data_path=args.data_path,
@@ -154,7 +161,9 @@ if __name__ == "__main__":
154
  parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
155
  parser.add_argument("--data-path", type=str, default="")
156
  parser.add_argument("--output-dir", type=str, default="")
 
 
157
  args = parser.parse_args()
158
 
159
  eval_model(args)
160
-
 
15
  import re
16
  from dataclasses import dataclass
17
 
18
+ from tools.vlm_backend import build_batch_tensors, decode_generated_text, load_model_and_processor
 
19
 
20
  def disable_torch_init():
21
  """
 
27
  import os, json
28
  import torch
29
 
30
class StreamingJsonArrayWriter:
    """Incremental JSON-array writer used as a context manager.

    Output is flushed item by item so partial results survive a crash;
    ``__exit__`` appends the closing bracket to keep the file parseable.
    """

    def __init__(self, output_path):
        self.output_path = output_path
        self.file = None
        self.is_first = True

    def __enter__(self):
        self.file = open(self.output_path, "w", encoding="utf-8")
        self.file.write("[\n")
        self.file.flush()
        return self

    def write(self, item):
        if self.is_first:
            self.is_first = False
        else:
            # Every item after the first is preceded by a comma.
            self.file.write(",\n")
        json.dump(item, self.file, ensure_ascii=False, indent=2)
        self.file.flush()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.file is None:
            return
        self.file.write("\n]\n")
        self.file.close()
53
+
54
  @dataclass
55
  class DataCollatorForSupervisedDataset(object):
56
  def __init__(self, processor, data_path):
 
85
  "text": prompt},]},
86
  ])
87
 
88
+ batch_tensors = build_batch_tensors(
89
+ processor=self.processor,
90
+ prompts=batch_prompts,
 
 
 
91
  images=batch_images,
92
+ system_prompt=self.conv.system,
 
93
  )
94
  return batch_tensors, result_meta
95
 
 
104
  sub_dataset = torch.utils.data.Subset(dataset, indices)
105
  batch_size = 16
106
  data_loader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path))
107
+ output_path = os.path.join(args.output_dir, f'labels_{rank}.json')
108
 
109
+ with StreamingJsonArrayWriter(output_path) as writer:
110
+ for batch_tensors, result_meta in tqdm(data_loader):
 
 
 
 
 
 
 
 
 
111
 
112
+ input_ids = batch_tensors['input_ids'].cuda()
113
+ batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
114
+ with torch.inference_mode():
115
+ output_dict = model.generate(do_sample=False,
116
+ output_scores=True,
117
+ return_dict_in_generate=True,
118
+ max_new_tokens=1600,
119
+ output_logits=True,
120
+ **batch_tensors,)
121
+
122
+ output_ids = output_dict['sequences']
123
+
124
+ for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
125
+ input_token_len = input_id.shape[0]
126
+ n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
127
+ if n_diff_input_output > 0:
128
+ print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
129
+ output = decode_generated_text(processor, output_id, input_id)
130
+ meta['action_description'] = output
131
+ writer.write(meta)
132
 
133
  def eval_model(args):
134
  torch.distributed.init_process_group(backend='nccl')
 
139
  torch.cuda.set_device(rank)
140
 
141
  disable_torch_init()
142
+ backend_name, model, processor = load_model_and_processor(
143
+ model_path=args.model_path,
144
+ backend=args.model_backend,
145
+ torch_dtype=args.torch_dtype,
146
+ trust_remote_code=True,
147
  )
148
+ print(f'Using model backend: {backend_name}')
149
  model = model.cuda()
150
  model.eval()
 
 
 
 
 
 
151
 
152
  dataset = PoseHICODetDataset(
153
  data_path=args.data_path,
 
161
  parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
162
  parser.add_argument("--data-path", type=str, default="")
163
  parser.add_argument("--output-dir", type=str, default="")
164
+ parser.add_argument("--model-backend", type=str, default="auto")
165
+ parser.add_argument("--torch-dtype", type=str, default="bfloat16")
166
  args = parser.parse_args()
167
 
168
  eval_model(args)
169
+
tools/examine_hico.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import re
5
+ from tqdm import tqdm
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.utils.data import DataLoader
10
+
11
+ from data.dataset_for_clean_descrip import PoseHICODetDataset
12
+ from data.convsersation import Conversation_examiner as Conversation
13
+
14
+ from dataclasses import dataclass
15
+
16
+ from tools.vlm_backend import build_batch_tensors, decode_generated_text, load_model_and_processor
17
+
18
class StreamingJsonArrayWriter:
    """Context manager that emits a JSON array to disk one element at a time.

    Every element is flushed as soon as it is written, so an interrupted
    run still leaves a readable prefix of the output on disk.
    """

    def __init__(self, output_path):
        self.output_path = output_path
        self.file = None
        self.is_first = True

    def __enter__(self):
        # Open the target and emit the opening bracket immediately.
        self.file = open(self.output_path, "w", encoding="utf-8")
        self.file.write("[\n")
        self.file.flush()
        return self

    def write(self, item):
        """Append one JSON-serializable item to the array."""
        separator = "" if self.is_first else ",\n"
        self.file.write(separator)
        json.dump(item, self.file, ensure_ascii=False, indent=2)
        self.file.flush()
        self.is_first = False

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close the array even when the body raised, keeping the file valid JSON.
        if self.file is not None:
            self.file.write("\n]\n")
            self.file.close()
41
+
42
+
43
def disable_torch_init():
    """Skip torch's default weight initialisation to speed up model creation.

    Pretrained checkpoints overwrite the weights right after construction,
    so the random init done by ``reset_parameters`` is wasted work.
    """
    torch.nn.Linear.reset_parameters = lambda self: None
    torch.nn.LayerNorm.reset_parameters = lambda self: None
49
+
50
+
51
def extract_checked_description(text):
    """Pull the revised description out of an examiner response.

    The examiner is prompted to answer in the form::

        Checked description: <description> Issues: <notes>

    Returns the text between the two markers, stripped. If the trailing
    ``Issues:`` marker is missing, falls back to everything after
    ``Checked description:`` (backward-compatible robustness); returns an
    empty string when the leading marker is absent entirely.
    """
    match = re.search(
        r"Checked description:\s*(.*?)\s*Issues:\s*",
        text,
        flags=re.DOTALL,
    )
    if match is None:
        # Robustness: accept responses that omit the "Issues:" section.
        match = re.search(r"Checked description:\s*(.*)$", text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()
    return ""
60
+
61
+
62
class DataCollatorForSupervisedDataset(object):
    """Collate dataset samples into model-ready tensors plus their metadata.

    Fix: the former ``@dataclass`` decorator was a no-op here — the class
    defines ``__init__`` explicitly and declares no annotated fields — so it
    has been removed to avoid suggesting generated dataclass behavior.
    """

    def __init__(self, processor, data_path):
        self.processor = processor
        # Conversation renders the per-sample prompt from dataset metadata.
        self.conv = Conversation(
            system='',
            data_path=data_path
        )

    def __call__(self, data_dicts):
        """Turn a list of ``{'image', 'meta'}`` dicts into ``(batch_tensors, metas)``."""
        batch_prompts = []
        batch_images = []
        result_meta = []

        for data_dict in data_dicts:
            batch_images.append(data_dict['image'])
            batch_prompts.append(self.conv.get_prompt(data_dict['meta']))
            result_meta.append(data_dict['meta'])

        batch_tensors = build_batch_tensors(
            processor=self.processor,
            prompts=batch_prompts,
            images=batch_images,
            system_prompt=self.conv.system,
        )
        return batch_tensors, result_meta
88
+
89
+
90
@torch.no_grad()
def worker(model, processor, dataset, args):
    """Run examiner inference on this rank's shard and stream results to JSON.

    Each rank handles every ``world_size``-th sample and writes its own
    ``examiner_labels_<rank>.json``; shards are merged in a later step.
    """
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # Round-robin shard assignment across ranks.
    indices = list(range(rank, len(dataset), world_size))
    print("==>" + " Worker {} Started, responsible for {} images".format(rank, len(indices)))

    sub_dataset = torch.utils.data.Subset(dataset, indices)
    batch_size = args.batch_size
    data_loader = DataLoader(
        sub_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path)
    )
    output_path = os.path.join(args.output_dir, f'examiner_labels_{rank}.json')

    with StreamingJsonArrayWriter(output_path) as writer:
        for batch_tensors, result_meta in tqdm(data_loader):
            # Keep the prompt ids so the generated suffix can be split off later.
            input_ids = batch_tensors['input_ids'].cuda()
            batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
            with torch.inference_mode():
                output_dict = model.generate(
                    do_sample=False,
                    output_scores=True,
                    return_dict_in_generate=True,
                    max_new_tokens=args.max_new_tokens,
                    output_logits=True,
                    **batch_tensors,
                )
            output_ids = output_dict['sequences']

            for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
                input_token_len = input_id.shape[0]
                # Sanity check: generated sequences should begin with the prompt tokens.
                n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
                if n_diff_input_output > 0:
                    print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
                output = decode_generated_text(processor, output_id, input_id)
                meta['examiner_result'] = output
                # Parse the "Checked description: ... Issues: ..." template.
                meta['final_description'] = extract_checked_description(output)
                writer.write(meta)
132
+
133
+
134
def eval_model(args):
    """Entry point: set up distributed state, load the VLM, and run ``worker``."""
    dist.init_process_group(backend='nccl')
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    print('Init process group: world_size: {}, rank: {}'.format(world_size, rank))
    # One GPU per local rank.
    torch.cuda.set_device(rank)

    disable_torch_init()
    backend_name, model, processor = load_model_and_processor(
        model_path=args.model_path,
        backend=args.model_backend,
        torch_dtype=args.torch_dtype,
        trust_remote_code=True,
    )
    print(f'Using model backend: {backend_name}')
    model = model.cuda()
    model.eval()

    dataset = PoseHICODetDataset(
        data_path=args.data_path,
        multimodal_cfg=dict(
            # NOTE(review): HICO-DET train split layout assumed here — confirm.
            image_folder=os.path.join(args.data_path, 'Images/images/train2015'),
            data_augmentation=False,
            image_size=336,
        ),
        annotation_path=args.annotation_path,
        max_samples=args.max_samples,
    )
    worker(model, processor, dataset, args)
164
+
165
+
166
+ if __name__ == "__main__":
167
+ parser = argparse.ArgumentParser()
168
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
169
+ parser.add_argument("--data-path", type=str, default="")
170
+ parser.add_argument("--annotation-path", type=str, default="./outputs/merged_labels.json")
171
+ parser.add_argument("--output-dir", type=str, default="")
172
+ parser.add_argument("--batch-size", type=int, default=8)
173
+ parser.add_argument("--max-new-tokens", type=int, default=512)
174
+ parser.add_argument("--max-samples", type=int, default=0)
175
+ parser.add_argument("--model-backend", type=str, default="auto")
176
+ parser.add_argument("--torch-dtype", type=str, default="bfloat16")
177
+ args = parser.parse_args()
178
+
179
+ eval_model(args)
tools/merge_json_outputs.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import glob
4
+ import argparse
5
+
6
+
7
def main():
    """Concatenate per-rank JSON array shards into a single JSON list file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-dir", type=str, required=True)
    parser.add_argument("--pattern", type=str, required=True)
    parser.add_argument("--output-path", type=str, required=True)
    args = parser.parse_args()

    # Sort matched shard paths for deterministic merge order.
    input_pattern = os.path.join(args.input_dir, args.pattern)
    input_paths = sorted(glob.glob(input_pattern))
    if not input_paths:
        raise FileNotFoundError(f"No files matched pattern: {input_pattern}")

    merged = []
    for path in input_paths:
        with open(path, "r", encoding="utf-8") as handle:
            data = json.load(handle)
        # Each shard must itself be a JSON array.
        if not isinstance(data, list):
            raise ValueError(f"{path} is not a JSON list, got {type(data)}")
        merged += data

    # Create the destination directory lazily (dirname is empty for cwd paths).
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(args.output_path, "w", encoding="utf-8") as handle:
        json.dump(merged, handle, ensure_ascii=False, indent=2)

    print(f"Merged {len(input_paths)} files into {args.output_path}")
    print(f"Total items: {len(merged)}")


if __name__ == "__main__":
    main()
tools/refine_hico.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ from tqdm import tqdm
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+ from torch.utils.data import DataLoader
9
+
10
+ from data.dataset_for_clean_descrip import PoseHICODetDataset
11
+ from data.convsersation import Conversation_For_Clean_Descrption as Conversation
12
+
13
+ from dataclasses import dataclass
14
+
15
+ from tools.vlm_backend import build_batch_tensors, decode_generated_text, load_model_and_processor
16
+
17
class StreamingJsonArrayWriter:
    """Write a JSON array element-by-element, flushing after each item."""

    def __init__(self, output_path):
        self.output_path = output_path
        self.file = None
        self.is_first = True

    def __enter__(self):
        # Emit the opening bracket up front so the file is always a valid JSON prefix.
        self.file = open(self.output_path, "w", encoding="utf-8")
        self.file.write("[\n")
        self.file.flush()
        return self

    def write(self, item):
        """Serialize one item into the array."""
        if self.is_first:
            self.is_first = False
        else:
            self.file.write(",\n")
        json.dump(item, self.file, ensure_ascii=False, indent=2)
        self.file.flush()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always terminate the array, even if the body raised.
        if self.file is not None:
            self.file.write("\n]\n")
            self.file.close()
40
+
41
+
42
def disable_torch_init():
    """Make torch layer initialisation a no-op to speed up model creation.

    Safe because pretrained weights are loaded immediately afterwards.
    """
    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        layer_cls.reset_parameters = lambda self: None
48
+
49
+
50
class DataCollatorForSupervisedDataset(object):
    """Collate dataset samples into model-ready tensors plus their metadata.

    Fix: the former ``@dataclass`` decorator was a no-op — the class defines
    ``__init__`` explicitly and declares no annotated fields — so it has been
    removed to avoid implying generated dataclass behavior.
    """

    def __init__(self, processor, data_path):
        self.processor = processor
        # Conversation renders the per-sample prompt from dataset metadata.
        self.conv = Conversation(
            system='',
            data_path=data_path
        )

    def __call__(self, data_dicts):
        """Turn a list of ``{'image', 'meta'}`` dicts into ``(batch_tensors, metas)``."""
        batch_prompts = []
        batch_images = []
        result_meta = []

        for data_dict in data_dicts:
            batch_images.append(data_dict['image'])
            batch_prompts.append(self.conv.get_prompt(data_dict['meta']))
            result_meta.append(data_dict['meta'])

        batch_tensors = build_batch_tensors(
            processor=self.processor,
            prompts=batch_prompts,
            images=batch_images,
            system_prompt=self.conv.system,
        )
        return batch_tensors, result_meta
76
+
77
+
78
@torch.no_grad()
def worker(model, processor, dataset, args):
    """Generate refined descriptions for this rank's shard and stream them to JSON.

    Each rank handles every ``world_size``-th sample and writes its own
    ``refine_labels_<rank>.json``; shards are merged in a later step.
    """
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # Round-robin shard assignment across ranks.
    indices = list(range(rank, len(dataset), world_size))
    print("==>" + " Worker {} Started, responsible for {} images".format(rank, len(indices)))

    sub_dataset = torch.utils.data.Subset(dataset, indices)
    data_loader = DataLoader(
        sub_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path)
    )
    output_path = os.path.join(args.output_dir, f'refine_labels_{rank}.json')

    with StreamingJsonArrayWriter(output_path) as writer:
        for batch_tensors, result_meta in tqdm(data_loader):
            # Keep the prompt ids so the generated suffix can be split off later.
            input_ids = batch_tensors['input_ids'].cuda()
            batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
            with torch.inference_mode():
                output_dict = model.generate(
                    do_sample=False,
                    output_scores=True,
                    return_dict_in_generate=True,
                    max_new_tokens=args.max_new_tokens,
                    output_logits=True,
                    **batch_tensors,
                )
            output_ids = output_dict['sequences']

            for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
                input_token_len = input_id.shape[0]
                # Sanity check: generated sequences should begin with the prompt tokens.
                n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
                if n_diff_input_output > 0:
                    print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
                output = decode_generated_text(processor, output_id, input_id)
                meta['refined_description'] = output
                writer.write(meta)
118
+
119
+
120
def eval_model(args):
    """Entry point: set up distributed state, load the VLM, and run ``worker``."""
    dist.init_process_group(backend='nccl')
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    print('Init process group: world_size: {}, rank: {}'.format(world_size, rank))
    # One GPU per local rank.
    torch.cuda.set_device(rank)

    disable_torch_init()
    backend_name, model, processor = load_model_and_processor(
        model_path=args.model_path,
        backend=args.model_backend,
        torch_dtype=args.torch_dtype,
        trust_remote_code=True,
    )
    print(f'Using model backend: {backend_name}')
    model = model.cuda()
    model.eval()

    dataset = PoseHICODetDataset(
        data_path=args.data_path,
        multimodal_cfg=dict(
            # NOTE(review): HICO-DET train split layout assumed here — confirm.
            image_folder=os.path.join(args.data_path, 'Images/images/train2015'),
            data_augmentation=False,
            image_size=336,
        ),
        annotation_path=args.annotation_path,
        max_samples=args.max_samples,
    )
    worker(model, processor, dataset, args)
150
+
151
+
152
+ if __name__ == "__main__":
153
+ parser = argparse.ArgumentParser()
154
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
155
+ parser.add_argument("--data-path", type=str, default="")
156
+ parser.add_argument("--annotation-path", type=str, default="./outputs/merged_labels.json")
157
+ parser.add_argument("--output-dir", type=str, default="")
158
+ parser.add_argument("--batch-size", type=int, default=8)
159
+ parser.add_argument("--max-new-tokens", type=int, default=512)
160
+ parser.add_argument("--max-samples", type=int, default=0)
161
+ parser.add_argument("--model-backend", type=str, default="auto")
162
+ parser.add_argument("--torch-dtype", type=str, default="bfloat16")
163
+ args = parser.parse_args()
164
+
165
+ eval_model(args)
tools/vlm_backend.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+
4
def _get_transformers():
    """Import transformers lazily so this module can be imported without it installed."""
    import transformers
    return transformers
7
+
8
+
9
def resolve_torch_dtype(dtype_name):
    """Map a dtype name (e.g. "bfloat16") to its ``torch.dtype``, or pass "auto" through.

    Fix: the previous ``hasattr(torch, dtype_name)`` check accepted any torch
    attribute, so e.g. "nn" would return the ``torch.nn`` module instead of
    raising. Now the resolved value must actually be a ``torch.dtype``.

    Raises:
        ValueError: if ``dtype_name`` does not name a ``torch.dtype``.
    """
    import torch

    if dtype_name == "auto":
        return "auto"
    dtype = getattr(torch, dtype_name, None)
    if not isinstance(dtype, torch.dtype):
        raise ValueError(f"Unsupported torch dtype: {dtype_name}")
    return dtype
17
+
18
+
19
def infer_model_backend(model_path, backend="auto", trust_remote_code=True):
    """Pick a backend name for ``model_path``.

    An explicitly requested ``backend`` is returned untouched; "auto"
    inspects the checkpoint config (architectures + model_type) and guesses
    the model family.
    """
    if backend != "auto":
        return backend

    transformers = _get_transformers()
    config = transformers.AutoConfig.from_pretrained(
        model_path,
        trust_remote_code=trust_remote_code
    )
    arch_names = getattr(config, "architectures", None) or []
    arch_blob = " ".join(name.lower() for name in arch_names)
    type_name = str(getattr(config, "model_type", "")).lower()

    # Most specific families first; fall through to the generic HF class.
    if "qwen3vlmoe" in arch_blob or ("qwen" in type_name and "moe" in arch_blob):
        return "qwen3_vl_moe"
    if "qwen3vl" in arch_blob or ("qwen" in type_name and "vl" in type_name):
        return "qwen3_vl"
    if "llava" in arch_blob or "llava" in type_name:
        return "llava"
    if any(tag in arch_blob or tag in type_name for tag in ("deepseek", "janus")):
        return "deepseek_vl"
    return "hf_vision2seq"
41
+
42
+
43
def load_model_and_processor(
    model_path,
    backend="auto",
    torch_dtype="bfloat16",
    trust_remote_code=True,
):
    """Load a VLM checkpoint plus its processor, choosing the model class by backend.

    Returns:
        ``(backend_name, model, processor)`` — ``backend_name`` is the resolved
        backend string (useful when the caller passed "auto").

    Raises:
        ValueError: for an unknown backend name or unsupported torch dtype.
    """
    transformers = _get_transformers()
    backend = infer_model_backend(
        model_path=model_path,
        backend=backend,
        trust_remote_code=trust_remote_code,
    )
    dtype = resolve_torch_dtype(torch_dtype)

    if backend == "qwen3_vl":
        model_cls = transformers.Qwen3VLForConditionalGeneration
    elif backend == "qwen3_vl_moe":
        model_cls = transformers.Qwen3VLMoeForConditionalGeneration
    elif backend == "llava":
        # Older transformers releases may not ship a dedicated Llava class.
        model_cls = getattr(transformers, "LlavaForConditionalGeneration", None)
        if model_cls is None:
            model_cls = transformers.AutoModelForVision2Seq
    elif backend == "deepseek_vl":
        # DeepSeek multimodal checkpoints often rely on trust_remote_code and may expose
        # custom causal-LM style classes instead of Vision2Seq classes.
        model_cls = transformers.AutoModelForCausalLM
    elif backend == "hf_vision2seq":
        model_cls = transformers.AutoModelForVision2Seq
    elif backend == "hf_causal_vlm":
        model_cls = transformers.AutoModelForCausalLM
    else:
        raise ValueError(f"Unsupported model backend: {backend}")

    model = model_cls.from_pretrained(
        model_path,
        torch_dtype=dtype,
        trust_remote_code=trust_remote_code,
    )
    processor = transformers.AutoProcessor.from_pretrained(
        model_path,
        trust_remote_code=trust_remote_code,
    )
    # Normalise tokenizer padding so batched generation works out of the box.
    _configure_processor(processor)
    return backend, model, processor
87
+
88
+
89
+ def _configure_processor(processor):
90
+ tokenizer = getattr(processor, "tokenizer", None)
91
+ if tokenizer is None:
92
+ return
93
+ if getattr(tokenizer, "padding_side", None) is not None:
94
+ tokenizer.padding_side = "left"
95
+ if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
96
+ tokenizer.pad_token = tokenizer.eos_token
97
+
98
+
99
def build_batch_tensors(processor, prompts: List[str], images, system_prompt=""):
    """Render chat-formatted prompts and tokenize them together with images.

    Builds one system+user message pair per prompt, renders it through the
    processor's chat template (or the tokenizer's, as a fallback), and returns
    the processor's batched tensor dict.

    Args:
        processor: a HF processor (or tokenizer-like object); duck-typed.
        prompts: one user prompt string per sample.
        images: per-sample image inputs, passed straight to the processor.
        system_prompt: shared system message text for every sample.
    """
    messages = []
    for prompt in prompts:
        messages.append([
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": system_prompt},
                ],
            },
            {
                "role": "user",
                "content": [
                    # Placeholder; the actual image tensor is supplied below.
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            },
        ])

    rendered_prompts = []
    if hasattr(processor, "apply_chat_template"):
        rendered_prompts = [
            processor.apply_chat_template(
                message,
                tokenize=False,
                add_generation_prompt=True,
            )
            for message in messages
        ]
    else:
        # Fallback chain: tokenizer's chat template, then the raw prompt strings.
        tokenizer = getattr(processor, "tokenizer", None)
        if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"):
            rendered_prompts = [
                tokenizer.apply_chat_template(
                    message,
                    tokenize=False,
                    add_generation_prompt=True,
                )
                for message in messages
            ]
        else:
            rendered_prompts = prompts

    try:
        return processor(
            text=rendered_prompts,
            images=images,
            return_tensors="pt",
            padding=True,
        )
    except TypeError:
        # Some processors do not accept a ``padding`` kwarg; retry without it.
        return processor(
            text=rendered_prompts,
            images=images,
            return_tensors="pt",
        )
155
+
156
+
157
def decode_generated_text(processor, output_ids, prompt_input_ids):
    """Decode only the newly generated tokens, dropping the echoed prompt prefix."""
    tokenizer = getattr(processor, "tokenizer", processor)
    prompt_len = prompt_input_ids.shape[0]
    # generate() returns prompt + continuation; keep the continuation only.
    generated = output_ids[prompt_len:].unsqueeze(0)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]