Update the dataset of COCO
Browse files- data/__init__.py +0 -3
- data/__pycache__/__init__.cpython-311.pyc +0 -0
- data/__pycache__/constants.cpython-311.pyc +0 -0
- data/__pycache__/convsersation.cpython-311.pyc +0 -0
- data/__pycache__/dataset_for_clean_descrip.cpython-311.pyc +0 -0
- data/__pycache__/hicodet.cpython-311.pyc +0 -0
- data/__pycache__/pose_coco.cpython-311.pyc +0 -0
- data/__pycache__/pose_hicodet.cpython-311.pyc +0 -0
- data/convsersation.py +314 -0
- data/dataset_for_clean_descrip.py +228 -0
- data/hicodet.py +0 -294
- data/pose_coco.py +279 -0
data/__init__.py
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
from .hicodet import HICODet
|
| 2 |
-
|
| 3 |
-
__all__ = ["HICODet"]
|
|
|
|
|
|
|
|
|
|
|
|
data/__pycache__/__init__.cpython-311.pyc
CHANGED
|
Binary files a/data/__pycache__/__init__.cpython-311.pyc and b/data/__pycache__/__init__.cpython-311.pyc differ
|
|
|
data/__pycache__/constants.cpython-311.pyc
CHANGED
|
Binary files a/data/__pycache__/constants.cpython-311.pyc and b/data/__pycache__/constants.cpython-311.pyc differ
|
|
|
data/__pycache__/convsersation.cpython-311.pyc
CHANGED
|
Binary files a/data/__pycache__/convsersation.cpython-311.pyc and b/data/__pycache__/convsersation.cpython-311.pyc differ
|
|
|
data/__pycache__/dataset_for_clean_descrip.cpython-311.pyc
ADDED
|
Binary file (12.8 kB). View file
|
|
|
data/__pycache__/hicodet.cpython-311.pyc
CHANGED
|
Binary files a/data/__pycache__/hicodet.cpython-311.pyc and b/data/__pycache__/hicodet.cpython-311.pyc differ
|
|
|
data/__pycache__/pose_coco.cpython-311.pyc
ADDED
|
Binary file (14.9 kB). View file
|
|
|
data/__pycache__/pose_hicodet.cpython-311.pyc
CHANGED
|
Binary files a/data/__pycache__/pose_hicodet.cpython-311.pyc and b/data/__pycache__/pose_hicodet.cpython-311.pyc differ
|
|
|
data/convsersation.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import dataclasses
|
| 3 |
from enum import auto, Enum
|
|
@@ -93,13 +94,326 @@ class Conversation:
|
|
| 93 |
Use these cues as guidance. Only mention cues you can actually see in the image.
|
| 94 |
"""
|
| 95 |
return prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
|
|
|
|
|
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
if __name__ == "__main__":
|
| 105 |
pass
|
|
|
|
| 1 |
+
import re
|
| 2 |
import os
|
| 3 |
import dataclasses
|
| 4 |
from enum import auto, Enum
|
|
|
|
| 94 |
Use these cues as guidance. Only mention cues you can actually see in the image.
|
| 95 |
"""
|
| 96 |
return prompt
|
| 97 |
+
|
| 98 |
+
@dataclasses.dataclass
|
| 99 |
+
class Conversation_For_Clean_Descrption:
|
| 100 |
+
def __init__(self, system='', data_path=''):
|
| 101 |
+
super().__init__()
|
| 102 |
+
if system == '':
|
| 103 |
+
self.system = f"""
|
| 104 |
+
You are a strict verifier and editor for pose-grounded action descriptions.
|
| 105 |
+
|
| 106 |
+
You will be given:
|
| 107 |
+
- Ground-truth action label(s) (GT).
|
| 108 |
+
- A candidate description (may be verbose or include irrelevant evidence).
|
| 109 |
+
- A closed list of allowed keypoint/body-part names.
|
| 110 |
+
- A replacement mapping (e.g., hand→wrist, foot→ankle).
|
| 111 |
+
|
| 112 |
+
Rules:
|
| 113 |
+
1) First, check whether the candidate’s stated action matches the GT action(s).
|
| 114 |
+
2) Then rewrite the description into exactly 2–3 sentences:
|
| 115 |
+
- The first sentence must state the GT action (not the candidate action if it differs).
|
| 116 |
+
- Keep only evidence that supports the GT action; delete unrelated evidence.
|
| 117 |
+
- If a joint is mentioned both sides ALWAYS write as "left_wrist and right_wrist", "left_hip and right_hip", "left_ankle and right_ankle", etc.
|
| 118 |
+
- When mentioning body parts/keypoints, you MUST use only names from the allowed list (exact match).
|
| 119 |
+
- Apply the replacement mapping strictly; never output disallowed synonyms like “hand/foot” if they map to allowed names.
|
| 120 |
+
- Do not add new evidence; only keep/condense evidence already present in the candidate.
|
| 121 |
+
- A MUST-KEEP hint: required (joint, part_action) items that must appear in the final description (joint names may need replacement).
|
| 122 |
+
|
| 123 |
+
Output format (plain text only): The refined 2–3 sentence description.
|
| 124 |
+
No other text.
|
| 125 |
+
"""
|
| 126 |
+
else:
|
| 127 |
+
self.system = system
|
| 128 |
+
|
| 129 |
+
self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
|
| 130 |
+
self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))
|
| 131 |
+
|
| 132 |
+
def _replace_part_names(self, text):
|
| 133 |
+
REPL = {
|
| 134 |
+
"hand": "wrist",
|
| 135 |
+
"hands": "wrists",
|
| 136 |
+
"foot": "ankle",
|
| 137 |
+
"feet": "ankles",
|
| 138 |
+
}
|
| 139 |
+
pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)
|
| 140 |
+
def _sub(m):
|
| 141 |
+
w = m.group(0)
|
| 142 |
+
out = REPL[w.lower()]
|
| 143 |
+
# keep capitalization if you care
|
| 144 |
+
if w[0].isupper():
|
| 145 |
+
out = out.capitalize()
|
| 146 |
+
return out
|
| 147 |
+
return pattern.sub(_sub, text)
|
| 148 |
+
|
| 149 |
+
def _humanpart2word(self, action_labels):
|
| 150 |
+
action_labels_in_words = []
|
| 151 |
+
part_state_keys = list(self.part_state_reference.keys())
|
| 152 |
+
for d in action_labels:
|
| 153 |
+
human_part_id = d['human_part']
|
| 154 |
+
part_state_id = d['partstate']
|
| 155 |
+
|
| 156 |
+
part_name = PART_ORDER[human_part_id]
|
| 157 |
+
for key in part_state_keys:
|
| 158 |
+
if key in part_name:
|
| 159 |
+
states = self.part_state_reference[key]
|
| 160 |
+
part_state = states[part_state_id]
|
| 161 |
+
|
| 162 |
+
part_name = self._replace_part_names(part_name)
|
| 163 |
+
action_labels_in_words.append([part_name, part_state])
|
| 164 |
+
return action_labels_in_words
|
| 165 |
+
|
| 166 |
+
def _actionid2word(self, hoi_id):
|
| 167 |
+
obj, act = self.hoi_reference[hoi_id]
|
| 168 |
+
return obj, act
|
| 169 |
|
| 170 |
+
def get_prompt(self, meta):
|
| 171 |
+
hoi_id = meta['hoi_id']
|
| 172 |
+
obj_in_word, act_in_word = self._actionid2word(hoi_id)
|
| 173 |
+
action_labels = meta['action_labels']
|
| 174 |
+
action_labels_in_words = self._humanpart2word(action_labels)
|
| 175 |
|
| 176 |
+
description = meta['description']
|
| 177 |
+
description = self._replace_part_names(description)
|
| 178 |
|
| 179 |
+
prompt = f"""
|
| 180 |
+
GT action(s): {act_in_word, obj_in_word}
|
| 181 |
+
Allowed keypoint names:
|
| 182 |
+
{COCO_KEYPOINT_NAME}
|
| 183 |
+
Replacement mapping:
|
| 184 |
+
"hand" to "wrist", "foot" to "ankle"
|
| 185 |
+
Candidate description:
|
| 186 |
+
{description}
|
| 187 |
+
Must-KEEP Hint:
|
| 188 |
+
{action_labels_in_words}
|
| 189 |
+
Please follow the system rules and output in the required plain-text format.
|
| 190 |
+
"""
|
| 191 |
+
return prompt
|
| 192 |
|
| 193 |
+
@dataclasses.dataclass
|
| 194 |
+
class Conversation_For_Clean_Evidence:
|
| 195 |
+
def __init__(self, system='', data_path=''):
|
| 196 |
+
super().__init__()
|
| 197 |
+
if system == '':
|
| 198 |
+
self.system = f"""
|
| 199 |
+
You rewrite descriptions into NATURAL LANGUAGE evidence-only text.
|
| 200 |
|
| 201 |
+
Output rules:
|
| 202 |
+
- Write 2–3 complete sentences in natural English.
|
| 203 |
+
- Do NOT mention the action or the subject (no "person", "he", "she", "they", "main", etc.).
|
| 204 |
+
- Only describe evidence involving body parts/keypoints and part-level motions/contacts.
|
| 205 |
+
- Every sentence must include at least one keypoint name from the allowed list (exact match).
|
| 206 |
+
- Only use keypoint names from the allowed list; no other body-part words.
|
| 207 |
+
- Never use generic joints (e.g., "wrist", "hip", "ankle") alone; If both sides are mentioned, use "left_wrist and right_wrist", "left_hip and right_hip", etc.
|
| 208 |
+
- Apply the replacement mapping first (hand→wrist, foot→ankle, etc.), then enforce left/right by writing both sides.
|
| 209 |
+
- Keep only evidence supported by the candidate; do not add new details.
|
| 210 |
|
| 211 |
+
Style variety requirement:
|
| 212 |
+
- Write like a reasoning use normal grammar, not lists, not "keypoint: ...".
|
| 213 |
+
- Do not use the same starter phrase or the same connector in both sentences.
|
| 214 |
+
- Example reasoning patterns (Can invent your own, but use different pattern):
|
| 215 |
+
A) “With <keypoints/evidence>, <interpretation>.” (no “suggesting/indicating”)
|
| 216 |
+
B) “<Interpretation>; evidence includes <keypoints/evidence>.” (semicolon style)
|
| 217 |
+
C) “This is supported by <keypoints/evidence>, which <effect/constraint>.” (“supported by” style)
|
| 218 |
+
D) “Notably, <keypoints/evidence>; this points to <interpretation>.” (“notably/points to” style)
|
| 219 |
+
E) “<Keypoints/evidence> form(s) <configuration>, consistent with <interpretation>.” (“configuration” style)
|
| 220 |
+
"""
|
| 221 |
+
else:
|
| 222 |
+
self.system = system
|
| 223 |
+
|
| 224 |
+
self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
|
| 225 |
+
self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))
|
| 226 |
|
| 227 |
+
def _replace_part_names(self, text):
|
| 228 |
+
REPL = {
|
| 229 |
+
"hand": "wrist",
|
| 230 |
+
"hands": "wrists",
|
| 231 |
+
"foot": "ankle",
|
| 232 |
+
"feet": "ankles",
|
| 233 |
+
}
|
| 234 |
+
pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)
|
| 235 |
+
def _sub(m):
|
| 236 |
+
w = m.group(0)
|
| 237 |
+
out = REPL[w.lower()]
|
| 238 |
+
# keep capitalization if you care
|
| 239 |
+
if w[0].isupper():
|
| 240 |
+
out = out.capitalize()
|
| 241 |
+
return out
|
| 242 |
+
return pattern.sub(_sub, text)
|
| 243 |
+
|
| 244 |
+
def _humanpart2word(self, action_labels):
|
| 245 |
+
action_labels_in_words = []
|
| 246 |
+
part_state_keys = list(self.part_state_reference.keys())
|
| 247 |
+
for d in action_labels:
|
| 248 |
+
human_part_id = d['human_part']
|
| 249 |
+
part_state_id = d['partstate']
|
| 250 |
+
|
| 251 |
+
part_name = PART_ORDER[human_part_id]
|
| 252 |
+
for key in part_state_keys:
|
| 253 |
+
if key in part_name:
|
| 254 |
+
states = self.part_state_reference[key]
|
| 255 |
+
part_state = states[part_state_id]
|
| 256 |
+
|
| 257 |
+
part_name = self._replace_part_names(part_name)
|
| 258 |
+
action_labels_in_words.append([part_name, part_state])
|
| 259 |
+
return action_labels_in_words
|
| 260 |
+
|
| 261 |
+
def _actionid2word(self, hoi_id):
|
| 262 |
+
obj, act = self.hoi_reference[hoi_id]
|
| 263 |
+
return obj, act
|
| 264 |
+
|
| 265 |
+
def get_prompt(self, meta):
|
| 266 |
+
hoi_id = meta['hoi_id']
|
| 267 |
+
obj_in_word, act_in_word = self._actionid2word(hoi_id)
|
| 268 |
+
action_labels = meta['action_labels']
|
| 269 |
+
action_labels_in_words = self._humanpart2word(action_labels)
|
| 270 |
+
|
| 271 |
+
description = meta['short_description']
|
| 272 |
+
description = self._replace_part_names(description)
|
| 273 |
+
prompt = f"""
|
| 274 |
+
GT action(s): {act_in_word, obj_in_word}
|
| 275 |
+
Allowed keypoint names:
|
| 276 |
+
{COCO_KEYPOINT_NAME}
|
| 277 |
+
Replacement mapping:
|
| 278 |
+
"hand" to "wrist", "foot" to "ankle"
|
| 279 |
+
Candidate description:
|
| 280 |
+
{description}
|
| 281 |
+
Must-KEEP Hint:
|
| 282 |
+
{action_labels_in_words}
|
| 283 |
+
Please follow the system rules and output in the required plain-text format.
|
| 284 |
+
"""
|
| 285 |
+
return prompt
|
| 286 |
+
|
| 287 |
+
@dataclasses.dataclass
|
| 288 |
+
class Conversation_For_Action_Pharse:
|
| 289 |
+
def __init__(self, system='', data_path=''):
|
| 290 |
+
super().__init__()
|
| 291 |
+
if system == '':
|
| 292 |
+
self.system = f"""
|
| 293 |
+
You are a visual captioning assistant.
|
| 294 |
+
Given an image and an action hint in the form [VERB, OBJECT], output exactly one short English sentence describing that action in the image.
|
| 295 |
+
|
| 296 |
+
Rules:
|
| 297 |
+
• Use only the provided VERB and OBJECT (you may adjust grammar: holds/holding; a/the; plural if needed).
|
| 298 |
+
• Output one sentence only.
|
| 299 |
+
• No extra details (no location, colors, emotions, reasons, scene context).
|
| 300 |
+
• No punctuation beyond the final period.
|
| 301 |
+
• If the subject is a person, use “The person” (not “man/woman/boy/girl”).
|
| 302 |
+
• If the action is not visible, still output a best-effort sentence using the hint.
|
| 303 |
+
"""
|
| 304 |
+
else:
|
| 305 |
+
self.system = system
|
| 306 |
+
|
| 307 |
+
self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
|
| 308 |
+
self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))
|
| 309 |
+
|
| 310 |
+
def _replace_part_names(self, text):
|
| 311 |
+
REPL = {
|
| 312 |
+
"hand": "wrist",
|
| 313 |
+
"hands": "wrists",
|
| 314 |
+
"foot": "ankle",
|
| 315 |
+
"feet": "ankles",
|
| 316 |
+
}
|
| 317 |
+
pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)
|
| 318 |
+
def _sub(m):
|
| 319 |
+
w = m.group(0)
|
| 320 |
+
out = REPL[w.lower()]
|
| 321 |
+
# keep capitalization if you care
|
| 322 |
+
if w[0].isupper():
|
| 323 |
+
out = out.capitalize()
|
| 324 |
+
return out
|
| 325 |
+
return pattern.sub(_sub, text)
|
| 326 |
+
|
| 327 |
+
def _humanpart2word(self, action_labels):
|
| 328 |
+
action_labels_in_words = []
|
| 329 |
+
part_state_keys = list(self.part_state_reference.keys())
|
| 330 |
+
for d in action_labels:
|
| 331 |
+
human_part_id = d['human_part']
|
| 332 |
+
part_state_id = d['partstate']
|
| 333 |
+
|
| 334 |
+
part_name = PART_ORDER[human_part_id]
|
| 335 |
+
for key in part_state_keys:
|
| 336 |
+
if key in part_name:
|
| 337 |
+
states = self.part_state_reference[key]
|
| 338 |
+
part_state = states[part_state_id]
|
| 339 |
+
|
| 340 |
+
part_name = self._replace_part_names(part_name)
|
| 341 |
+
action_labels_in_words.append([part_name, part_state])
|
| 342 |
+
return action_labels_in_words
|
| 343 |
+
|
| 344 |
+
def _actionid2word(self, hoi_id):
|
| 345 |
+
obj, act = self.hoi_reference[hoi_id]
|
| 346 |
+
return obj, act
|
| 347 |
+
|
| 348 |
+
def get_prompt(self, meta):
|
| 349 |
+
hoi_id = meta['hoi_id']
|
| 350 |
+
obj_in_word, act_in_word = self._actionid2word(hoi_id)
|
| 351 |
+
action_labels = meta['action_labels']
|
| 352 |
+
action_labels_in_words = self._humanpart2word(action_labels)
|
| 353 |
+
|
| 354 |
+
description = meta['short_description']
|
| 355 |
+
description = self._replace_part_names(description)
|
| 356 |
+
prompt = f"""
|
| 357 |
+
Hints: {act_in_word, obj_in_word}
|
| 358 |
+
Write exactly one short sentence that follows the rules.
|
| 359 |
+
"""
|
| 360 |
+
return prompt
|
| 361 |
+
|
| 362 |
+
@dataclasses.dataclass
|
| 363 |
+
class Conversation_For_COCO_Long_Description:
|
| 364 |
+
def __init__(self, system='', data_path=''):
|
| 365 |
+
super().__init__()
|
| 366 |
+
if system == '':
|
| 367 |
+
self.system = f"""
|
| 368 |
+
You are an AI assistant. You will be given an image that contains a main human subject.
|
| 369 |
+
Task:
|
| 370 |
+
Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.
|
| 371 |
+
|
| 372 |
+
Hints:
|
| 373 |
+
You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.
|
| 374 |
+
|
| 375 |
+
Required Constraints:
|
| 376 |
+
- Start with ONE sentence that summarizes the main action in natural language.
|
| 377 |
+
- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
|
| 378 |
+
- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
|
| 379 |
+
- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
|
| 380 |
+
- Write your description in clear, concise sentences grounded in visible evidence.
|
| 381 |
+
|
| 382 |
+
Optional Constraints :
|
| 383 |
+
- Write naturally. Avoid repeating the same sentence pattern.
|
| 384 |
+
- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
|
| 385 |
+
"""
|
| 386 |
+
else:
|
| 387 |
+
self.system = system
|
| 388 |
+
|
| 389 |
+
self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
|
| 390 |
+
self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))
|
| 391 |
+
|
| 392 |
+
def _replace_part_names(self, text):
|
| 393 |
+
REPL = {
|
| 394 |
+
"hand": "wrist",
|
| 395 |
+
"hands": "wrists",
|
| 396 |
+
"foot": "ankle",
|
| 397 |
+
"feet": "ankles",
|
| 398 |
+
}
|
| 399 |
+
pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)
|
| 400 |
+
def _sub(m):
|
| 401 |
+
w = m.group(0)
|
| 402 |
+
out = REPL[w.lower()]
|
| 403 |
+
# keep capitalization if you care
|
| 404 |
+
if w[0].isupper():
|
| 405 |
+
out = out.capitalize()
|
| 406 |
+
return out
|
| 407 |
+
return pattern.sub(_sub, text)
|
| 408 |
+
|
| 409 |
+
def get_prompt(self, meta):
|
| 410 |
+
|
| 411 |
+
prompt = f"""
|
| 412 |
+
Hint: you may consider use the actions in the below dictionary {self.part_state_reference}
|
| 413 |
+
Given the image, describe the visual evidence (especially body parts) that supports the action.
|
| 414 |
+
"""
|
| 415 |
+
return prompt
|
| 416 |
+
|
| 417 |
|
| 418 |
if __name__ == "__main__":
|
| 419 |
pass
|
data/dataset_for_clean_descrip.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import random
|
| 6 |
+
from typing import Dict
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
from torch.utils.data import Dataset
|
| 10 |
+
from torchvision import transforms
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
import transformers
|
| 14 |
+
from pycocotools.coco import COCO
|
| 15 |
+
|
| 16 |
+
from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion
|
| 17 |
+
from .constants import COCO_KEYPOINT_NAME_TOKEN
|
| 18 |
+
|
| 19 |
+
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
|
| 20 |
+
PREFIX_IMAGE = "Image: "
|
| 21 |
+
PREFIX_NO_IMAGE = "Image: N/A"
|
| 22 |
+
BEGIN_DESCRIPTION = "<des>"
|
| 23 |
+
END_DESCRIPTION = "</des>"
|
| 24 |
+
IGNORE_INDEX = -100
|
| 25 |
+
DEFAULT_EOS_TOKEN = "</s>"
|
| 26 |
+
BEGIN_OPTIONS = "<opt>"
|
| 27 |
+
END_OPTIONS = "</opt>"
|
| 28 |
+
BEGIN_LOC = "<loc>"
|
| 29 |
+
END_LOC = "</loc>"
|
| 30 |
+
BEGIN_QUESTION = "<qes>"
|
| 31 |
+
END_QUESTION = "</qes>"
|
| 32 |
+
|
| 33 |
+
class PoseHICODetDataset(Dataset):
|
| 34 |
+
"""Dataset for supervised fine-tuning."""
|
| 35 |
+
def __init__(self, data_path: str,
|
| 36 |
+
multimodal_cfg: dict,
|
| 37 |
+
):
|
| 38 |
+
super(PoseHICODetDataset, self).__init__()
|
| 39 |
+
logging.warning("Loading data...")
|
| 40 |
+
self.multimodal_cfg = multimodal_cfg
|
| 41 |
+
self.mllm_image_size = multimodal_cfg['image_size']
|
| 42 |
+
self.aspect_ratio = 1.0
|
| 43 |
+
self.pixel_std = 200
|
| 44 |
+
self.num_joints = 17
|
| 45 |
+
self.num_joints_full_body = 136
|
| 46 |
+
self.list_data_dict = self._load_json('./outputs/merged_labels.json')
|
| 47 |
+
|
| 48 |
+
json_path = os.path.join(data_path, "Annotation/hico-det-instance-level/hico-det-training-set-instance-level.json")
|
| 49 |
+
with open(json_path, "r", encoding="utf-8") as f:
|
| 50 |
+
hoi_data = json.load(f)
|
| 51 |
+
|
| 52 |
+
self.hoi_data = hoi_data
|
| 53 |
+
|
| 54 |
+
def _load_json(self, data_path):
|
| 55 |
+
with open(data_path, 'r', encoding="utf-8") as f:
|
| 56 |
+
data_list = json.load(f)
|
| 57 |
+
return data_list
|
| 58 |
+
|
| 59 |
+
def __len__(self):
|
| 60 |
+
return len(self.list_data_dict)
|
| 61 |
+
|
| 62 |
+
def __getitem__(self, i):
|
| 63 |
+
sources = self.list_data_dict[i]
|
| 64 |
+
image = self._get_image_item(sources)
|
| 65 |
+
hoi_id = self._find_hoi_id(sources)
|
| 66 |
+
assert hoi_id != -1
|
| 67 |
+
sources['hoi_id'] = hoi_id
|
| 68 |
+
|
| 69 |
+
data_dict = {}
|
| 70 |
+
data_dict['image'] = image
|
| 71 |
+
data_dict['meta'] = sources
|
| 72 |
+
|
| 73 |
+
return data_dict
|
| 74 |
+
|
| 75 |
+
def _get_image_item(self, sources):
|
| 76 |
+
file_name = sources['file_name']
|
| 77 |
+
image_folder = self.multimodal_cfg['image_folder']
|
| 78 |
+
image_file = os.path.join(image_folder, file_name)
|
| 79 |
+
image = cv2.imread(
|
| 80 |
+
image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
|
| 81 |
+
)
|
| 82 |
+
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 83 |
+
|
| 84 |
+
# process image
|
| 85 |
+
joints = sources['keypoints']
|
| 86 |
+
joints_vis = sources['vis']
|
| 87 |
+
x1, y1, x2, y2 = sources['human_bbox']
|
| 88 |
+
w, h = x2-x1, y2-y1
|
| 89 |
+
|
| 90 |
+
c, s = self._xywh2cs(x1, y1, w, h)
|
| 91 |
+
r = 0
|
| 92 |
+
|
| 93 |
+
trans = get_affine_transform(c, s, r, (int(self.mllm_image_size), int(self.mllm_image_size)))
|
| 94 |
+
image = cv2.warpAffine(
|
| 95 |
+
image,
|
| 96 |
+
trans,
|
| 97 |
+
(int(self.mllm_image_size), int(self.mllm_image_size)),
|
| 98 |
+
flags=cv2.INTER_LINEAR)
|
| 99 |
+
|
| 100 |
+
return image
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _xywh2cs(self, x, y, w, h):
|
| 104 |
+
center = np.zeros((2), dtype=np.float32)
|
| 105 |
+
center[0] = x + w * 0.5
|
| 106 |
+
center[1] = y + h * 0.5
|
| 107 |
+
|
| 108 |
+
if w > self.aspect_ratio * h:
|
| 109 |
+
h = w * 1.0 / self.aspect_ratio
|
| 110 |
+
elif w < self.aspect_ratio * h:
|
| 111 |
+
w = h * self.aspect_ratio
|
| 112 |
+
scale = np.array(
|
| 113 |
+
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
|
| 114 |
+
dtype=np.float32)
|
| 115 |
+
if center[0] != -1:
|
| 116 |
+
# scale = scale * 1.25
|
| 117 |
+
scale = scale * 1.0
|
| 118 |
+
|
| 119 |
+
return center, scale
|
| 120 |
+
|
| 121 |
+
def _match_action_labels(self, src_action_labels, action_labels):
|
| 122 |
+
is_match = False
|
| 123 |
+
if len(src_action_labels) != len(action_labels):
|
| 124 |
+
return is_match
|
| 125 |
+
else:
|
| 126 |
+
exsistance = []
|
| 127 |
+
for new_item in src_action_labels:
|
| 128 |
+
exists = any(d.get("human_part") == new_item["human_part"] and d.get("partstate") == new_item["partstate"] for d in action_labels)
|
| 129 |
+
exsistance.append(exists)
|
| 130 |
+
is_match = all(exsistance)
|
| 131 |
+
return is_match
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _find_hoi_id(self, sources):
|
| 135 |
+
file_name = sources['file_name']
|
| 136 |
+
hoi_data = self.hoi_data[file_name]
|
| 137 |
+
hoi_labels = hoi_data['labels']
|
| 138 |
+
|
| 139 |
+
hoi_id = -1
|
| 140 |
+
src_action_labels = sources['action_labels']
|
| 141 |
+
for dic in hoi_labels:
|
| 142 |
+
action_labels = dic['action_labels']
|
| 143 |
+
#human_bbox = dic['human_bbox']
|
| 144 |
+
hoi_id = dic['hoi_id']
|
| 145 |
+
is_a_member = self._match_action_labels(src_action_labels=src_action_labels, action_labels=action_labels)
|
| 146 |
+
if is_a_member:
|
| 147 |
+
return hoi_id
|
| 148 |
+
return hoi_id
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def fliplr_joints(joints, joints_vis, width, matched_parts):
|
| 153 |
+
"""
|
| 154 |
+
flip coords
|
| 155 |
+
"""
|
| 156 |
+
# Flip horizontal
|
| 157 |
+
joints[:, 0] = width - joints[:, 0] - 1
|
| 158 |
+
|
| 159 |
+
# Change left-right parts
|
| 160 |
+
for pair in matched_parts:
|
| 161 |
+
joints[pair[0], :], joints[pair[1], :] = \
|
| 162 |
+
joints[pair[1], :], joints[pair[0], :].copy()
|
| 163 |
+
joints_vis[pair[0], :], joints_vis[pair[1], :] = \
|
| 164 |
+
joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
|
| 165 |
+
|
| 166 |
+
return joints*joints_vis, joints_vis
|
| 167 |
+
|
| 168 |
+
def transform_preds(coords, center, scale, output_size):
|
| 169 |
+
target_coords = np.zeros(coords.shape)
|
| 170 |
+
trans = get_affine_transform(center, scale, 0, output_size, inv=1)
|
| 171 |
+
for p in range(coords.shape[0]):
|
| 172 |
+
target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
|
| 173 |
+
return target_coords
|
| 174 |
+
|
| 175 |
+
def get_affine_transform(
|
| 176 |
+
center, scale, rot, output_size,
|
| 177 |
+
shift=np.array([0, 0], dtype=np.float32), inv=0
|
| 178 |
+
):
|
| 179 |
+
if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
|
| 180 |
+
print(scale)
|
| 181 |
+
scale = np.array([scale, scale])
|
| 182 |
+
|
| 183 |
+
scale_tmp = scale * 200.0
|
| 184 |
+
src_w = scale_tmp[0]
|
| 185 |
+
dst_w = output_size[0]
|
| 186 |
+
dst_h = output_size[1]
|
| 187 |
+
|
| 188 |
+
rot_rad = np.pi * rot / 180
|
| 189 |
+
src_dir = get_dir([0, src_w * -0.5], rot_rad)
|
| 190 |
+
dst_dir = np.array([0, dst_w * -0.5], np.float32)
|
| 191 |
+
|
| 192 |
+
src = np.zeros((3, 2), dtype=np.float32)
|
| 193 |
+
dst = np.zeros((3, 2), dtype=np.float32)
|
| 194 |
+
src[0, :] = center + scale_tmp * shift
|
| 195 |
+
src[1, :] = center + src_dir + scale_tmp * shift
|
| 196 |
+
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
|
| 197 |
+
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
|
| 198 |
+
|
| 199 |
+
src[2:, :] = get_3rd_point(src[0, :], src[1, :])
|
| 200 |
+
dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
|
| 201 |
+
|
| 202 |
+
if inv:
|
| 203 |
+
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
|
| 204 |
+
else:
|
| 205 |
+
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
|
| 206 |
+
|
| 207 |
+
return trans
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def affine_transform(pt, t):
|
| 211 |
+
new_pt = np.array([pt[0], pt[1], 1.]).T
|
| 212 |
+
new_pt = np.dot(t, new_pt)
|
| 213 |
+
return new_pt[:2]
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def get_3rd_point(a, b):
|
| 217 |
+
direct = a - b
|
| 218 |
+
return b + np.array([-direct[1], direct[0]], dtype=np.float32)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def get_dir(src_point, rot_rad):
|
| 222 |
+
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
|
| 223 |
+
|
| 224 |
+
src_result = [0, 0]
|
| 225 |
+
src_result[0] = src_point[0] * cs - src_point[1] * sn
|
| 226 |
+
src_result[1] = src_point[0] * sn + src_point[1] * cs
|
| 227 |
+
|
| 228 |
+
return src_result
|
data/hicodet.py
DELETED
|
@@ -1,294 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
HICODet dataset under PyTorch framework
|
| 3 |
-
|
| 4 |
-
Fred Zhang <frederic.zhang@anu.edu.au>
|
| 5 |
-
|
| 6 |
-
The Australian National University
|
| 7 |
-
Australian Centre for Robotic Vision
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import os
|
| 11 |
-
import json
|
| 12 |
-
import numpy as np
|
| 13 |
-
|
| 14 |
-
from typing import Optional, List, Callable, Tuple
|
| 15 |
-
from pocket.data import ImageDataset, DataSubset
|
| 16 |
-
|
| 17 |
-
class HICODetSubset(DataSubset):
|
| 18 |
-
def __init__(self, *args) -> None:
|
| 19 |
-
super().__init__(*args)
|
| 20 |
-
def filename(self, idx: int) -> str:
|
| 21 |
-
"""Override: return the image file name in the subset"""
|
| 22 |
-
return self._filenames[self._idx[self.pool[idx]]]
|
| 23 |
-
def image_size(self, idx: int) -> Tuple[int, int]:
|
| 24 |
-
"""Override: return the size (width, height) of an image in the subset"""
|
| 25 |
-
return self._image_sizes[self._idx[self.pool[idx]]]
|
| 26 |
-
@property
|
| 27 |
-
def anno_interaction(self) -> List[int]:
|
| 28 |
-
"""Override: Number of annotated box pairs for each interaction class"""
|
| 29 |
-
num_anno = [0 for _ in range(self.num_interation_cls)]
|
| 30 |
-
intra_idx = [self._idx[i] for i in self.pool]
|
| 31 |
-
for idx in intra_idx:
|
| 32 |
-
for hoi in self._anno[idx]['hoi']:
|
| 33 |
-
num_anno[hoi] += 1
|
| 34 |
-
return num_anno
|
| 35 |
-
@property
|
| 36 |
-
def anno_object(self) -> List[int]:
|
| 37 |
-
"""Override: Number of annotated box pairs for each object class"""
|
| 38 |
-
num_anno = [0 for _ in range(self.num_object_cls)]
|
| 39 |
-
anno_interaction = self.anno_interaction
|
| 40 |
-
for corr in self._class_corr:
|
| 41 |
-
num_anno[corr[1]] += anno_interaction[corr[0]]
|
| 42 |
-
return num_anno
|
| 43 |
-
@property
|
| 44 |
-
def anno_action(self) -> List[int]:
|
| 45 |
-
"""Override: Number of annotated box pairs for each action class"""
|
| 46 |
-
num_anno = [0 for _ in range(self.num_action_cls)]
|
| 47 |
-
anno_interaction = self.anno_interaction
|
| 48 |
-
for corr in self._class_corr:
|
| 49 |
-
num_anno[corr[2]] += anno_interaction[corr[0]]
|
| 50 |
-
return num_anno
|
| 51 |
-
|
| 52 |
-
class HICODet(ImageDataset):
    """
    HICO-DET dataset for human-object interaction detection.

    Arguments:
        root(str): Root directory where images are downloaded to
        anno_file(str): Path to json annotation file
        transform(callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version
        target_transform(callable, optional): A function/transform that takes in the
            target and transforms it
        transforms (callable, optional): A function/transform that takes input sample
            and its target as entry and returns a transformed version.
    """
    def __init__(self, root: str, anno_file: str,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            transforms: Optional[Callable] = None) -> None:
        super(HICODet, self).__init__(root, transform, target_transform, transforms)
        with open(anno_file, 'r') as f:
            anno = json.load(f)

        # Fixed class counts for the HICO-DET benchmark.
        # (Removed a leftover ``import pdb; pdb.set_trace()`` that halted
        # execution here.)
        self.num_object_cls = 80
        # NOTE: attribute name carries a historical typo ("interation");
        # kept unchanged for backward compatibility with external callers.
        self.num_interation_cls = 600
        self.num_action_cls = 117
        self._anno_file = anno_file

        # Load annotations
        self._load_annotation_and_metadata(anno)

    def __len__(self) -> int:
        """Return the number of images"""
        return len(self._idx)

    def __getitem__(self, i: int) -> tuple:
        """
        Arguments:
            i(int): Index to an image

        Returns:
            tuple[image, target]: By default, the tuple consists of a PIL image and a
                dict with the following keys:
                    "boxes_h": list[list[4]]
                    "boxes_o": list[list[4]]
                    "hoi":: list[N]
                    "verb": list[N]
                    "object": list[N]
        """
        intra_idx = self._idx[i]
        return self._transforms(
            self.load_image(os.path.join(self._root, self._filenames[intra_idx])),
            self._anno[intra_idx]
        )

    def __repr__(self) -> str:
        """Return the executable string representation"""
        reprstr = self.__class__.__name__ + '(root=' + repr(self._root)
        reprstr += ', anno_file='
        reprstr += repr(self._anno_file)
        reprstr += ')'
        # Ignore the optional arguments
        return reprstr

    def __str__(self) -> str:
        """Return the readable string representation"""
        reprstr = 'Dataset: ' + self.__class__.__name__ + '\n'
        reprstr += '\tNumber of images: {}\n'.format(self.__len__())
        reprstr += '\tImage directory: {}\n'.format(self._root)
        # Fixed: this line previously printed the image root again instead
        # of the annotation file path.
        reprstr += '\tAnnotation file: {}\n'.format(self._anno_file)
        return reprstr

    @property
    def annotations(self) -> List[dict]:
        return self._anno

    @property
    def class_corr(self) -> List[Tuple[int, int, int]]:
        """
        Class correspondence matrix in zero-based index
        [
            [hoi_idx, obj_idx, verb_idx],
            ...
        ]

        Returns:
            list[list[3]]
        """
        return self._class_corr.copy()

    @property
    def object_n_verb_to_interaction(self) -> List[list]:
        """
        The interaction classes corresponding to an object-verb pair

        HICODet.object_n_verb_to_interaction[obj_idx][verb_idx] gives interaction class
        index if the pair is valid, None otherwise

        Returns:
            list[list[117]]
        """
        lut = np.full([self.num_object_cls, self.num_action_cls], None)
        for i, j, k in self._class_corr:
            lut[j, k] = i
        return lut.tolist()

    @property
    def object_to_interaction(self) -> List[list]:
        """
        The interaction classes that involve each object type

        Returns:
            list[list]
        """
        obj_to_int = [[] for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            obj_to_int[corr[1]].append(corr[0])
        return obj_to_int

    @property
    def object_to_verb(self) -> List[list]:
        """
        The valid verbs for each object type

        Returns:
            list[list]
        """
        obj_to_verb = [[] for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            obj_to_verb[corr[1]].append(corr[2])
        return obj_to_verb

    @property
    def anno_interaction(self) -> List[int]:
        """
        Number of annotated box pairs for each interaction class

        Returns:
            list[600]
        """
        return self._num_anno.copy()

    @property
    def anno_object(self) -> List[int]:
        """
        Number of annotated box pairs for each object class

        Returns:
            list[80]
        """
        num_anno = [0 for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            num_anno[corr[1]] += self._num_anno[corr[0]]
        return num_anno

    @property
    def anno_action(self) -> List[int]:
        """
        Number of annotated box pairs for each action class

        Returns:
            list[117]
        """
        num_anno = [0 for _ in range(self.num_action_cls)]
        for corr in self._class_corr:
            num_anno[corr[2]] += self._num_anno[corr[0]]
        return num_anno

    @property
    def objects(self) -> List[str]:
        """
        Object names

        Returns:
            list[str]
        """
        return self._objects.copy()

    @property
    def verbs(self) -> List[str]:
        """
        Verb (action) names

        Returns:
            list[str]
        """
        return self._verbs.copy()

    @property
    def interactions(self) -> List[str]:
        """
        Combination of verbs and objects

        Returns:
            list[str]
        """
        # Use the underlying lists directly; the previous implementation
        # invoked the copying ``objects`` property once per interaction.
        return [self._verbs[verb] + ' ' + self._objects[obj]
                for _, obj, verb in self._class_corr]

    def split(self, ratio: float) -> Tuple[HICODetSubset, HICODetSubset]:
        """
        Split the dataset according to given ratio

        Arguments:
            ratio(float): The percentage of training set between 0 and 1
        Returns:
            train(Dataset)
            val(Dataset)
        """
        perm = np.random.permutation(len(self._idx))
        n = int(len(perm) * ratio)
        return HICODetSubset(self, perm[:n]), HICODetSubset(self, perm[n:])

    def filename(self, idx: int) -> str:
        """Return the image file name given the index"""
        return self._filenames[self._idx[idx]]

    def image_size(self, idx: int) -> Tuple[int, int]:
        """Return the size (width, height) of an image"""
        return self._image_sizes[self._idx[idx]]

    def _load_annotation_and_metadata(self, f: dict) -> None:
        """
        Populate internal annotation/metadata attributes.

        Arguments:
            f(dict): Dictionary loaded from {anno_file}.json
        """
        # Keep only images that carry at least one annotation.
        idx = list(range(len(f['filenames'])))
        for empty_idx in f['empty']:
            idx.remove(empty_idx)

        # Per-interaction-class annotation counts across the whole dataset.
        num_anno = [0 for _ in range(self.num_interation_cls)]
        for anno in f['annotation']:
            for hoi in anno['hoi']:
                num_anno[hoi] += 1

        self._idx = idx
        self._num_anno = num_anno

        self._anno = f['annotation']
        self._filenames = f['filenames']
        self._image_sizes = f['size']
        self._class_corr = f['correspondence']
        self._empty_idx = f['empty']
        self._objects = f['objects']
        self._verbs = f['verbs']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/pose_coco.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import transformers
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
from torchvision import transforms
|
| 4 |
+
import torch
|
| 5 |
+
import logging
|
| 6 |
+
import random
|
| 7 |
+
from typing import Dict
|
| 8 |
+
import os
|
| 9 |
+
import numpy as np
|
| 10 |
+
from pycocotools.coco import COCO
|
| 11 |
+
import cv2
|
| 12 |
+
|
| 13 |
+
# Sentinel tokens used when serializing multimodal conversations into
# plain-text prompts.
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"  # placeholder for one visual patch embedding
PREFIX_IMAGE = "Image: "
PREFIX_NO_IMAGE = "Image: N/A"
BEGIN_DESCRIPTION = "<des>"
END_DESCRIPTION = "</des>"
IGNORE_INDEX = -100  # presumably the loss ignore_index (PyTorch default) — confirm
DEFAULT_EOS_TOKEN = "</s>"
|
| 20 |
+
|
| 21 |
+
from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion
|
| 22 |
+
|
| 23 |
+
# Sentinel tokens specific to pose question/answer formatting.
# NOTE: the generic tokens (DEFAULT_IMAGE_PATCH_TOKEN, PREFIX_IMAGE,
# PREFIX_NO_IMAGE, BEGIN/END_DESCRIPTION, IGNORE_INDEX, DEFAULT_EOS_TOKEN)
# are defined once above; the byte-identical duplicate re-definitions that
# used to live here were removed.
BEGIN_OPTIONS = "<opt>"
END_OPTIONS = "</opt>"
BEGIN_LOC = "<loc>"
END_LOC = "</loc>"
BEGIN_QUESTION = "<qes>"
END_QUESTION = "</qes>"
|
| 36 |
+
|
| 37 |
+
class PoseCOCODataset(Dataset):
    """COCO keypoint dataset for supervised fine-tuning / evaluation.

    Loads person instances (``category_id == 1``) that carry keypoint
    annotations, derives a crop center/scale from each sanitized bounding
    box, and returns affine-warped square crops plus joint coordinates
    transformed into crop space.
    """
    def __init__(self, data_path: str,
                 multimodal_cfg: dict,
                 is_train=True,
                 is_RL=False):  # is_RL kept for interface compatibility; currently unused
        super(PoseCOCODataset, self).__init__()
        logging.warning("Loading data...")
        self.size = multimodal_cfg['image_size']  # side length of the square crop
        self.aspect_ratio = 1.0
        # Box-scale normalization constant; must match the hard-coded
        # ``scale * 200.0`` factor inside get_affine_transform.
        self.pixel_std = 200
        self.num_joints = 17  # COCO keypoint count

        coco = COCO(data_path)
        list_data_dict = []
        instance_id = 0
        for index in coco.getImgIds():
            im_ann = coco.loadImgs(index)[0]
            width = im_ann['width']
            height = im_ann['height']
            annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
            objs = coco.loadAnns(annIds)

            # Sanitize bboxes: clip to image bounds, drop degenerate boxes.
            valid_objs = []
            for obj in objs:
                x, y, w, h = obj['bbox']
                x1 = np.max((0, x))
                y1 = np.max((0, y))
                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
                if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                    obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
                    valid_objs.append(obj)

            for obj in valid_objs:
                # Keep only person instances with at least one labelled keypoint.
                if obj['category_id'] != 1:
                    continue
                if max(obj['keypoints']) == 0:
                    continue

                joints_3d = np.zeros((self.num_joints, 3), dtype=np.float32)
                joints_3d_vis = np.zeros((self.num_joints, 3), dtype=np.float32)
                for ipt in range(self.num_joints):
                    joints_3d[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                    joints_3d[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
                    # COCO visibility flag: 0 = unlabelled, 1 = occluded,
                    # 2 = visible.  Collapse to a binary labelled/unlabelled
                    # flag for the x and y channels (z channel stays 0).
                    t_vis = min(obj['keypoints'][ipt * 3 + 2], 1)
                    joints_3d_vis[ipt, 0] = t_vis
                    joints_3d_vis[ipt, 1] = t_vis

                center, scale = self._box2cs(obj['clean_bbox'][:4])
                list_data_dict.append({
                    'file_name': im_ann['file_name'],
                    'image_id': index,
                    'center': center,
                    'scale': scale,
                    'joints_3d': joints_3d,
                    'joints_3d_vis': joints_3d_vis,
                    'instance_id': instance_id,
                    'human_bbox': obj['clean_bbox']
                })
                instance_id += 1

        logging.warning("The number of training samples is {}".format(len(list_data_dict)))
        logging.warning("Formatting inputs...Skip in lazy mode")
        self.list_data_dict = list_data_dict
        self.multimodal_cfg = multimodal_cfg

        self.data_aug = False  # augmentation disabled by default (evaluation use)
        self.is_train = is_train

    def __len__(self):
        """Number of person instances (not images)."""
        return len(self.list_data_dict)

    def __getitem__(self, i):
        return self._parse_data_item_val(i)

    def _parse_data_item_val(self, i):
        """Assemble the evaluation sample dict for instance *i*."""
        sources = self.list_data_dict[i]
        image, joints, joints_vis, c, s, file_name, image_size = \
            self._get_pose_item(sources)
        return {
            'image': image,
            'image_id': sources['image_id'],
            'c': c,
            's': s,
            'joints': joints,
            'joints_vis': joints_vis,
            'file_name': file_name,
            'human_bbox': sources['human_bbox'],
            'image_size': image_size,
        }

    def _get_pose_item(self, sources):
        """Load, optionally augment, and affine-crop one instance.

        Returns (image, joints, joints_vis, center, scale, file_name, [h, w]),
        where [h, w] is the size of the original (uncropped) image.
        """
        file_name = sources['file_name']
        image_file = os.path.join(self.multimodal_cfg['image_folder'], file_name)
        image = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        orig_h, orig_w = image.shape[:2]

        # Copy cached annotation arrays before mutating them.  Fixed: the
        # previous code transformed the cached arrays in place, so repeated
        # __getitem__ calls compounded the affine transform (and flips).
        joints = sources['joints_3d'].copy()
        joints_vis = sources['joints_3d_vis'].copy()
        center = sources['center'].copy()
        scale = sources['scale']
        rotation = 0

        if self.data_aug:
            # Random scale / rotation jitter.
            sf = 0.3
            rf = 40
            scale = scale * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
            rotation = random.uniform(-rf, rf) if random.random() <= 0.5 else 0

            # Random horizontal flip.  Fixed: this used to run
            # unconditionally, which made validation outputs
            # non-deterministic.
            flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
                          [11, 12], [13, 14], [15, 16]]
            if random.random() <= 0.5:
                image = image[:, ::-1, :]
                joints, joints_vis = fliplr_joints(
                    joints, joints_vis, image.shape[1], flip_pairs)
                center[0] = image.shape[1] - center[0] - 1

        # Warp the crop defined by (center, scale, rotation) to a square of
        # side self.size and map labelled joints into crop coordinates.
        trans = get_affine_transform(
            center, scale, rotation, (int(self.size), int(self.size)))
        image = cv2.warpAffine(
            image,
            trans,
            (int(self.size), int(self.size)),
            flags=cv2.INTER_LINEAR)

        for joint_idx in range(self.num_joints):
            if joints_vis[joint_idx, 0] > 0.0:
                joints[joint_idx, 0:2] = affine_transform(
                    joints[joint_idx, 0:2], trans)

        return image, joints, joints_vis, center, scale, file_name, [orig_h, orig_w]

    def _box2cs(self, box):
        """Convert an [x, y, w, h] box to a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Compute the crop center and pixel_std-normalized scale, padding
        the box to the target aspect ratio."""
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        # Expand the shorter side so the box matches self.aspect_ratio.
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)
        # (A legacy no-op "scale = scale * 1.0" padding hook was removed.)

        return center, scale
|
| 202 |
+
|
| 203 |
+
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """Horizontally flip keypoint coordinates and swap left/right joints.

    Both arrays are modified in place; the returned joints are additionally
    masked by their visibility flags.
    """
    # Mirror x-coordinates about the vertical image axis.
    joints[:, 0] = width - joints[:, 0] - 1

    # Exchange each left/right joint pair (coordinates and visibility).
    for left, right in matched_parts:
        joints[[left, right], :] = joints[[right, left], :]
        joints_vis[[left, right], :] = joints_vis[[right, left], :]

    # Zero out joints that are not annotated/visible.
    return joints * joints_vis, joints_vis
|
| 218 |
+
|
| 219 |
+
def transform_preds(coords, center, scale, output_size):
    """Map predicted coordinates from crop space back to original image
    space using the inverse of the crop's affine transform."""
    inv_trans = get_affine_transform(center, scale, 0, output_size, inv=1)
    restored = np.zeros(coords.shape)
    for row in range(coords.shape[0]):
        restored[row, 0:2] = affine_transform(coords[row, 0:2], inv_trans)
    return restored
|
| 225 |
+
|
| 226 |
+
def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    """Build the 2x3 affine matrix mapping the rotated, scaled crop around
    *center* onto *output_size* (width, height); with ``inv=1`` the inverse
    mapping is returned instead.

    *scale* is in pixel_std units and is multiplied by the fixed factor
    200.0 below, which must match the dataset's ``pixel_std``.

    NOTE: the mutable ndarray default for *shift* is safe here because the
    array is never mutated.
    """
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        # Promote a scalar scale to an isotropic (sx, sy) pair.
        # (Removed a leftover debug ``print(scale)`` that used to fire here.)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    # Direction to a second reference point: rotated in the source image,
    # axis-aligned in the destination.
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    # Three point correspondences (an affine transform needs exactly 3):
    # the center, an offset point, and a third point completing the right
    # angle via get_3rd_point.
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def affine_transform(pt, t):
    """Apply the 2x3 affine matrix *t* to the 2-D point *pt*."""
    homogeneous = np.array([pt[0], pt[1], 1.0])
    return (t @ homogeneous)[:2]
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def get_3rd_point(a, b):
    """Return the third reference point: *b* offset by the 90-degree
    rotation of the vector from *b* to *a*."""
    delta = a - b
    perpendicular = np.array([-delta[1], delta[0]], dtype=np.float32)
    return b + perpendicular
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def get_dir(src_point, rot_rad):
    """Rotate the 2-D vector *src_point* by *rot_rad* radians
    (counter-clockwise) and return the result as a list."""
    sin_r = np.sin(rot_rad)
    cos_r = np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    return [x * cos_r - y * sin_r, x * sin_r + y * cos_r]
|