ayh015 committed on
Commit
a3cb3a7
·
1 Parent(s): e3044d8

Update the dataset of COCO

Browse files
data/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .hicodet import HICODet
2
-
3
- __all__ = ["HICODet"]
 
 
 
 
data/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/data/__pycache__/__init__.cpython-311.pyc and b/data/__pycache__/__init__.cpython-311.pyc differ
 
data/__pycache__/constants.cpython-311.pyc CHANGED
Binary files a/data/__pycache__/constants.cpython-311.pyc and b/data/__pycache__/constants.cpython-311.pyc differ
 
data/__pycache__/convsersation.cpython-311.pyc CHANGED
Binary files a/data/__pycache__/convsersation.cpython-311.pyc and b/data/__pycache__/convsersation.cpython-311.pyc differ
 
data/__pycache__/dataset_for_clean_descrip.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
data/__pycache__/hicodet.cpython-311.pyc CHANGED
Binary files a/data/__pycache__/hicodet.cpython-311.pyc and b/data/__pycache__/hicodet.cpython-311.pyc differ
 
data/__pycache__/pose_coco.cpython-311.pyc ADDED
Binary file (14.9 kB). View file
 
data/__pycache__/pose_hicodet.cpython-311.pyc CHANGED
Binary files a/data/__pycache__/pose_hicodet.cpython-311.pyc and b/data/__pycache__/pose_hicodet.cpython-311.pyc differ
 
data/convsersation.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import dataclasses
3
  from enum import auto, Enum
@@ -93,13 +94,326 @@ class Conversation:
93
  Use these cues as guidance. Only mention cues you can actually see in the image.
94
  """
95
  return prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
 
 
 
 
97
 
 
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
 
 
 
 
 
 
 
100
 
 
 
 
 
 
 
 
 
 
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  if __name__ == "__main__":
105
  pass
 
1
+ import re
2
  import os
3
  import dataclasses
4
  from enum import auto, Enum
 
94
  Use these cues as guidance. Only mention cues you can actually see in the image.
95
  """
96
  return prompt
97
+
98
@dataclasses.dataclass  # NOTE(review): no fields are declared and __init__ is hand-written;
# the decorator only contributes a trivial __repr__/__eq__ — confirm it is intentional.
class Conversation_For_Clean_Descrption:
    """Build prompts for a strict verifier/editor of pose-grounded action descriptions.

    Holds a fixed system prompt plus lookup tables (HOI id -> (object, action),
    body part -> part-state phrases) loaded from the HICO-DET config files, and
    assembles a per-sample user prompt in ``get_prompt``.
    """

    def __init__(self, system='', data_path=''):
        # Use the built-in system prompt unless the caller supplies one.
        if system == '':
            self.system = """
You are a strict verifier and editor for pose-grounded action descriptions.

You will be given:
- Ground-truth action label(s) (GT).
- A candidate description (may be verbose or include irrelevant evidence).
- A closed list of allowed keypoint/body-part names.
- A replacement mapping (e.g., hand→wrist, foot→ankle).

Rules:
1) First, check whether the candidate’s stated action matches the GT action(s).
2) Then rewrite the description into exactly 2–3 sentences:
- The first sentence must state the GT action (not the candidate action if it differs).
- Keep only evidence that supports the GT action; delete unrelated evidence.
- If a joint is mentioned both sides ALWAYS write as "left_wrist and right_wrist", "left_hip and right_hip", "left_ankle and right_ankle", etc.
- When mentioning body parts/keypoints, you MUST use only names from the allowed list (exact match).
- Apply the replacement mapping strictly; never output disallowed synonyms like “hand/foot” if they map to allowed names.
- Do not add new evidence; only keep/condense evidence already present in the candidate.
- A MUST-KEEP hint: required (joint, part_action) items that must appear in the final description (joint names may need replacement).

Output format (plain text only): The refined 2–3 sentence description.
No other text.
"""
        else:
            self.system = system

        # Reference tables; the reader helpers are defined elsewhere in this package.
        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Replace generic part words (hand/foot) with keypoint names (wrist/ankle),
        preserving a leading capital letter."""
        REPL = {
            "hand": "wrist",
            "hands": "wrists",
            "foot": "ankle",
            "feet": "ankles",
        }
        pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)

        def _sub(m):
            w = m.group(0)
            out = REPL[w.lower()]
            if w[0].isupper():
                out = out.capitalize()
            return out

        return pattern.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Convert {'human_part': id, 'partstate': id} dicts to [part_name, state] pairs.

        Fix: ``part_state`` was previously left unbound (NameError) whenever no
        part-state key occurred in the part name; it now defaults to None.
        """
        action_labels_in_words = []
        part_state_keys = list(self.part_state_reference.keys())
        for d in action_labels:
            human_part_id = d['human_part']
            part_state_id = d['partstate']

            part_name = PART_ORDER[human_part_id]
            part_state = None  # fix: safe default when no key matches below
            for key in part_state_keys:
                # Last matching key wins (original behavior preserved).
                if key in part_name:
                    states = self.part_state_reference[key]
                    part_state = states[part_state_id]

            part_name = self._replace_part_names(part_name)
            action_labels_in_words.append([part_name, part_state])
        return action_labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the (object, action) word pair for an HOI id."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Assemble the user prompt from a sample metadata dict.

        ``meta`` must carry 'hoi_id', 'action_labels' and 'description'.
        """
        hoi_id = meta['hoi_id']
        obj_in_word, act_in_word = self._actionid2word(hoi_id)
        action_labels = meta['action_labels']
        action_labels_in_words = self._humanpart2word(action_labels)

        description = meta['description']
        description = self._replace_part_names(description)

        prompt = f"""
GT action(s): {act_in_word, obj_in_word}
Allowed keypoint names:
{COCO_KEYPOINT_NAME}
Replacement mapping:
"hand" to "wrist", "foot" to "ankle"
Candidate description:
{description}
Must-KEEP Hint:
{action_labels_in_words}
Please follow the system rules and output in the required plain-text format.
"""
        return prompt
192
 
193
@dataclasses.dataclass  # NOTE(review): no fields declared, __init__ hand-written; confirm decorator is needed.
class Conversation_For_Clean_Evidence:
    """Build prompts that rewrite descriptions into evidence-only natural text.

    Same lookup tables as the other Conversation_* helpers; the per-sample
    prompt is assembled from 'short_description' rather than 'description'.
    """

    def __init__(self, system='', data_path=''):
        # Use the built-in system prompt unless the caller supplies one.
        if system == '':
            self.system = """
You rewrite descriptions into NATURAL LANGUAGE evidence-only text.

Output rules:
- Write 2–3 complete sentences in natural English.
- Do NOT mention the action or the subject (no "person", "he", "she", "they", "main", etc.).
- Only describe evidence involving body parts/keypoints and part-level motions/contacts.
- Every sentence must include at least one keypoint name from the allowed list (exact match).
- Only use keypoint names from the allowed list; no other body-part words.
- Never use generic joints (e.g., "wrist", "hip", "ankle") alone; If both sides are mentioned, use "left_wrist and right_wrist", "left_hip and right_hip", etc.
- Apply the replacement mapping first (hand→wrist, foot→ankle, etc.), then enforce left/right by writing both sides.
- Keep only evidence supported by the candidate; do not add new details.

Style variety requirement:
- Write like a reasoning use normal grammar, not lists, not "keypoint: ...".
- Do not use the same starter phrase or the same connector in both sentences.
- Example reasoning patterns (Can invent your own, but use different pattern):
A) “With <keypoints/evidence>, <interpretation>.” (no “suggesting/indicating”)
B) “<Interpretation>; evidence includes <keypoints/evidence>.” (semicolon style)
C) “This is supported by <keypoints/evidence>, which <effect/constraint>.” (“supported by” style)
D) “Notably, <keypoints/evidence>; this points to <interpretation>.” (“notably/points to” style)
E) “<Keypoints/evidence> form(s) <configuration>, consistent with <interpretation>.” (“configuration” style)
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Replace generic part words (hand/foot) with keypoint names (wrist/ankle)."""
        REPL = {
            "hand": "wrist",
            "hands": "wrists",
            "foot": "ankle",
            "feet": "ankles",
        }
        pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)

        def _sub(m):
            w = m.group(0)
            out = REPL[w.lower()]
            # Preserve a leading capital from the original word.
            if w[0].isupper():
                out = out.capitalize()
            return out

        return pattern.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Convert (human_part id, partstate id) dicts to [part_name, state] pairs.

        Fix: ``part_state`` was left unbound (NameError) when no part-state key
        matched the part name; it now defaults to None.
        """
        action_labels_in_words = []
        part_state_keys = list(self.part_state_reference.keys())
        for d in action_labels:
            human_part_id = d['human_part']
            part_state_id = d['partstate']

            part_name = PART_ORDER[human_part_id]
            part_state = None  # fix: safe default when no key matches
            for key in part_state_keys:
                if key in part_name:
                    states = self.part_state_reference[key]
                    part_state = states[part_state_id]

            part_name = self._replace_part_names(part_name)
            action_labels_in_words.append([part_name, part_state])
        return action_labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the (object, action) word pair for an HOI id."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Assemble the user prompt; ``meta`` must carry 'hoi_id',
        'action_labels' and 'short_description'."""
        hoi_id = meta['hoi_id']
        obj_in_word, act_in_word = self._actionid2word(hoi_id)
        action_labels = meta['action_labels']
        action_labels_in_words = self._humanpart2word(action_labels)

        description = meta['short_description']
        description = self._replace_part_names(description)
        prompt = f"""
GT action(s): {act_in_word, obj_in_word}
Allowed keypoint names:
{COCO_KEYPOINT_NAME}
Replacement mapping:
"hand" to "wrist", "foot" to "ankle"
Candidate description:
{description}
Must-KEEP Hint:
{action_labels_in_words}
Please follow the system rules and output in the required plain-text format.
"""
        return prompt
286
+
287
@dataclasses.dataclass  # NOTE(review): no fields declared, __init__ hand-written; confirm decorator is needed.
class Conversation_For_Action_Pharse:
    """Build single-sentence captioning prompts from a [VERB, OBJECT] hint."""

    def __init__(self, system='', data_path=''):
        # Use the built-in system prompt unless the caller supplies one.
        if system == '':
            self.system = """
You are a visual captioning assistant.
Given an image and an action hint in the form [VERB, OBJECT], output exactly one short English sentence describing that action in the image.

Rules:
• Use only the provided VERB and OBJECT (you may adjust grammar: holds/holding; a/the; plural if needed).
• Output one sentence only.
• No extra details (no location, colors, emotions, reasons, scene context).
• No punctuation beyond the final period.
• If the subject is a person, use “The person” (not “man/woman/boy/girl”).
• If the action is not visible, still output a best-effort sentence using the hint.
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Replace generic part words (hand/foot) with keypoint names (wrist/ankle)."""
        REPL = {
            "hand": "wrist",
            "hands": "wrists",
            "foot": "ankle",
            "feet": "ankles",
        }
        pattern = re.compile(r"\b(" + "|".join(map(re.escape, REPL.keys())) + r")\b", re.IGNORECASE)

        def _sub(m):
            w = m.group(0)
            out = REPL[w.lower()]
            # Preserve a leading capital from the original word.
            if w[0].isupper():
                out = out.capitalize()
            return out

        return pattern.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Convert (human_part id, partstate id) dicts to [part_name, state] pairs.

        Fix: ``part_state`` was left unbound (NameError) when no part-state key
        matched the part name; it now defaults to None.
        """
        action_labels_in_words = []
        part_state_keys = list(self.part_state_reference.keys())
        for d in action_labels:
            human_part_id = d['human_part']
            part_state_id = d['partstate']

            part_name = PART_ORDER[human_part_id]
            part_state = None  # fix: safe default when no key matches
            for key in part_state_keys:
                if key in part_name:
                    states = self.part_state_reference[key]
                    part_state = states[part_state_id]

            part_name = self._replace_part_names(part_name)
            action_labels_in_words.append([part_name, part_state])
        return action_labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the (object, action) word pair for an HOI id."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Return the one-sentence captioning prompt for the sample's HOI id.

        Fix: the original also computed part-state words and a cleaned
        'short_description' that were never used in the prompt (and raised
        KeyError on samples lacking those fields); the dead work is removed.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        prompt = f"""
Hints: {act_in_word, obj_in_word}
Write exactly one short sentence that follows the rules.
"""
        return prompt
361
+
362
@dataclasses.dataclass
class Conversation_For_COCO_Long_Description:
    """Prompt builder for long, body-part-grounded action descriptions on COCO.

    Stores a system prompt (either the built-in one or a caller-supplied
    override) and the HICO-DET reference tables used as hints.
    """

    def __init__(self, system='', data_path=''):
        if system == '':
            self.system = f"""
You are an AI assistant. You will be given an image that contains a main human subject.
Task:
Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

Hints:
You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

Required Constraints:
- Start with ONE sentence that summarizes the main action in natural language.
- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
- Write your description in clear, concise sentences grounded in visible evidence.

Optional Constraints :
- Write naturally. Avoid repeating the same sentence pattern.
- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Swap generic part words for keypoint names (hand→wrist, foot→ankle),
        keeping a leading capital letter intact."""
        replacements = {
            "hand": "wrist",
            "hands": "wrists",
            "foot": "ankle",
            "feet": "ankles",
        }
        alternation = "|".join(map(re.escape, replacements))
        word_re = re.compile(r"\b(" + alternation + r")\b", re.IGNORECASE)

        def swap(match):
            original = match.group(0)
            replacement = replacements[original.lower()]
            return replacement.capitalize() if original[0].isupper() else replacement

        return word_re.sub(swap, text)

    def get_prompt(self, meta):
        """Return the user prompt; ``meta`` is accepted for interface parity
        but not consulted here."""

        prompt = f"""
Hint: you may consider use the actions in the below dictionary {self.part_state_reference}
Given the image, describe the visual evidence (especially body parts) that supports the action.
"""
        return prompt
416
+
417
 
418
  if __name__ == "__main__":
419
  pass
data/dataset_for_clean_descrip.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import logging
5
+ import random
6
+ from typing import Dict
7
+
8
+ import torch
9
+ from torch.utils.data import Dataset
10
+ from torchvision import transforms
11
+ import numpy as np
12
+
13
+ import transformers
14
+ from pycocotools.coco import COCO
15
+
16
+ from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion
17
+ from .constants import COCO_KEYPOINT_NAME_TOKEN
18
+
19
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
20
+ PREFIX_IMAGE = "Image: "
21
+ PREFIX_NO_IMAGE = "Image: N/A"
22
+ BEGIN_DESCRIPTION = "<des>"
23
+ END_DESCRIPTION = "</des>"
24
+ IGNORE_INDEX = -100
25
+ DEFAULT_EOS_TOKEN = "</s>"
26
+ BEGIN_OPTIONS = "<opt>"
27
+ END_OPTIONS = "</opt>"
28
+ BEGIN_LOC = "<loc>"
29
+ END_LOC = "</loc>"
30
+ BEGIN_QUESTION = "<qes>"
31
+ END_QUESTION = "</qes>"
32
+
33
class PoseHICODetDataset(Dataset):
    """Dataset for supervised fine-tuning on HICO-DET pose/HOI samples.

    Each item is a dict with the person crop (warped to the MLLM input size)
    under 'image' and the raw sample metadata (plus resolved 'hoi_id') under
    'meta'.
    """

    def __init__(self, data_path: str,
                 multimodal_cfg: dict,
                 ):
        super(PoseHICODetDataset, self).__init__()
        logging.warning("Loading data...")
        self.multimodal_cfg = multimodal_cfg
        self.mllm_image_size = multimodal_cfg['image_size']
        self.aspect_ratio = 1.0
        self.pixel_std = 200  # scale normalisation constant used by _xywh2cs
        self.num_joints = 17
        self.num_joints_full_body = 136
        # NOTE(review): hard-coded relative path — confirm it matches the launch dir.
        self.list_data_dict = self._load_json('./outputs/merged_labels.json')

        json_path = os.path.join(data_path, "Annotation/hico-det-instance-level/hico-det-training-set-instance-level.json")
        with open(json_path, "r", encoding="utf-8") as f:
            hoi_data = json.load(f)

        self.hoi_data = hoi_data

    def _load_json(self, data_path):
        """Load and return the JSON document at *data_path*."""
        with open(data_path, 'r', encoding="utf-8") as f:
            data_list = json.load(f)
        return data_list

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i):
        sources = self.list_data_dict[i]
        image = self._get_image_item(sources)
        hoi_id = self._find_hoi_id(sources)
        # NOTE(review): assert is stripped under `python -O`; consider raising.
        assert hoi_id != -1
        sources['hoi_id'] = hoi_id

        data_dict = {}
        data_dict['image'] = image
        data_dict['meta'] = sources

        return data_dict

    def _get_image_item(self, sources):
        """Read the sample image and warp the annotated human box to a square
        mllm_image_size crop (RGB)."""
        file_name = sources['file_name']
        image_folder = self.multimodal_cfg['image_folder']
        image_file = os.path.join(image_folder, file_name)
        image = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )
        if image is None:
            # Fix: cv2.imread returns None on a missing/corrupt file; fail with
            # a clear error instead of an opaque crash inside cvtColor.
            raise FileNotFoundError(f"Failed to read image: {image_file}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Fix: removed unused reads of sources['keypoints'] / sources['vis'] —
        # they were never consumed here.
        x1, y1, x2, y2 = sources['human_bbox']
        w, h = x2 - x1, y2 - y1

        c, s = self._xywh2cs(x1, y1, w, h)
        r = 0  # no rotation augmentation

        trans = get_affine_transform(c, s, r, (int(self.mllm_image_size), int(self.mllm_image_size)))
        image = cv2.warpAffine(
            image,
            trans,
            (int(self.mllm_image_size), int(self.mllm_image_size)),
            flags=cv2.INTER_LINEAR)

        return image

    def _xywh2cs(self, x, y, w, h):
        """Convert an xywh box to (center, scale), padding the box to the
        dataset aspect ratio; scale is expressed in pixel_std units."""
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)
        if center[0] != -1:
            # Enlargement factor deliberately disabled (1.25 in other configs).
            scale = scale * 1.0

        return center, scale

    def _match_action_labels(self, src_action_labels, action_labels):
        """Return True iff both lists contain the same (human_part, partstate)
        pairs (same length, order-insensitive)."""
        if len(src_action_labels) != len(action_labels):
            return False
        # Fix: simplified the accumulate-then-all() loop (and its misspelled
        # 'exsistance' list) into a direct all(any(...)) expression.
        return all(
            any(d.get("human_part") == new_item["human_part"]
                and d.get("partstate") == new_item["partstate"]
                for d in action_labels)
            for new_item in src_action_labels
        )

    def _find_hoi_id(self, sources):
        """Return the hoi_id of the first candidate whose action labels match
        this sample's labels."""
        file_name = sources['file_name']
        hoi_data = self.hoi_data[file_name]
        hoi_labels = hoi_data['labels']

        hoi_id = -1
        src_action_labels = sources['action_labels']
        for dic in hoi_labels:
            action_labels = dic['action_labels']
            hoi_id = dic['hoi_id']
            if self._match_action_labels(src_action_labels=src_action_labels, action_labels=action_labels):
                return hoi_id
        # NOTE(review): when nothing matches this returns the LAST candidate's
        # hoi_id rather than -1 (original behavior preserved) — confirm intent.
        return hoi_id
149
+
150
+
151
+
152
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """Horizontally flip joint coordinates in place.

    Mirrors the x coordinates about the image width, swaps each left/right
    keypoint pair, and masks the result by visibility.
    """
    # Mirror x about the image width.
    joints[:, 0] = width - joints[:, 0] - 1

    # Swap every left/right pair (both coordinates and visibility).
    for left, right in matched_parts:
        tmp = joints[left, :].copy()
        joints[left, :] = joints[right, :]
        joints[right, :] = tmp

        tmp_vis = joints_vis[left, :].copy()
        joints_vis[left, :] = joints_vis[right, :]
        joints_vis[right, :] = tmp_vis

    # Zero-out coordinates of invisible joints.
    return joints * joints_vis, joints_vis
167
+
168
def transform_preds(coords, center, scale, output_size):
    """Map predicted coordinates from model-output space back to the original
    image space using the inverse affine transform."""
    inv_trans = get_affine_transform(center, scale, 0, output_size, inv=1)
    target_coords = np.zeros(coords.shape)
    for row in range(coords.shape[0]):
        target_coords[row, 0:2] = affine_transform(coords[row, 0:2], inv_trans)
    return target_coords
174
+
175
def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    """Build the 2x3 affine matrix mapping a (center, scale, rot) crop to
    output_size (or its inverse when inv is truthy).

    Arguments:
        center: (x, y) crop center in source-image pixels.
        scale: box scale in pixel_std units (scalar, list or ndarray).
        rot: rotation in degrees.
        output_size: (width, height) of the destination image.
        shift: fractional shift of the crop (never mutated despite being a
            mutable default).
        inv: when truthy, return the dst->src transform instead.
    """
    # Accept a scalar scale for convenience.
    # Fix: removed a leftover debug print(scale) on this path.
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        scale = np.array([scale, scale])

    # 200.0 is the conventional pixel_std — must match the value used when the
    # scale was computed (see _xywh2cs).
    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    # Three point pairs define the affine map: center, a rotated point above
    # the center, and a third point completing the right angle.
    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans
+ return trans
208
+
209
+
210
def affine_transform(pt, t):
    """Apply the 2x3 affine matrix *t* to the 2-D point *pt*."""
    homogeneous = np.array([pt[0], pt[1], 1.0])
    transformed = t @ homogeneous
    return transformed[:2]
214
+
215
+
216
def get_3rd_point(a, b):
    """Return the third point completing a right angle at *b*: *b* plus the
    90°-rotated vector from *b* to *a*."""
    direction = a - b
    perpendicular = np.array([-direction[1], direction[0]], dtype=np.float32)
    return b + perpendicular
219
+
220
+
221
def get_dir(src_point, rot_rad):
    """Rotate the 2-D point *src_point* by *rot_rad* radians about the origin,
    returning [x', y']."""
    sin_r, cos_r = np.sin(rot_rad), np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    return [x * cos_r - y * sin_r, x * sin_r + y * cos_r]
data/hicodet.py DELETED
@@ -1,294 +0,0 @@
1
- """
2
- HICODet dataset under PyTorch framework
3
-
4
- Fred Zhang <frederic.zhang@anu.edu.au>
5
-
6
- The Australian National University
7
- Australian Centre for Robotic Vision
8
- """
9
-
10
- import os
11
- import json
12
- import numpy as np
13
-
14
- from typing import Optional, List, Callable, Tuple
15
- from pocket.data import ImageDataset, DataSubset
16
-
17
class HICODetSubset(DataSubset):
    """Subset view over HICODet; indices are translated through self.pool."""

    def __init__(self, *args) -> None:
        super().__init__(*args)

    def filename(self, idx: int) -> str:
        """Override: return the image file name in the subset."""
        return self._filenames[self._idx[self.pool[idx]]]

    def image_size(self, idx: int) -> Tuple[int, int]:
        """Override: return the size (width, height) of an image in the subset."""
        return self._image_sizes[self._idx[self.pool[idx]]]

    @property
    def anno_interaction(self) -> List[int]:
        """Override: number of annotated box pairs for each interaction class."""
        counts = [0] * self.num_interation_cls
        for dataset_idx in (self._idx[i] for i in self.pool):
            for hoi in self._anno[dataset_idx]['hoi']:
                counts[hoi] += 1
        return counts

    @property
    def anno_object(self) -> List[int]:
        """Override: number of annotated box pairs for each object class."""
        counts = [0] * self.num_object_cls
        per_interaction = self.anno_interaction
        for hoi_idx, obj_idx, _ in self._class_corr:
            counts[obj_idx] += per_interaction[hoi_idx]
        return counts

    @property
    def anno_action(self) -> List[int]:
        """Override: number of annotated box pairs for each action class."""
        counts = [0] * self.num_action_cls
        per_interaction = self.anno_interaction
        for hoi_idx, _, verb_idx in self._class_corr:
            counts[verb_idx] += per_interaction[hoi_idx]
        return counts
51
-
52
class HICODet(ImageDataset):
    """
    HICO-DET dataset under the PyTorch framework.

    Arguments:
        root(str): Root directory where images are downloaded to
        anno_file(str): Path to json annotation file
        transform(callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version
        target_transform(callable, optional): A function/transform that takes in the
            target and transforms it
        transforms (callable, optional): A function/transform that takes input sample
            and its target as entry and returns a transformed version.
    """
    def __init__(self, root: str, anno_file: str,
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None,
                 transforms: Optional[Callable] = None) -> None:
        super(HICODet, self).__init__(root, transform, target_transform, transforms)
        with open(anno_file, 'r') as f:
            anno = json.load(f)

        # Fix: removed a leftover debugger breakpoint ("import pdb; pdb.set_trace()")
        # that froze every construction of this dataset.
        self.num_object_cls = 80
        self.num_interation_cls = 600  # (sic) spelling kept — subclasses reference it
        self.num_action_cls = 117
        self._anno_file = anno_file

        # Load annotations
        self._load_annotation_and_metadata(anno)

    def __len__(self) -> int:
        """Return the number of (non-empty) images."""
        return len(self._idx)

    def __getitem__(self, i: int) -> tuple:
        """
        Arguments:
            i(int): Index to an image

        Returns:
            tuple[image, target]: By default, the tuple consists of a PIL image and a
                dict with the following keys:
                    "boxes_h": list[list[4]]
                    "boxes_o": list[list[4]]
                    "hoi": list[N]
                    "verb": list[N]
                    "object": list[N]
        """
        intra_idx = self._idx[i]
        return self._transforms(
            self.load_image(os.path.join(self._root, self._filenames[intra_idx])),
            self._anno[intra_idx]
        )

    def __repr__(self) -> str:
        """Return the executable string representation."""
        reprstr = self.__class__.__name__ + '(root=' + repr(self._root)
        reprstr += ', anno_file='
        reprstr += repr(self._anno_file)
        reprstr += ')'
        # Ignore the optional arguments
        return reprstr

    def __str__(self) -> str:
        """Return the readable string representation."""
        reprstr = 'Dataset: ' + self.__class__.__name__ + '\n'
        reprstr += '\tNumber of images: {}\n'.format(self.__len__())
        reprstr += '\tImage directory: {}\n'.format(self._root)
        # Fix: previously printed the image root here instead of the annotation file.
        reprstr += '\tAnnotation file: {}\n'.format(self._anno_file)
        return reprstr

    @property
    def annotations(self) -> List[dict]:
        """Raw per-image annotation dicts."""
        return self._anno

    @property
    def class_corr(self) -> List[Tuple[int, int, int]]:
        """
        Class correspondence matrix in zero-based index
        [
            [hoi_idx, obj_idx, verb_idx],
            ...
        ]

        Returns:
            list[list[3]]
        """
        return self._class_corr.copy()

    @property
    def object_n_verb_to_interaction(self) -> List[list]:
        """
        The interaction classes corresponding to an object-verb pair

        HICODet.object_n_verb_to_interaction[obj_idx][verb_idx] gives interaction class
        index if the pair is valid, None otherwise

        Returns:
            list[list[117]]
        """
        lut = np.full([self.num_object_cls, self.num_action_cls], None)
        for i, j, k in self._class_corr:
            lut[j, k] = i
        return lut.tolist()

    @property
    def object_to_interaction(self) -> List[list]:
        """
        The interaction classes that involve each object type

        Returns:
            list[list]
        """
        obj_to_int = [[] for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            obj_to_int[corr[1]].append(corr[0])
        return obj_to_int

    @property
    def object_to_verb(self) -> List[list]:
        """
        The valid verbs for each object type

        Returns:
            list[list]
        """
        obj_to_verb = [[] for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            obj_to_verb[corr[1]].append(corr[2])
        return obj_to_verb

    @property
    def anno_interaction(self) -> List[int]:
        """
        Number of annotated box pairs for each interaction class

        Returns:
            list[600]
        """
        return self._num_anno.copy()

    @property
    def anno_object(self) -> List[int]:
        """
        Number of annotated box pairs for each object class

        Returns:
            list[80]
        """
        num_anno = [0 for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            num_anno[corr[1]] += self._num_anno[corr[0]]
        return num_anno

    @property
    def anno_action(self) -> List[int]:
        """
        Number of annotated box pairs for each action class

        Returns:
            list[117]
        """
        num_anno = [0 for _ in range(self.num_action_cls)]
        for corr in self._class_corr:
            num_anno[corr[2]] += self._num_anno[corr[0]]
        return num_anno

    @property
    def objects(self) -> List[str]:
        """
        Object names

        Returns:
            list[str]
        """
        return self._objects.copy()

    @property
    def verbs(self) -> List[str]:
        """
        Verb (action) names

        Returns:
            list[str]
        """
        return self._verbs.copy()

    @property
    def interactions(self) -> List[str]:
        """
        Combination of verbs and objects

        Returns:
            list[str]
        """
        return [self._verbs[j] + ' ' + self.objects[i]
                for _, i, j in self._class_corr]

    def split(self, ratio: float) -> Tuple[HICODetSubset, HICODetSubset]:
        """
        Split the dataset according to given ratio

        Arguments:
            ratio(float): The percentage of training set between 0 and 1
        Returns:
            train(Dataset)
            val(Dataset)
        """
        perm = np.random.permutation(len(self._idx))
        n = int(len(perm) * ratio)
        return HICODetSubset(self, perm[:n]), HICODetSubset(self, perm[n:])

    def filename(self, idx: int) -> str:
        """Return the image file name given the index"""
        return self._filenames[self._idx[idx]]

    def image_size(self, idx: int) -> Tuple[int, int]:
        """Return the size (width, height) of an image"""
        return self._image_sizes[self._idx[idx]]

    def _load_annotation_and_metadata(self, f: dict) -> None:
        """
        Arguments:
            f(dict): Dictionary loaded from {anno_file}.json
        """
        # Keep only indices of images that have annotations.
        idx = list(range(len(f['filenames'])))
        for empty_idx in f['empty']:
            idx.remove(empty_idx)

        # Count annotated box pairs per interaction class.
        num_anno = [0 for _ in range(self.num_interation_cls)]
        for anno in f['annotation']:
            for hoi in anno['hoi']:
                num_anno[hoi] += 1

        self._idx = idx
        self._num_anno = num_anno

        self._anno = f['annotation']
        self._filenames = f['filenames']
        self._image_sizes = f['size']
        self._class_corr = f['correspondence']
        self._empty_idx = f['empty']
        self._objects = f['objects']
        self._verbs = f['verbs']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/pose_coco.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ from torch.utils.data import Dataset
3
+ from torchvision import transforms
4
+ import torch
5
+ import logging
6
+ import random
7
+ from typing import Dict
8
+ import os
9
+ import numpy as np
10
+ from pycocotools.coco import COCO
11
+ import cv2
12
+
13
# NOTE: the special-token constants below were previously defined twice in
# this module (the first group was repeated verbatim); the duplicate copy
# has been removed and the project-local import moved ahead of the group.
from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion

# Special tokens / markers used when building multimodal prompts.
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
PREFIX_IMAGE = "Image: "
PREFIX_NO_IMAGE = "Image: N/A"
BEGIN_DESCRIPTION = "<des>"
END_DESCRIPTION = "</des>"
IGNORE_INDEX = -100          # label value ignored by the loss
DEFAULT_EOS_TOKEN = "</s>"
BEGIN_OPTIONS = "<opt>"
END_OPTIONS = "</opt>"
BEGIN_LOC = "<loc>"
END_LOC = "</loc>"
BEGIN_QUESTION = "<qes>"
END_QUESTION = "</qes>"
36
+
37
class PoseCOCODataset(Dataset):
    """COCO keypoint dataset for supervised fine-tuning / evaluation.

    Loads person instances that carry keypoint annotations from a
    COCO-format annotation file and yields, per instance, a cropped
    affine-normalized person image together with its transformed joints
    and the metadata needed to map predictions back to the original image.
    """

    def __init__(self, data_path: str,
                 multimodal_cfg: dict,
                 is_train=True,
                 is_RL=False
                 ):
        """
        Arguments:
            data_path: path to a COCO-format keypoint annotation JSON.
            multimodal_cfg: must contain 'image_size' (side length of the
                square crop) and 'image_folder' (root directory of images).
            is_train: stored for interface compatibility; augmentation is
                governed by ``self.data_aug`` (False by default).
            is_RL: unused here; kept for interface compatibility.
        """
        super(PoseCOCODataset, self).__init__()
        logging.warning("Loading data...")
        self.size = multimodal_cfg['image_size']
        self.aspect_ratio = 1.0   # target width / height of the crop box
        self.pixel_std = 200      # COCO convention: scale is in units of 200 px
        self.num_joints = 17      # COCO keypoint layout

        coco = COCO(data_path)
        list_data_dict = []
        instance_id = 0
        for index in coco.getImgIds():
            im_ann = coco.loadImgs(index)[0]
            width = im_ann['width']
            height = im_ann['height']
            annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
            objs = coco.loadAnns(annIds)

            # Sanitize bboxes: clip to image bounds, drop degenerate boxes.
            valid_objs = []
            for obj in objs:
                x, y, w, h = obj['bbox']
                x1 = np.max((0, x))
                y1 = np.max((0, y))
                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
                if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                    obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
                    valid_objs.append(obj)
            objs = valid_objs

            for obj in objs:
                # Keep only the 'person' category (COCO category_id 1).
                if obj['category_id'] != 1:
                    continue
                # Skip instances without any labeled keypoint.
                if max(obj['keypoints']) == 0:
                    continue

                joints_3d = np.zeros((self.num_joints, 3), dtype=np.float32)
                joints_3d_vis = np.zeros((self.num_joints, 3), dtype=np.float32)
                for ipt in range(self.num_joints):
                    joints_3d[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                    joints_3d[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
                    joints_3d[ipt, 2] = 0
                    # COCO visibility: 0 unlabeled, 1 occluded, 2 visible.
                    # Both 1 and 2 count as annotated here.
                    t_vis = obj['keypoints'][ipt * 3 + 2]
                    if t_vis > 1:
                        t_vis = 1
                    joints_3d_vis[ipt, 0] = t_vis
                    joints_3d_vis[ipt, 1] = t_vis
                    joints_3d_vis[ipt, 2] = 0

                center, scale = self._box2cs(obj['clean_bbox'][:4])
                list_data_dict.append({
                    'file_name': im_ann['file_name'],
                    'image_id': index,
                    'center': center,
                    'scale': scale,
                    'joints_3d': joints_3d,
                    'joints_3d_vis': joints_3d_vis,
                    'instance_id': instance_id,
                    'human_bbox': obj['clean_bbox']
                })
                instance_id += 1

        logging.warning("The number of training samples is {}".format(len(list_data_dict)))
        logging.warning("Formatting inputs...Skip in lazy mode")
        self.list_data_dict = list_data_dict
        self.multimodal_cfg = multimodal_cfg

        self.data_aug = False
        self.is_train = is_train

    def __len__(self):
        """Number of person instances (one sample per instance)."""
        return len(self.list_data_dict)

    def __getitem__(self, i):
        return self._parse_data_item_val(i)

    def _parse_data_item_val(self, i):
        """Assemble the evaluation sample dict for instance *i*."""
        sources = self.list_data_dict[i]
        image, joints, joints_vis, c, s, file_name, image_size = \
            self._get_pose_item(sources)
        return {
            'image': image,
            'image_id': sources['image_id'],
            'c': c,                      # crop center (source-image coords)
            's': s,                      # crop scale (pixel_std units)
            'joints': joints,
            'joints_vis': joints_vis,
            'file_name': file_name,
            'human_bbox': sources['human_bbox'],
            'image_size': image_size,    # original [h, w] before warping
        }

    def _get_pose_item(self, sources):
        """Load, optionally augment, and crop one person instance.

        Returns:
            (image, joints, joints_vis, center, scale, file_name, [h, w])
            where [h, w] is the original image size before warping.
        """
        file_name = sources['file_name']
        image_file = os.path.join(self.multimodal_cfg['image_folder'], file_name)
        image = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Record original size before warping; use dedicated names so the
        # center variable `c` below is not shadowed by the channel count.
        img_h, img_w = image.shape[:2]

        joints = sources['joints_3d']
        joints_vis = sources['joints_3d_vis']
        c = sources['center']
        s = sources['scale']
        r = 0

        if self.data_aug:
            # Scale / rotation jitter.
            sf = 0.3
            rf = 40
            s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
            r = random.uniform(-rf, rf) if random.random() <= 0.5 else 0

            # BUGFIX: the horizontal flip previously ran unconditionally,
            # which randomly mirrored *evaluation* samples as well; it is
            # now applied only when augmentation is enabled.
            flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
                          [11, 12], [13, 14], [15, 16]]
            if random.random() <= 0.5:
                image = image[:, ::-1, :]
                joints, joints_vis = fliplr_joints(
                    joints, joints_vis, image.shape[1], flip_pairs)
                c[0] = image.shape[1] - c[0] - 1

        # Warp the person box to a square crop of side self.size.
        trans = get_affine_transform(c, s, r, (int(self.size), int(self.size)))
        image = cv2.warpAffine(
            image,
            trans,
            (int(self.size), int(self.size)),
            flags=cv2.INTER_LINEAR)

        # Move annotated joints into crop coordinates.
        for j in range(self.num_joints):
            if joints_vis[j, 0] > 0.0:
                joints[j, 0:2] = affine_transform(joints[j, 0:2], trans)

        return image, joints, joints_vis, c, s, file_name, [img_h, img_w]

    def _box2cs(self, box):
        """Convert an xywh box to a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Center/scale from xywh, padding the box to self.aspect_ratio.

        The scale is expressed in units of ``self.pixel_std`` pixels,
        following the COCO/HRNet convention.
        """
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        # Pad the shorter side so the box matches the target aspect ratio.
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)
        return center, scale
202
+
203
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """Mirror keypoints horizontally and exchange left/right joint pairs.

    Arguments:
        joints: (K, 3) array of joint coordinates; modified in place.
        joints_vis: (K, 3) visibility mask; modified in place.
        width: image width used to mirror the x coordinate.
        matched_parts: list of [left_idx, right_idx] row pairs to swap.

    Returns:
        (joints * joints_vis, joints_vis) — the mask zeroes out the
        coordinates of unannotated joints.
    """
    # Mirror the x coordinate across the image.
    joints[:, 0] = width - joints[:, 0] - 1

    # Swap each left/right pair of rows in both arrays.
    for left, right in matched_parts:
        tmp = joints[left, :].copy()
        joints[left, :] = joints[right, :]
        joints[right, :] = tmp

        tmp_vis = joints_vis[left, :].copy()
        joints_vis[left, :] = joints_vis[right, :]
        joints_vis[right, :] = tmp_vis

    return joints * joints_vis, joints_vis
218
+
219
def transform_preds(coords, center, scale, output_size):
    """Map predicted coordinates from crop space back to the source image.

    Uses the inverse of the crop's affine transform; only the first two
    columns of each row (x, y) are transformed.
    """
    inverse = get_affine_transform(center, scale, 0, output_size, inv=1)
    mapped = np.zeros(coords.shape)
    for row in range(coords.shape[0]):
        mapped[row, 0:2] = affine_transform(coords[row, 0:2], inverse)
    return mapped
225
+
226
def get_affine_transform(
        center, scale, rot, output_size,
        shift=None, inv=0, pixel_std=200.0
):
    """Build the 2x3 affine matrix mapping a (center, scale, rot) person
    box onto an output crop of ``output_size``.

    Arguments:
        center: (x, y) box center in source-image pixels.
        scale: box size in units of ``pixel_std`` pixels; scalar or [w, h].
        rot: rotation in degrees.
        output_size: (width, height) of the destination crop.
        shift: relative shift of the source box as a fraction of its size;
            defaults to no shift. (Previously a mutable np.array default —
            replaced with None-sentinel; behavior unchanged.)
        inv: if nonzero, return the inverse transform (crop -> image).
        pixel_std: pixel unit of COCO-style scales; previously a hard-coded
            200.0, now parameterized with the same default.

    Returns:
        2x3 float matrix suitable for cv2.warpAffine.
    """
    if shift is None:
        shift = np.array([0, 0], dtype=np.float32)
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        # Scalar scale means a square box. (Leftover debug print removed.)
        scale = np.array([scale, scale])

    scale_tmp = scale * pixel_std
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    # Rotate the source "up" direction by the requested angle.
    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    # Three corresponding point pairs (center, top, and a perpendicular
    # third point) fully determine the affine map.
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans
259
+
260
+
261
def affine_transform(pt, t):
    """Apply a 2x3 affine matrix *t* to a 2D point *pt*; returns (x, y)."""
    homogeneous = np.array([pt[0], pt[1], 1.0])
    return (t @ homogeneous)[:2]
265
+
266
+
267
def get_3rd_point(a, b):
    """Return the point completing a right angle at *b* with segment a-b.

    Computed as *b* plus the vector (a - b) rotated 90 degrees CCW.
    """
    delta = a - b
    perpendicular = np.array([-delta[1], delta[0]], dtype=np.float32)
    return b + perpendicular
270
+
271
+
272
def get_dir(src_point, rot_rad):
    """Rotate 2D point *src_point* by *rot_rad* radians about the origin.

    Returns a two-element list [x', y'].
    """
    sin_r, cos_r = np.sin(rot_rad), np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    return [x * cos_r - y * sin_r, x * sin_r + y * cos_r]