"""
Dataset for clip model
"""
import logging
import torch
from torch.utils.data import Dataset
import numpy as np
import h5py
import math
import random
from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array
from utils.tensor_utils import pad_sequences_1d
from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface
from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \
    get_didemo_agreed_ts
from standalone_eval.eval import compute_temporal_iou_batch

logger = logging.getLogger(__name__)


class ProposalRetrievalDataset(Dataset):
    """
    Args:
        dset_name, str, ["tvr"]
        ctx_mode: str,
        pos_iou_thd: float, in [0, 1], moments with IoU >= pos_iou_thd are defined as positives
        neg_iou_thd: float, in [0, 1], moments with IoU < neg_iou_thd are defined as negatives
    Return:
        a dict: {
            "meta": {
                "desc_id": int,
                "desc": str,
                "vid_name": str,
                "duration": float,
                "ts": [st (float), ed (float)], seconds, ground-truth timestamps
                "pos_moment": [st (float), ed (float)], seconds, IoU with "ts" >= pos_iou_thd
                "intra_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd
                "inter_neg_vid_name": str,
                "inter_neg_duration": float,
                "inter_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd
            }
            "model_inputs": {
                "query_feat": torch.tensor, (L, D_q)
                "{pos, intra_neg, inter_neg}_moment_video_feat": torch.tensor, (n_clip_in_moment, D_video)
                "{pos, intra_neg, inter_neg}_moment_sub_feat": torch.tensor, (n_clip_in_moment, D_sub)
            }
        }
    """
    def __init__(self, dset_name, data_path, desc_bert_path, sub_bert_path, max_desc_len,
                 vid_feat_path, clip_length, vid_feat_size, sub_feat_size=0, ctx_mode="video_tef",
                 pos_iou_thd=0.7, neg_iou_thd=0.3, h5driver=None, data_ratio=1.0,
                 normalize_vfeat=True, normalize_tfeat=True, model_type="cal",
                 external_train_vr_res_path=None, corpus_path=None):
        self.dset_name = dset_name
        self.model_type = model_type
        self.pool_local = model_type == "mcn"  # MCN mean-pools the clips inside each moment
        self.data_path = data_path
        self.data_ratio = data_ratio

        self.desc_bert_path = desc_bert_path
        self.max_desc_len = max_desc_len
        self.sub_bert_path = sub_bert_path

        self.vid_feat_path = vid_feat_path
        self.clip_length = clip_length
        self.ctx_mode = ctx_mode

        self.pos_iou_thd = pos_iou_thd
        self.neg_iou_thd = neg_iou_thd

        # the moment-level feature is concatenated with the video-level context feature (hence the
        # factor of 2), plus 2 extra dims for the temporal endpoint feature (tef) when it is used
        self.vid_feat_output_size = 2 * vid_feat_size * ("video" in ctx_mode) + 2 * ("tef" in ctx_mode)
        self.sub_feat_output_size = 2 * sub_feat_size * ("sub" in ctx_mode) + 2 * ("tef" in ctx_mode)

        self.data = load_jsonl(data_path)
        if self.data_ratio != 1:
            n_examples = int(len(self.data) * data_ratio)
            self.data = self.data[:n_examples]
            logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))

        self.proposal_fn = get_proposal_interface(dset_name)
        if self.ctx_mode != "tef":
            self.vid_feat_h5 = h5py.File(self.vid_feat_path, "r", driver=h5driver)
        self.desc_bert_h5 = h5py.File(self.desc_bert_path, "r", driver=h5driver)
        if "sub" in self.ctx_mode:
            self.sub_bert_h5 = h5py.File(self.sub_bert_path, "r", driver=h5driver)
        self.normalize_vfeat = normalize_vfeat
        self.normalize_tfeat = normalize_tfeat
        self.use_video = "video" in self.ctx_mode
        self.use_sub = "sub" in self.ctx_mode
        self.use_tef = "tef" in self.ctx_mode

        if external_train_vr_res_path is not None:
            video_data = load_json(corpus_path)["train"]
            video_idx2name_dur_pair = {v[1]: [k, v[0]] for k, v in video_data.items()}
            external_vr_res = load_json(external_train_vr_res_path)
            self.desc_id2video_names_dur_pairs = \
                {e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])] for sub_e in e["predictions"]]
                 for e in external_vr_res["VR"]}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        raw_data = self.data[index]

        meta = dict(
            desc_id=raw_data["desc_id"],
            desc=raw_data["desc"],
            vid_name=raw_data["vid_name"],
            duration=raw_data["duration"],
            ts=raw_data["ts"] if self.dset_name != "didemo" else get_didemo_agreed_ts(raw_data["ts"]),
        )
        model_inputs = dict()
        query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len]
        if self.normalize_tfeat:
            query_feat = l2_normalize_np_array(query_feat)
        model_inputs["query_feat"] = torch.from_numpy(query_feat)

        # sample a clip-aligned positive moment, an intra-video negative and an inter-video negative
        meta["pos_moment"] = self.align_ts_to_clip_boundaries(meta["duration"], meta["ts"])
        meta["intra_neg_moment"] = self.sample_intra_neg_moment(meta["duration"], meta["ts"])
        meta["inter_neg_moment"], meta["inter_neg_vid_name"], meta["inter_neg_duration"] = \
            self.sample_inter_video_negative(meta["vid_name"], meta["pos_moment"] / meta["duration"],
                                             desc_id=meta["desc_id"])

        pos_tef, intra_neg_tef, inter_neg_tef = (None,) * 3
        if self.use_tef:
            pos_tef = meta["pos_moment"] / meta["duration"]
            intra_neg_tef = meta["intra_neg_moment"] / meta["duration"]
            inter_neg_tef = meta["inter_neg_moment"] / meta["inter_neg_duration"]

        if self.use_video:
            pos_v_feat = self.vid_feat_h5[meta["vid_name"]]
            neg_v_feat = self.vid_feat_h5[meta["inter_neg_vid_name"]]
            pos_v_ctx_feat = np.mean(pos_v_feat, axis=0)
            neg_v_ctx_feat = np.mean(neg_v_feat, axis=0)
            if self.normalize_vfeat:
                pos_v_ctx_feat = l2_normalize_np_array(pos_v_ctx_feat)
                neg_v_ctx_feat = l2_normalize_np_array(neg_v_ctx_feat)
            pos_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["pos_moment"],
                                                     normalize=self.normalize_vfeat,
                                                     fix_outbound=True, pool_local=self.pool_local)
            intra_neg_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["intra_neg_moment"],
                                                           normalize=self.normalize_vfeat,
                                                           fix_outbound=True, pool_local=self.pool_local)
            inter_neg_moment_v_feat = self.get_moment_feat(neg_v_feat, meta["inter_neg_moment"],
                                                           normalize=self.normalize_vfeat,
                                                           fix_outbound=True, pool_local=self.pool_local)

            model_inputs["pos_moment_video_feat"] = self.concat_feat_adv(
                moment_feats=[pos_moment_v_feat, pos_v_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode)
            model_inputs["intra_neg_moment_video_feat"] = self.concat_feat_adv(
                moment_feats=[intra_neg_moment_v_feat, pos_v_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode)
            model_inputs["inter_neg_moment_video_feat"] = self.concat_feat_adv(
                moment_feats=[inter_neg_moment_v_feat, neg_v_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode)
        else:
            for k in ["pos_moment_video_feat", "intra_neg_moment_video_feat", "inter_neg_moment_video_feat"]:
                model_inputs[k] = torch.zeros((2, 2))

        if self.use_sub:
            pos_s_feat = self.sub_bert_h5[meta["vid_name"]]
            neg_s_feat = self.sub_bert_h5[meta["inter_neg_vid_name"]]
            pos_s_ctx_feat = np.mean(pos_s_feat, axis=0)
            neg_s_ctx_feat = np.mean(neg_s_feat, axis=0)
            if self.normalize_tfeat:
                pos_s_ctx_feat = l2_normalize_np_array(pos_s_ctx_feat)
                neg_s_ctx_feat = l2_normalize_np_array(neg_s_ctx_feat)
            pos_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["pos_moment"],
                                                     normalize=self.normalize_tfeat,
                                                     fix_outbound=True, pool_local=self.pool_local)
            intra_neg_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["intra_neg_moment"],
                                                           normalize=self.normalize_tfeat,
                                                           fix_outbound=True, pool_local=self.pool_local)
            inter_neg_moment_s_feat = self.get_moment_feat(neg_s_feat, meta["inter_neg_moment"],
                                                           normalize=self.normalize_tfeat,
                                                           fix_outbound=True, pool_local=self.pool_local)

            model_inputs["pos_moment_sub_feat"] = self.concat_feat_adv(
                moment_feats=[pos_moment_s_feat, pos_s_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode)
            model_inputs["intra_neg_moment_sub_feat"] = self.concat_feat_adv(
                moment_feats=[intra_neg_moment_s_feat, pos_s_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode)
            model_inputs["inter_neg_moment_sub_feat"] = self.concat_feat_adv(
                moment_feats=[inter_neg_moment_s_feat, neg_s_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode)
        else:
            for k in ["pos_moment_sub_feat", "intra_neg_moment_sub_feat", "inter_neg_moment_sub_feat"]:
                model_inputs[k] = torch.zeros((2, 2))

        if not self.use_sub and not self.use_video and self.use_tef:
            model_inputs["pos_moment_video_feat"] = \
                self.concat_feat_adv(tef=pos_tef, ctx_mode=self.ctx_mode)
            model_inputs["intra_neg_moment_video_feat"] = \
                self.concat_feat_adv(tef=intra_neg_tef, ctx_mode=self.ctx_mode)
            model_inputs["inter_neg_moment_video_feat"] = \
                self.concat_feat_adv(tef=inter_neg_tef, ctx_mode=self.ctx_mode)
        return dict(meta=meta, model_inputs=model_inputs)

    def align_ts_to_clip_boundaries(self, duration, ts):
        """ # TODO Do we really need this???
        Generate the moment [st, ed] closest to the given ts that lies on clip boundaries:
        st and ed are multiples of self.clip_length, with ed further capped at duration.
        duration: float,
        ts: [st (float), ed (float)], ground-truth ts
        """
        clip_aligned_ts = np.array([math.floor(ts[0] / self.clip_length),
                                    math.ceil(ts[1] / self.clip_length)]) * self.clip_length
        clip_aligned_ts[1] = min(clip_aligned_ts[1], duration)
        return clip_aligned_ts
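
    # Worked example (assuming clip_length=1.5): ts=[2.2, 7.1], duration=7.4
    #   floor(2.2 / 1.5) = 1 and ceil(7.1 / 1.5) = 5  ->  [1, 5] * 1.5 = [1.5, 7.5],
    #   then ed is capped at duration: min(7.5, 7.4) = 7.4, giving [1.5, 7.4].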

    def sample_intra_neg_moment(self, duration, ts):
        """ Generate an intra-video negative moment given the video duration and the GT ts.
        The returned moment will be aligned to clip boundaries.
        1) neg_moment has at least 2 clips
        2) its IoU with ts should be < self.neg_iou_thd
        Args:
            duration: float
            ts: [st (float), ed (float)], ground-truth ts

        Returns:
            np.ndarray, (2, ), the candidate with the smallest IoU with ts among `max_n_search`
            randomly sampled clip-aligned moments.
        """
        max_n_search = 5
        sampled_moments = self.sample_ts_at_clip_boundaries(duration, n_pairs=max_n_search)
        sampled_moments_ious = compute_temporal_iou_batch(sampled_moments, ts)
        smallest_iou_idx = np.argmin(sampled_moments_ious)
        sampled_moment = sampled_moments[smallest_iou_idx]
        return sampled_moment

    def sample_ts_at_clip_boundaries(self, duration, n_pairs=1):
        """Sample n_pairs moments at clip boundaries, each with at least two clips."""
        moments = np.random.randint(0, np.ceil(duration / self.clip_length), size=(n_pairs, 2))
        moments = np.sort(moments, axis=1) * self.clip_length
        # fix degenerate moments (length <= one clip): extend forward when they start at 0,
        # otherwise extend backward
        less_equal = moments[:, 1] - moments[:, 0] <= self.clip_length
        start_zero = moments[:, 0] == 0
        moments[:, 1][less_equal & start_zero] += self.clip_length
        moments[:, 0][less_equal & ~start_zero] -= self.clip_length
        return moments
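
    # Illustration (taking clip_length=1.5 as an example): a degenerate sample [3.0, 3.0] is
    # extended backward to [1.5, 3.0], while [0.0, 0.0] cannot extend backward and becomes [0.0, 1.5].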

    def sample_inter_video_negative(self, pos_vid_name, normalized_pos_moment, desc_id=None):
        """Sample a negative moment --> negative video + similar normalized moment.
        1) the sampled video must differ from pos_vid_name
        Args:
            pos_vid_name: str,
            normalized_pos_moment: np.ndarray, (2, ), value in [0, 1], normalized by duration.
            desc_id: str
        Returns:
            inter_neg_moment: np.ndarray, (2, ), ts aligned to clip boundaries.
            sampled_video_name: str
            sampled_video_dur: float
        """
        use_guided_negative = hasattr(self, "desc_id2video_names_dur_pairs")
        if use_guided_negative:
            top_videos = self.desc_id2video_names_dur_pairs[desc_id]
            max_idx = len(top_videos) - 1

        while True:
            if use_guided_negative:
                sampled_idx = min(max_idx, int(random.expovariate(0.1)))
                sampled_video_name, sampled_video_dur = top_videos[sampled_idx]
            else:
                neg_vid_data = self.data[int(random.random() * len(self))]
                sampled_video_name, sampled_video_dur = neg_vid_data["vid_name"], neg_vid_data["duration"]
            if sampled_video_name != pos_vid_name:
                inter_neg_moment = self.align_ts_to_clip_boundaries(
                    sampled_video_dur, sampled_video_dur * normalized_pos_moment)
                break

        return inter_neg_moment, sampled_video_name, sampled_video_dur

    @classmethod
    def get_clip_indices_from_moments(cls, moment, clip_length):
        clip_st_ed_indices = moment / clip_length
        return math.floor(clip_st_ed_indices[0]), math.ceil(clip_st_ed_indices[1])
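
    # Illustration (taking clip_length=1.5 as an example): moment=[1.5, 7.4] maps to clip
    # indices (floor(1.0), ceil(4.93)) = (1, 5), i.e. feature rows vid_feat[1:5].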

    def get_moment_feat(self, vid_feat, moment, normalize=True, fix_outbound=False, pool_local=False):
        """Each moment contains multiple clips.
        Inside means [moment[0], moment[1]] (seconds)
        Args:
            vid_feat: np.ndarray, (N_clips, D)
            moment: [st (float), ed (float)], np.ndarray
            normalize: L2 normalize features
            fix_outbound: bool, pull the start clip index back inside the video when it falls
                beyond the last clip
            pool_local: whether to mean-pool the clip features into a single vector
        Returns:
            moment_feature: np.ndarray, ((moment[1] - moment[0]) / clip_length, D),
                or (1, D) when pool_local is True
        """
        clip_st_idx, clip_ed_idx = self.get_clip_indices_from_moments(moment, self.clip_length)
        if fix_outbound:
            vid_feat_len = len(vid_feat)
            if clip_st_idx >= vid_feat_len:
                clip_st_idx = vid_feat_len - 2
        moment_feat = vid_feat[clip_st_idx:clip_ed_idx]
        if pool_local:
            moment_feat = np.mean(moment_feat, axis=0, keepdims=True)
        if normalize:
            moment_feat = l2_normalize_np_array(moment_feat)
        return moment_feat

    @classmethod
    def concat_feat_adv(cls, moment_feats=None, tef=None, to_torch=True, ctx_mode="tef"):
        """ Concat moment_feat with other_feats and tef. All features should be L2-normalized
        before concatenation.
        Args:
            moment_feats: list of feats, one of them might be None. Other possible values are
                ctx_feat (D, ) or sub(vid)_moment_feat (N_p, N_clips, D_t) or (N_clips, D_t).
                The first non-None feature array is used as the base that the rest are concatenated
                with; lower-dimensional entries are broadcast along its leading dims.
            tef: (N_p, 2) or (2, ), np.ndarray
            to_torch: convert the resulting np.ndarray to torch.tensor
            ctx_mode:
        """
        if ctx_mode == "tef":
            assembled_feat = np.expand_dims(tef, axis=-2)
        else:
            moment_feats = [e for e in moment_feats if e is not None]
            extra_dims = moment_feats[0].shape[:-1]
            if isinstance(extra_dims, int):
                extra_dims = (extra_dims, )
            last_dim_lengths = [0, ] + [e.shape[-1] for e in moment_feats]
            if "tef" in ctx_mode:
                last_dim_lengths += [2, ]
                moment_feats += [np.expand_dims(tef, axis=-2), ]

            if len(moment_feats) > 1:
                assembled_feat = np.empty(extra_dims + (sum(last_dim_lengths), ), dtype=np.float32)
                last_dim_lengths_cumsum = [sum(last_dim_lengths[0:idx+1]) for idx in range(len(last_dim_lengths))]
                for idx, feat in enumerate(moment_feats):
                    assembled_feat[..., last_dim_lengths_cumsum[idx]:last_dim_lengths_cumsum[idx+1]] = feat
            else:
                assembled_feat = moment_feats[0]

        if to_torch:
            return torch.from_numpy(assembled_feat)
        else:
            return assembled_feat
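
    # The layout along the last dim is [moment_feat | ctx_feat | tef]; e.g. for ctx_mode="video_tef"
    # with moment_feats=[(N_clips, D), (D, )] and tef (2, ), the result is (N_clips, 2 * D + 2),
    # matching self.vid_feat_output_size defined in __init__.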


class ProposalRetrievalEvalDataset(Dataset):
    """
    init_data_mode: `video_query` or `video_only` or `query_only`,
        it indicates which data to load when initializing the Dataset object.
    data_mode: `context` or `query`, it indicates which data to return in self.__getitem__().
    desc_bert_path_or_handler: h5py.File object or str path
    vid_feat_path_or_handler: h5py.File object or str path
    eval_proposal_bsz: the proposals for a single video are sorted by length and batched here,
        with at most eval_proposal_bsz proposals per batch. A single video might have multiple
        batches of proposals.
    load_gt_video: load ground-truth video, useful when evaluating single-video moment retrieval.
    data_ratio: percentage of query data to use.
    """
    def __init__(self, dset_name, eval_split_name, data_path=None,
                 desc_bert_path_or_handler=None, max_desc_len=None,
                 sub_bert_path_or_handler=None, vid_feat_path_or_handler=None,
                 corpus_path=None, clip_length=None,
                 eval_proposal_bsz=None, ctx_mode="tef", data_mode="context",
                 h5driver=None, data_ratio=1.0, normalize_vfeat=True,
                 normalize_tfeat=True, max_n_proposals=90, model_type="cal"):
        self.dset_name = dset_name
        self.model_type = model_type
        self.pool_local = model_type == "mcn"
        self.eval_split_name = eval_split_name
        self.ctx_mode = ctx_mode
        self.load_gt_video = False
        self.data_ratio = data_ratio
        self.normalize_vfeat = normalize_vfeat
        self.normalize_tfeat = normalize_tfeat
        self.max_n_proposals = max_n_proposals

        self.data_mode = None
        self.set_data_mode(data_mode)

        self.max_desc_len = max_desc_len
        self.data_path = data_path
        self.query_data = load_jsonl(data_path)
        if data_ratio != 1:
            n_examples = int(len(self.query_data) * data_ratio)
            self.query_data = self.query_data[:n_examples]
            logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))
        if isinstance(desc_bert_path_or_handler, h5py.File):
            self.desc_bert_h5 = desc_bert_path_or_handler
        else:
            self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver)

        video_data = load_json(corpus_path)[self.eval_split_name]
        self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()]
        self.video2idx = {k: v[1] for k, v in video_data.items()}
        self.eval_proposal_bsz = eval_proposal_bsz
        self.clip_length = clip_length
        self.proposal_fn = get_proposal_interface(dset_name)

        self.use_video = "video" in self.ctx_mode
        self.use_sub = "sub" in self.ctx_mode
        self.use_tef = "tef" in self.ctx_mode

        if self.use_video:
            if isinstance(vid_feat_path_or_handler, h5py.File):
                self.vid_feat_h5 = vid_feat_path_or_handler
            else:
                self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver)

        if self.use_sub:
            if isinstance(sub_bert_path_or_handler, h5py.File):
                self.sub_bert_h5 = sub_bert_path_or_handler
            else:
                self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver)

    def set_data_mode(self, data_mode):
        """context or query"""
        assert data_mode in ["context", "query"]
        self.data_mode = data_mode

    def load_gt_vid_name_for_query(self, load_gt_video):
        """load_gt_video: bool, affects the returned value of self._get_item_query"""
        assert "vid_name" in self.query_data[0]
        self.load_gt_video = load_gt_video

    def __len__(self):
        if self.data_mode == "context":
            return len(self.video_data)
        else:
            return len(self.query_data)

    def __getitem__(self, index):
        if self.data_mode == "context":
            return self._get_item_context(index)
        else:
            return self._get_item_query(index)

    def _get_item_query(self, index):
        """Need to batch"""
        raw_data = self.query_data[index]

        meta = dict(
            desc_id=raw_data["desc_id"],
            desc=raw_data["desc"],
            vid_name=raw_data["vid_name"] if self.load_gt_video else None
        )

        model_inputs = dict()
        query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len]
        if self.normalize_tfeat:
            query_feat = l2_normalize_np_array(query_feat)
        model_inputs["query_feat"] = torch.from_numpy(query_feat)
        return dict(meta=meta, model_inputs=model_inputs)

    def _get_item_context(self, index):
        """No need to batch, since it has already been batched here"""
        raw_data = self.video_data[index]

        proposals = self.proposal_fn(
            video_id="", metadata={"duration": raw_data["duration"]})
        proposals_lengths = proposals[:, 1] - proposals[:, 0]
        sorted_proposal_indices = np.argsort(proposals_lengths)[:self.max_n_proposals]
        sorted_proposals = proposals[sorted_proposal_indices]

        meta = dict(
            vid_name=raw_data["vid_name"],
            duration=raw_data["duration"],
            proposals=sorted_proposals
        )
        model_inputs = dict()

        n_proposal_batches = math.ceil(1.0 * len(sorted_proposals) / self.eval_proposal_bsz)

        tef_batched_list = [None, ] * n_proposal_batches
        t_moments_mask_list = [None, ] * n_proposal_batches
        if self.use_tef:
            tef_array = sorted_proposals / meta["duration"]
            for batch_idx in range(n_proposal_batches):
                st_m_idx = batch_idx * self.eval_proposal_bsz
                ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz
                tef_batched_list[batch_idx] = tef_array[st_m_idx:ed_m_idx]
                t_moments_mask_list[batch_idx] = \
                    np.ones((len(tef_batched_list[batch_idx]), 1), dtype=np.float32)
            if not self.use_video and not self.use_sub:
                model_inputs["video_moment_features_list"] = [
                    ProposalRetrievalDataset.concat_feat_adv(tef=t, ctx_mode=self.ctx_mode) for t in tef_batched_list]
                model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in t_moments_mask_list]

        if self.use_video:
            v_feat = self.vid_feat_h5[meta["vid_name"]]
            v_ctx_feat = np.mean(v_feat, axis=0)
            if self.normalize_vfeat:
                v_ctx_feat = l2_normalize_np_array(v_ctx_feat)
            v_padded_moments_features_list, v_moments_mask_list = \
                self.get_batched_moment_feat_for_all_proposals(v_feat, sorted_proposals,
                                                               pool_local=self.pool_local,
                                                               normalize=self.normalize_vfeat)

            model_inputs["video_moment_features_list"] = [ProposalRetrievalDataset.concat_feat_adv(
                moment_feats=[v, v_ctx_feat], tef=t, ctx_mode=self.ctx_mode)
                for v, t in zip(v_padded_moments_features_list, tef_batched_list)]
            model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in v_moments_mask_list]

        if self.use_sub:
            s_feat = self.sub_bert_h5[meta["vid_name"]]
            s_ctx_feat = np.mean(s_feat, axis=0)
            if self.normalize_tfeat:
                s_ctx_feat = l2_normalize_np_array(s_ctx_feat)
            s_padded_moments_features_list, s_moments_mask_list = \
                self.get_batched_moment_feat_for_all_proposals(s_feat, sorted_proposals,
                                                               pool_local=self.pool_local,
                                                               normalize=self.normalize_tfeat)
            model_inputs["sub_moment_features_list"] = [ProposalRetrievalDataset.concat_feat_adv(
                moment_feats=[s, s_ctx_feat], tef=t, ctx_mode=self.ctx_mode)
                for s, t in zip(s_padded_moments_features_list, tef_batched_list)]
            model_inputs["sub_moment_mask_list"] = [torch.from_numpy(e) for e in s_moments_mask_list]
        return dict(meta=meta, model_inputs=model_inputs)

    def get_batched_moment_feat_for_all_proposals(self, feature, moments, pool_local=False, normalize=True):
        """Proposals of the same video will be split into multiple batches to accommodate GPU memory.
        pool_local: pool local features into a single vector
        """
        n_proposal_batches = math.ceil(1.0 * len(moments) / self.eval_proposal_bsz)
        padded_moments_features_list = [None, ] * n_proposal_batches
        moments_mask_list = [None, ] * n_proposal_batches
        moments_features = self.get_moment_feat_for_all_proposals(
            feature, moments, normalize=normalize, pool_local=pool_local)
        for batch_idx in range(n_proposal_batches):
            st_m_idx = batch_idx * self.eval_proposal_bsz
            ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz
            padded_moments_features, moments_mask = \
                pad_sequences_1d(moments_features[st_m_idx:ed_m_idx], dtype=np.float32)
            padded_moments_features_list[batch_idx] = padded_moments_features
            moments_mask_list[batch_idx] = moments_mask
            assert np.sum(np.sum(moments_mask, axis=1) == 0) == 0, " err {}".format(moments_mask)
        assert np.sum(np.sum(moments_mask_list[0], axis=1) == 0) == 0, " err {}".format(moments_mask_list)
        return padded_moments_features_list, moments_mask_list

    def get_moment_feat_for_all_proposals(self, vid_feat, moments, normalize=True, pool_local=False):
        """Each moment comprises multiple clips.
        Args:
            vid_feat: np.ndarray, (N_clips, D)
            moments: np.ndarray, (N_p, 2), each row is [st (float), ed (float)],
            normalize: L2 normalize
            pool_local:
        Returns:
            moments_features: list(np.ndarray), [(N_clips, D), ] * N_p, N_clips varies per moment.
        """
        if normalize and not pool_local:
            vid_feat = l2_normalize_np_array(vid_feat)
        vid_feat_len = len(vid_feat)
        # clamp the clip indices so that every proposal maps to at least one valid clip
        moments_st_clip_indices = np.floor(moments[:, 0] / self.clip_length).astype(np.int64).clip(0, vid_feat_len-1)
        moments_ed_clip_indices = np.ceil(moments[:, 1] / self.clip_length).astype(np.int64).clip(1, vid_feat_len)
        moments_features = []
        for st_idx, ed_idx, m in zip(moments_st_clip_indices, moments_ed_clip_indices, moments):
            feat = vid_feat[st_idx:ed_idx]
            if pool_local:
                feat = np.mean(feat, axis=0, keepdims=True)
                if normalize:
                    feat = l2_normalize_np_array(feat)
            moments_features.append(feat)
        return moments_features


def proposal_retrieval_collate(batch):
    batch_meta = [e["meta"] for e in batch]

    model_inputs_keys = batch[0]["model_inputs"].keys()
    batched_data = {k: pad_sequences_1d([e["model_inputs"][k] for e in batch], dtype=torch.float32)
                    for k in model_inputs_keys}
    return batch_meta, batched_data
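
# Each value in batched_data is the (padded_tensor, mask) pair returned by pad_sequences_1d;
# prepare_batch_inputs below unpacks it into separate "*_feat" and "*_mask" entries.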


def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False):
    model_inputs = {}
    for k, v in batched_model_inputs.items():
        model_inputs[k] = v[0].to(device, non_blocking=non_blocking)
        model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking)
    return model_inputs
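
# E.g., the collated "query_feat" entry becomes model_inputs["query_feat"] (padded features)
# and model_inputs["query_mask"] (its mask), both moved to `device`.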


if __name__ == '__main__':
    from baselines.clip_alignment_with_language.config import BaseOptions
    options = BaseOptions().parse()
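
    # Minimal sanity-check sketch (not part of the original training/eval flow): feed random
    # features through concat_feat_adv to verify the assembled feature size. The shapes below
    # (5 clips, 16-dim video features) are illustrative assumptions, not values from the config.
    _moment_feat = np.random.randn(5, 16).astype(np.float32)  # (N_clips, D) moment-level feature
    _ctx_feat = np.random.randn(16).astype(np.float32)        # (D, ) video-level context feature
    _tef = np.array([0.1, 0.5], dtype=np.float32)             # normalized [st, ed]
    _assembled = ProposalRetrievalDataset.concat_feat_adv(
        moment_feats=[_moment_feat, _ctx_feat], tef=_tef, ctx_mode="video_tef")
    print("assembled moment feature shape:", tuple(_assembled.shape))  # expected: (5, 34) = (5, 2*16+2)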