# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import json
import pickle
import os

import numpy as np

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the module usable without it.
    def tqdm(iterable, *args, **kwargs):
        """Fallback for a missing tqdm: return the iterable unchanged."""
        return iterable


class CaptionDedupProcessor(object):
    """remove overlapping of caption sentences(clip).
    Some statistics:
    caption:
    {'t_clip_len': 246.6448431320854,
    'video_len': 281.09174795676245,
    'clip_tps': 0.8841283727427481,
    'video_tps': 0.7821156477732097,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.196580003006861,
    'num_clip': 77.15897706301081}

    raw_caption:
    {'t_clip_len': 238.95908778424115,
    'video_len': 267.5914859862507,
    'clip_tps': 2.4941363624267963,
    'video_tps': 2.258989769647173,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.0537954186814265,
    'num_clip': 78.24986779481756}

    `self.data` maps a video id to a JSON string encoding a dict with the
    parallel lists "start", "end" and "text". Calling the instance rewrites
    every entry in place with its de-duplicated version.
    """

    # Default directory searched for `<video_id>.npy` feature arrays; the
    # first dimension of each array is used as the video length.
    feat_dir = "data/feat/feat_how2_s3d"

    # Statistics are only gathered for the first `max_stat_videos` entries.
    max_stat_videos = 4096

    def __init__(self, pkl_file, feat_dir=None):
        """Load the pickled caption dict.

        Args:
            pkl_file: path to a pickle holding `{video_id: json_string}`.
            feat_dir: optional override for the feature directory used by
                `save_stat`; defaults to the class-level `feat_dir`.
        """
        with open(pkl_file, "rb") as fd:
            # NOTE: unpickling can execute arbitrary code; only pass files
            # that come from a trusted source.
            self.data = pickle.load(fd)
        if feat_dir is not None:
            self.feat_dir = feat_dir
        self.stat = {
            "t_clip_len": [],
            "video_len": [],
            "clip_tps": [],
            "video_tps": [],
            "clip_len": [],
        }

    def __call__(self):
        """De-duplicate every caption in `self.data` and print statistics."""
        for idx, video_id in enumerate(tqdm(self.data)):
            caption = self._dedup(json.loads(self.data[video_id]))
            if idx < self.max_stat_videos:
                # only the first examples contribute to the statistics.
                self.save_stat(video_id, caption)
            # Values are replaced but no key is added or removed, so
            # mutating the dict while iterating over it is safe.
            self.data[video_id] = json.dumps(caption)
        self.print_stat()

    def single(self, video_id):
        """Debug helper: print one caption before and after de-duplication.

        `self.data` is not modified.
        """
        caption = json.loads(self.data[video_id])
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)
        print("@" * 100)
        caption = self._dedup(caption)
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)
        print("#" * 100)
        self.save_stat(video_id, caption)
        self.print_stat()

    def finalize(self, tgt_fn):
        """Pickle `self.data` to `tgt_fn`."""
        with open(tgt_fn, "wb") as fw:
            pickle.dump(self.data, fw, pickle.HIGHEST_PROTOCOL)

    def save_stat(self, video_id, caption):
        """Accumulate length / token-rate statistics for one caption.

        Nothing is recorded when the feature file of `video_id` is missing,
        or when the video length or the total clip length is zero (the
        token-per-length ratios would be undefined).
        """
        video_fn = os.path.join(self.feat_dir, video_id + ".npy")
        if not os.path.isfile(video_fn):
            return

        # Memory-mapping parses the .npy header only; the array data is not
        # read just to learn the shape.
        video_len = np.load(video_fn, mmap_mode="r").shape[0]

        clip_lens = []
        t_clip_len = 0.0
        t_tokens = 0
        for idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            if idx > 0:
                # do not count time already covered by the previous clip.
                clip_len = end - max(caption["end"][idx - 1], start)
            else:
                clip_len = end - start
            clip_lens.append(clip_len)
            t_clip_len += clip_len
            t_tokens += len(text.split(" "))

        if not video_len or not t_clip_len:
            # Skip the whole video so that all stat lists stay aligned.
            return

        self.stat["clip_len"].extend(clip_lens)
        self.stat["t_clip_len"].append(t_clip_len)
        self.stat["video_len"].append(video_len)
        self.stat["clip_tps"].append(t_tokens / t_clip_len)
        self.stat["video_tps"].append(t_tokens / video_len)

    def print_stat(self):
        """Print aggregated statistics, or a short notice if none exist."""
        if not self.stat["video_tps"] or not self.stat["clip_len"]:
            # Without this guard min()/max() and the division below raise,
            # which would abort __call__ before results can be saved.
            print("no statistics collected.")
            return
        result = {
            "t_clip_len": np.mean(self.stat["t_clip_len"]),
            "video_len": np.mean(self.stat["video_len"]),
            "clip_tps": np.mean(self.stat["clip_tps"]),
            "video_tps": np.mean(self.stat["video_tps"]),
            "min_clip_len": min(self.stat["clip_len"]),
            "max_clip_len": max(self.stat["clip_len"]),
            "mean_clip_len": np.mean(self.stat["clip_len"]),
            "num_clip": len(self.stat["clip_len"]) / len(self.stat["video_tps"]),
        }
        print(result)

    @staticmethod
    def _random_merge(end_idx, start, end, text, starts, ends, texts):
        """Resolve a partial overlap between the previous and current clip.

        The first `end_idx` characters of `text` equal the tail of
        `texts[-1]`. With equal probability the shared part is kept in the
        previous clip or in the current one. `starts`, `ends` and `texts`
        are modified in place.
        """
        if random.random() > 0.5:
            # overlapped part goes to the end of previous.
            ends[-1] = max(ends[-1], start)
            rest_text = text[end_idx:].strip()
            if rest_text:
                starts.append(max(ends[-1], start))
                ends.append(max(end, starts[-1]))
                texts.append(rest_text)
        else:
            # overlapped part goes to the beginning of the current:
            # strip it from the previous clip.
            left_text = texts[-1][:-end_idx].strip()
            if left_text:
                ends[-1] = min(ends[-1], start)
                texts[-1] = left_text
            else:
                # nothing is left of the previous clip; drop it.
                starts.pop(-1)
                ends.pop(-1)
                texts.pop(-1)
            starts.append(start)
            ends.append(end)
            texts.append(text)

    def _dedup(self, caption):
        """Return `caption` with overlapping clip texts merged.

        Args:
            caption: dict with parallel lists "start", "end" and "text".

        Returns:
            A new dict with the same three keys. Clips whose text is not a
            string, or is empty after newline replacement and stripping,
            are dropped. An empty caption yields three empty lists.

        Raises:
            AssertionError: if a merge leaves the last clip with an end
                more than 0.001 before its start, or with empty text.
        """
        starts, ends, texts = [], [], []
        for clip_idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            if not isinstance(text, str):
                continue
            text = text.replace("\n", " ").strip()
            if not text:
                continue

            if not texts:
                # first usable clip: nothing to compare against yet.
                starts.append(start)
                ends.append(end)
                texts.append(text)
                continue

            if texts[-1].endswith(text):
                # subset of prev caption -> merge
                ends[-1] = max(ends[-1], end)
            elif text.startswith(texts[-1]):
                # superset of prev caption -> merge
                texts[-1] = text
                starts[-1] = min(starts[-1], start)
                ends[-1] = max(ends[-1], end)
            else:
                # overlapping or non-overlapping: look for the shortest
                # non-empty prefix of `text` that ends the previous text.
                for end_idx in range(1, len(text) + 1):
                    if texts[-1].endswith(text[:end_idx]):
                        self._random_merge(
                            end_idx, start, end, text, starts, ends, texts
                        )
                        break
                else:
                    starts.append(start)
                    ends.append(end)
                    texts.append(text)

            # `clip_idx` indexes the raw caption, so `clip_idx - 1` is the
            # raw clip immediately preceding the current one.
            assert (ends[-1] + 0.001) >= starts[-1] and len(
                texts[-1]
            ) > 0, "{} {} {} <- {} {} {}, {} {} {}".format(
                str(starts[-1]),
                str(ends[-1]),
                texts[-1],
                caption["start"][clip_idx - 1],
                caption["end"][clip_idx - 1],
                caption["text"][clip_idx - 1],
                str(start),
                str(end),
                text,
            )

        return {"start": starts, "end": ends, "text": texts}


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="dedup how2 caption")
    parser.add_argument('--how2dir', default="data/how2")
    args = parser.parse_args()

    raw_caption_json = os.path.join(args.how2dir, "raw_caption.json")
    raw_caption_pickle = os.path.join(args.how2dir, "raw_caption.pkl")
    raw_caption_dedup_pickle = os.path.join(args.how2dir, "raw_caption_dedup.pkl")

    def convert_to_pickle(src_fn, tgt_fn):
        """Re-encode a JSON caption file as a pickle of JSON strings."""
        with open(src_fn) as fd:
            captions = json.load(fd)

        for video_id in captions:
            captions[video_id] = json.dumps(captions[video_id])

        with open(tgt_fn, "wb") as fw:
            pickle.dump(captions, fw, pickle.HIGHEST_PROTOCOL)

    # The pickle is built once and reused on later runs.
    if not os.path.isfile(raw_caption_pickle):
        convert_to_pickle(raw_caption_json, raw_caption_pickle)

    deduper = CaptionDedupProcessor(raw_caption_pickle)
    deduper()
    deduper.finalize(raw_caption_dedup_pickle)

    """
    # demo
    deduper = CaptionDedupProcessor("data/how2/raw_caption.pkl")
    deduper.single("HfIeQ9pzL5U")
    """