|
|
|
|
|
|
|
|
|
|
|
import random |
|
import json |
|
import pickle |
|
from tqdm import tqdm |
|
import os |
|
import numpy as np |
|
|
|
|
|
class CaptionDedupProcessor(object):
    """Remove overlapping of caption sentences (clips).

    Some statistics:
    caption:
    {'t_clip_len': 246.6448431320854,
    'video_len': 281.09174795676245,
    'clip_tps': 0.8841283727427481,
    'video_tps': 0.7821156477732097,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.196580003006861,
    'num_clip': 77.15897706301081}

    raw_caption:
    {'t_clip_len': 238.95908778424115,
    'video_len': 267.5914859862507,
    'clip_tps': 2.4941363624267963,
    'video_tps': 2.258989769647173,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.0537954186814265,
    'num_clip': 78.24986779481756}
    """

    def __init__(self, pkl_file):
        """Load a pickled ``{video_id: json-encoded caption}`` mapping.

        NOTE(review): ``pickle.load`` is unsafe on untrusted files; this
        assumes the pkl was produced locally (see ``convert_to_pickle``).
        """
        with open(pkl_file, "rb") as fd:
            self.data = pickle.load(fd)
        # Accumulators used by save_stat/print_stat. All lists are
        # per-video except "clip_len", which has one entry per clip.
        self.stat = {
            "t_clip_len": [],  # total non-overlapping clip seconds
            "video_len": [],   # video length in feature frames
            "clip_tps": [],    # tokens per second over clip time
            "video_tps": [],   # tokens per second over video time
            "clip_len": [],    # individual clip lengths
        }

    def __call__(self):
        """Dedup every caption in-place; collect stats on the first 4096 videos."""
        for idx, video_id in enumerate(tqdm(self.data)):
            caption = json.loads(self.data[video_id])
            caption = self._dedup(caption)
            if idx < 4096:  # cap stat collection to keep the pass cheap
                self.save_stat(video_id, caption)
            self.data[video_id] = json.dumps(caption)
        self.print_stat()

    def single(self, video_id):
        """Debug helper: print one video's clips before and after dedup."""
        caption = json.loads(self.data[video_id])
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)
        print("@" * 100)
        caption = self._dedup(caption)
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)
        print("#" * 100)
        self.save_stat(video_id, caption)
        self.print_stat()

    def finalize(self, tgt_fn):
        """Write the (deduped) mapping back out as a pickle file."""
        with open(tgt_fn, "wb") as fw:
            pickle.dump(self.data, fw, pickle.HIGHEST_PROTOCOL)

    def save_stat(self, video_id, caption):
        """Accumulate length/token statistics for one caption.

        Videos whose S3D feature file is missing are skipped entirely so
        the per-video stat lists stay aligned.
        """
        video_fn = os.path.join(
            "data/feat/feat_how2_s3d", video_id + ".npy"
        )
        if not os.path.isfile(video_fn):
            # BUG FIX: the original fell through here and then read an
            # undefined `video_len`, raising NameError for missing features.
            return
        # mmap_mode="r" parses only the .npy header to obtain the shape.
        # This replaces the private np.lib.format._read_array_header API
        # (and the invalid buffering=1 binary open) used before.
        video_len = np.load(video_fn, mmap_mode="r").shape[0]

        t_clip_len = 0.0
        t_tokens = 0
        for idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            # Count only the part of each clip not covered by its predecessor.
            clip_len = (
                (end - max(caption["end"][idx - 1], start))
                if idx > 0
                else end - start
            )
            t_clip_len += clip_len
            t_tokens += len(text.split(" "))
            self.stat["clip_len"].append(clip_len)
        self.stat["t_clip_len"].append(t_clip_len)
        self.stat["video_len"].append(video_len)
        # Guard zero denominators (previously a ZeroDivisionError).
        self.stat["clip_tps"].append(t_tokens / t_clip_len if t_clip_len else 0.0)
        self.stat["video_tps"].append(t_tokens / video_len if video_len else 0.0)

    def print_stat(self):
        """Print the aggregated statistics collected by save_stat."""
        result = {
            "t_clip_len": np.mean(self.stat["t_clip_len"]),
            "video_len": np.mean(self.stat["video_len"]),
            "clip_tps": np.mean(self.stat["clip_tps"]),
            "video_tps": np.mean(self.stat["video_tps"]),
            "min_clip_len": min(self.stat["clip_len"]),
            "max_clip_len": max(self.stat["clip_len"]),
            "mean_clip_len": np.mean(self.stat["clip_len"]),
            "num_clip": len(self.stat["clip_len"]) / len(self.stat["video_tps"]),
        }
        print(result)

    def _dedup(self, caption):
        """Merge/trim consecutive clips whose texts overlap.

        ``caption`` is a dict of parallel lists "start", "end", "text".
        Returns a new dict of the same shape with overlaps removed.
        Non-string and whitespace-only texts are dropped.
        """

        def random_merge(end_idx, start, end, text, starts, ends, texts):
            # text[:end_idx] is a prefix of the current text that the
            # previous text ends with; resolve the overlap randomly.
            if random.random() > 0.5:
                # Keep the overlap in the previous clip; append only the
                # remainder of the current text (if any) as a new clip.
                ends[-1] = max(ends[-1], start)
                rest_text = text[end_idx:].strip()
                if rest_text:
                    starts.append(max(ends[-1], start))
                    ends.append(max(end, starts[-1]))
                    texts.append(rest_text)
            else:
                # Move the overlap into the current clip; trim the previous
                # one, or drop it entirely if nothing remains.
                left_text = texts[-1][:-end_idx].strip()
                if left_text:
                    ends[-1] = min(ends[-1], start)
                    texts[-1] = left_text
                else:
                    starts.pop(-1)
                    ends.pop(-1)
                    texts.pop(-1)
                starts.append(start)
                ends.append(end)
                texts.append(text)

        starts, ends, texts = [], [], []
        # Seed the output with the first usable (string, non-empty) clip.
        first_idx = None
        for idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            if not isinstance(text, str):
                continue
            text = text.replace("\n", " ").strip()
            if len(text) == 0:
                continue
            starts.append(start)
            ends.append(end)
            texts.append(text)
            first_idx = idx
            break
        if first_idx is None:
            # No usable clip. BUG FIX: an entirely empty caption previously
            # hit an unbound loop variable (NameError).
            return {"start": starts, "end": ends, "text": texts}

        for clip_idx, (start, end, text) in enumerate(
            zip(
                caption["start"][first_idx + 1:],
                caption["end"][first_idx + 1:],
                caption["text"][first_idx + 1:],
            )
        ):
            if not isinstance(text, str):
                continue
            text = text.replace("\n", " ").strip()
            if len(text) == 0:
                continue

            if texts[-1].endswith(text):
                # Current text fully repeats the previous one: just extend.
                ends[-1] = max(ends[-1], end)
            elif text.startswith(texts[-1]):
                # Previous text is a prefix of the current one: replace it.
                texts[-1] = text
                starts[-1] = min(starts[-1], start)
                ends[-1] = max(ends[-1], end)
            else:
                # Partial overlap: find the shortest prefix of `text` that
                # the previous text ends with and resolve it randomly.
                for end_idx in range(1, len(text) + 1):
                    if texts[-1].endswith(text[:end_idx]):
                        random_merge(end_idx, start, end, text, starts, ends, texts)
                        break
                else:
                    # No overlap at all: keep the clip as-is.
                    starts.append(start)
                    ends.append(end)
                    texts.append(text)

            # Data sanity check (debug message only; clip_idx is relative to
            # the sliced lists above, as in the original).
            assert (ends[-1] + 0.001) >= starts[-1] and len(
                texts[-1]
            ) > 0, "{} {} {} <- {} {} {}, {} {} {}".format(
                str(starts[-1]),
                str(ends[-1]),
                texts[-1],
                caption["start"][clip_idx - 1],
                caption["end"][clip_idx - 1],
                caption["text"][clip_idx - 1],
                str(start),
                str(end),
                text,
            )

        return {"start": starts, "end": ends, "text": texts}
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser(description="dedup how2 caption")
    arg_parser.add_argument('--how2dir', default="data/how2")
    cli_args = arg_parser.parse_args()

    raw_caption_json = os.path.join(cli_args.how2dir, "raw_caption.json")
    raw_caption_pickle = os.path.join(cli_args.how2dir, "raw_caption.pkl")
    raw_caption_dedup_pickle = os.path.join(cli_args.how2dir, "raw_caption_dedup.pkl")

    def convert_to_pickle(src_fn, tgt_fn):
        # Re-encode each video's caption dict as a JSON string and pickle
        # the resulting {video_id: json_str} mapping.
        with open(src_fn) as fd:
            raw_captions = json.load(fd)

        encoded = {vid: json.dumps(clips) for vid, clips in raw_captions.items()}

        with open(tgt_fn, "wb") as fw:
            pickle.dump(encoded, fw, pickle.HIGHEST_PROTOCOL)

    # Build the pickle cache from the raw JSON once.
    if not os.path.isfile(raw_caption_pickle):
        convert_to_pickle(raw_caption_json, raw_caption_pickle)

    deduper = CaptionDedupProcessor(raw_caption_pickle)
    deduper()
    deduper.finalize(raw_caption_dedup_pickle)


"""
# demo
deduper = CaptionDedupProcessor("data/how2/raw_caption.pkl")
deduper.single("HfIeQ9pzL5U")
"""
|
|