File size: 18,335 Bytes
ebf5d87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 |
import os
import time
import torch
import argparse
from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile
from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs
class BaseOptions(object):
saved_option_filename = "opt.json"
ckpt_filename = "model.ckpt"
tensorboard_log_dir = "tensorboard_log"
train_log_filename = "train.log.txt"
eval_log_filename = "eval.log.txt"
def __init__(self):
self.parser = argparse.ArgumentParser()
self.initialized = False
self.opt = None
def initialize(self):
self.initialized = True
self.parser.add_argument("--dset_name", type=str, choices=["tvr"])
self.parser.add_argument("--eval_split_name", type=str, default="val",
help="should match keys in corpus_path, must set for VCMR")
self.parser.add_argument("--debug", action="store_true",
help="debug (fast) mode, break all loops, do not load all data into memory.")
self.parser.add_argument("--data_ratio", type=float, default=1.0,
help="how many training and eval data to use. 1.0: use all, 0.1: use 10%."
"Use small portion for debug purposes. Note this is different from --debug, "
"which works by breaking the loops, typically they are not used together.")
self.parser.add_argument("--results_root", type=str, default="results")
self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training")
self.parser.add_argument("--seed", type=int, default=2018, help="random seed")
self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu")
self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job")
self.parser.add_argument("--num_workers", type=int, default=8,
help="num subprocesses used to load the data, 0: use main process")
self.parser.add_argument("--no_core_driver", action="store_true",
help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`")
self.parser.add_argument("--no_pin_memory", action="store_true",
help="Don't use pin_memory=True for dataloader. "
"ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4")
# training config
self.parser.add_argument("--lr", type=float, default=1e-3, help="learning rate")
self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10% of training.")
self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay")
self.parser.add_argument("--n_epoch", type=int, default=30, help="number of epochs to run")
self.parser.add_argument("--max_es_cnt", type=int, default=10,
help="number of epochs to early stop, use -1 to disable early stop")
self.parser.add_argument("--stop_task", type=str, default="SVMR", choices=["VCMR", "SVMR", "VR"])
self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+",
default=["SVMR"], choices=["VCMR", "SVMR", "VR"],
help="evaluate and report numbers for tasks specified here.")
self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size")
self.parser.add_argument("--eval_query_bsz", type=int, default=50,
help="mini-batch size at inference, for query")
self.parser.add_argument("--eval_context_bsz", type=int, default=200,
help="mini-batch size at inference, for video/sub")
self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model")
self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable")
self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss")
self.parser.add_argument("--lw_neg_q", type=float, default=1,
help="weight for ranking loss with negative query and positive context")
self.parser.add_argument("--lw_neg_ctx", type=float, default=1,
help="weight for ranking loss with positive query and negative context")
self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss")
self.parser.add_argument("--train_span_start_epoch", type=int, default=0,
help="which epoch to start training span prediction, -1 to disable")
self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"],
help="att loss type, can be hinge loss or its smooth approximation LogSumExp")
self.parser.add_argument("--hard_negtiave_start_epoch", type=int, default=20,
help="which epoch to start hard negative sampling for video-level ranking loss,"
"use -1 to disable")
self.parser.add_argument("--hard_pool_size", type=int, default=20,
help="hard negatives are still sampled, but from a harder pool.")
# Model and Data config
self.parser.add_argument("--max_sub_l", type=int, default=50,
help="max length of all sub sentence 97.71 under 50 for 3 sentences")
self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions")
self.parser.add_argument("--max_ctx_l", type=int, default=100,
help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100")
self.parser.add_argument("--train_path", type=str, default=None)
self.parser.add_argument("--eval_path", type=str, default=None,
help="Evaluating during training, for Dev set. If None, will only do training, "
"anet_cap and charades_sta has no dev set, so None")
self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features")
self.parser.add_argument("--word2idx_path", type=str,
help="a dict, {word: word_idx, ...}, "
"special tokens are {<pad>: 0, <unk>: 1, <eos>: 2}")
self.parser.add_argument("--vocab_size", type=int, default=-1,
help="Set automatically to len(word2idx)")
self.parser.add_argument("--glove_path", type=str,
help="path to file containing the GloVe embeddings for words in word2idx")
self.parser.add_argument("--desc_bert_path", type=str, default=None)
self.parser.add_argument("--sub_bert_path", type=str, default=None)
self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature")
self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature")
self.parser.add_argument("--ctx_mode", type=str, choices=["video", "sub", "video_sub", "tef",
"video_tef", "sub_tef", "video_sub_tef"],
help="which context to use. a combination of [video, sub, tef]")
self.parser.add_argument("--corpus_path", type=str, default=None)
self.parser.add_argument("--vid_feat_path", type=str, default="")
self.parser.add_argument("--no_norm_vfeat", action="store_true",
help="Do not do normalization on video feat, use it when using i3d_resnet concat feat")
self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat")
self.parser.add_argument("--clip_length", type=float, default=None,
help="each video will be uniformly segmented into small clips, "
"will automatically loaded from ProposalConfigs if None")
self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature")
self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None,
help="if set, use external video retrieval results to guide evaluation. ")
self.parser.add_argument("--span_predictor_type", type=str, default="conv", choices=["conv", "cat_linear"],
help="how to generate span predictions, "
"conv: apply 1D-Conv layer on top of NxL dot product of query and clips"
"cat_linear: cat the query and clips then use a linear layer to give output. "
"Note cat_linear is implemented as first project query and clips into scores, "
"separately, then sum them up, this should be similar to first cat then project.")
self.parser.add_argument("--encoder_type", type=str, default="transformer",
choices=["gru", "lstm", "transformer", "cnn"])
self.parser.add_argument("--add_pe_rnn", action="store_true",
help="Add positional encoding for GRU and LSTM encoder as well")
self.parser.add_argument("--no_merge_two_stream", action="store_true", help="do not merge video and subtitles")
self.parser.add_argument("--no_cross_att", action="store_true",
help="Use cross-attention for modeling video and subtitles")
self.parser.add_argument("--no_self_att", action="store_true", help="do not use self attention")
self.parser.add_argument("--no_modular", action="store_true", help="do not use modular attention")
self.parser.add_argument("--pe_type", type=str, default="cosine", choices=["none", "linear", "cosine"],
help="Only for query encoding")
self.parser.add_argument("--max_position_embeddings", type=int, default=300)
self.parser.add_argument("--hidden_size", type=int, default=128)
self.parser.add_argument("--n_heads", type=int, default=4)
self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs")
self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers")
self.parser.add_argument("--cross_att_drop", type=float, default=0.1, help="Applied to cross-att")
self.parser.add_argument("--conv_kernel_size", type=int, default=5)
self.parser.add_argument("--conv_stride", type=int, default=1)
self.parser.add_argument("--initializer_range", type=float, default=0.02,
help="initializer range for linear layer")
# post processing
self.parser.add_argument("--min_pred_l", type=int, default=2,
help="constrain the [st, ed] with ed - st >= 2"
"(2 clips with length 1.5 each, 3 secs in total"
"this is the min length for proposal-based method)")
self.parser.add_argument("--max_pred_l", type=int, default=16,
help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total"
"(16 clips with length 1.5 each, "
"this is the max length for proposal-based method)")
self.parser.add_argument("--q2c_alpha", type=float, default=20,
help="give more importance to top scored videos' spans, "
"the new score will be: s_new = exp(alpha * s), "
"higher alpha indicates more importance. Note s in [-1, 1]")
self.parser.add_argument("--max_before_nms", type=int, default=200)
self.parser.add_argument("--max_vcmr_video", type=int, default=100,
help="re-ranking in top-max_vcmr_video")
self.parser.add_argument("--nms_thd", type=float, default=-1,
help="additionally use non-maximum suppression "
"(or non-minimum suppression for distance)"
"to post-processing the predictions. "
"-1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap,")
def display_save(self, opt):
args = vars(opt)
# Display settings
print("------------ Options -------------\n{}\n-------------------"
.format({str(k): str(v) for k, v in sorted(args.items())}))
# Save settings
if not isinstance(self, TestOptions):
option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed
save_json(args, option_file_path, save_pretty=True)
def parse(self):
if not self.initialized:
self.initialize()
opt = self.parser.parse_args()
if opt.debug:
opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ])
opt.no_core_driver = True
opt.num_workers = 0
opt.eval_query_bsz = 100
if isinstance(self, TestOptions):
# modify model_dir to absolute path
opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir)
saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename))
for arg in saved_options: # use saved options to overwrite all BaseOptions args.
if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name",
"eval_path", "max_pred_l", "min_pred_l"]:
setattr(opt, arg, saved_options[arg])
# opt.no_core_driver = True
else:
if opt.exp_id is None:
raise ValueError("--exp_id is required for at a training option!")
if opt.clip_length is None:
opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"]
print("Loaded clip_length {} from proposal config file".format(opt.clip_length))
opt.results_dir = os.path.join(opt.results_root,
"-".join([opt.dset_name, opt.ctx_mode, opt.exp_id,
time.strftime("%Y_%m_%d_%H_%M_%S")]))
mkdirp(opt.results_dir)
# save a copy of current code
code_dir = os.path.dirname(os.path.realpath(__file__))
code_zip_filename = os.path.join(opt.results_dir, "code.zip")
make_zipfile(code_dir, code_zip_filename,
enclosing_dir="code",
exclude_dirs_substring="results",
exclude_dirs=["results", "debug_results", "__pycache__"],
exclude_extensions=[".pyc", ".ipynb", ".swap"],)
self.display_save(opt)
if "sub" in opt.ctx_mode:
assert opt.dset_name == "tvr", "sub is only supported for tvr dataset"
if opt.hard_negtiave_start_epoch != -1:
if opt.hard_pool_size > opt.bsz:
print("[WARNING] hard_pool_size is larger than bsz")
assert opt.stop_task in opt.eval_tasks_at_training
opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename)
opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename)
opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename)
opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir)
opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu")
opt.h5driver = None if opt.no_core_driver else "core"
# num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5
opt.num_workers = 1 if opt.no_core_driver else opt.num_workers
opt.pin_memory = not opt.no_pin_memory
if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d
assert opt.no_norm_vfeat
if "tef" in opt.ctx_mode and "video" in opt.ctx_mode:
opt.vid_feat_size += 2
if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode:
opt.sub_feat_size += 2
if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode:
opt.no_merge_two_stream = True
opt.no_cross_att = True
self.opt = opt
return opt
class TestOptions(BaseOptions):
"""add additional options for evaluating"""
def initialize(self):
BaseOptions.initialize(self)
# also need to specify --eval_split_name
self.parser.add_argument("--eval_id", type=str, help="evaluation id")
self.parser.add_argument("--model_dir", type=str,
help="dir contains the model file, will be converted to absolute path afterwards")
self.parser.add_argument("--tasks", type=str, nargs="+",
choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"],
help="Which tasks to run."
"VCMR: Video Corpus Moment Retrieval;"
"SVMR: Single Video Moment Retrieval;"
"VR: regular Video Retrieval. (will be performed automatically with VCMR)")
|