File size: 13,518 Bytes
ebf5d87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import os
import time
import torch
import argparse

from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile
from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs


class BaseOptions(object):
    """Command-line options shared by training and evaluation.

    Wraps an ``argparse.ArgumentParser``. ``initialize`` registers the
    arguments; ``parse`` parses ``sys.argv``, derives the result paths, device
    and data-loading settings, stores the namespace on ``self.opt`` and
    returns it.
    """
    # Names of the files / directory that ``parse`` joins onto ``opt.results_dir``.
    saved_option_filename = "opt.json"
    ckpt_filename = "model.ckpt"
    tensorboard_log_dir = "tensorboard_log"
    train_log_filename = "train.log.txt"
    eval_log_filename = "eval.log.txt"

    def __init__(self):
        """Create an empty parser; arguments are registered lazily by ``initialize``."""
        self.parser = argparse.ArgumentParser()
        self.initialized = False
        self.opt = None

    def initialize(self):
        """Register every shared argument on ``self.parser`` and mark the parser as initialized."""
        self.initialized = True
        self.parser.add_argument("--dset_name", type=str, choices=["tvr"])
        self.parser.add_argument("--eval_split_name", type=str, default="val",
                                 help="should match keys in corpus_path, must set for VCMR")
        self.parser.add_argument("--debug", action="store_true",
                                 help="debug (fast) mode, break all loops, do not load all data into memory.")
        # NOTE: argparse %-formats help strings, so a literal percent sign must be written as "%%".
        # A bare "%" makes `--help` (and, on newer Pythons, add_argument itself) raise ValueError.
        self.parser.add_argument("--data_ratio", type=float, default=1.0,
                                 help="how many training and eval data to use. 1.0: use all, 0.1: use 10%%."
                                      "Use small portion for debug purposes. Note this is different from --debug, "
                                      "which works by breaking the loops, typically they are not used together.")
        self.parser.add_argument("--results_root", type=str, default="results")
        self.parser.add_argument("--exp_id", type=str, default="res", help="id of the current run")
        self.parser.add_argument("--seed", type=int, default=2018, help="random seed")
        self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu")
        self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job")
        self.parser.add_argument("--num_workers", type=int, default=8,
                                 help="num subprocesses used to load the data, 0: use main process")
        self.parser.add_argument("--no_core_driver", action="store_true",
                                 help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`")
        self.parser.add_argument("--no_pin_memory", action="store_true",
                                 help="Don't use pin_memory=True for dataloader. "
                                      "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4")

        # training config
        self.parser.add_argument("--lr", type=float, default=0.05, help="learning rate")
        self.parser.add_argument("--wd", type=float, default=0, help="weight decay")
        self.parser.add_argument("--momentum", type=float, default=0.95, help="momentum for SGD")
        self.parser.add_argument("--n_epoch", type=int, default=108, help="number of epochs to run")
        self.parser.add_argument("--max_es_cnt", type=int, default=108, help="number of epochs to early stop")
        self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size")
        self.parser.add_argument("--eval_query_bsz", type=int, default=1000,
                                 help="mini-batch size at inference, for query")
        self.parser.add_argument("--eval_proposal_bsz", type=int, default=200,
                                 help="mini-batch size at inference, for proposals")
        self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model")
        self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable")
        self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss")
        self.parser.add_argument("--inter_loss_weight", type=float, default=0.4, help="margin for ranking loss")
        self.parser.add_argument("--loss_type", type=str, default="hinge", choices=["hinge", "lse"],
                                 help="att loss type, can be hinge loss or its smooth approximation LogSumExp")

        # Model and Data config
        self.parser.add_argument("--max_sub_l", type=int, default=50,
                                 help="max length of all sub sentence 97.71 under 50 for 3 sentences")
        self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions")
        self.parser.add_argument("--pos_iou_thd", type=float, default=0.7, help="moments with IoU >= as positive")
        self.parser.add_argument("--neg_iou_thd", type=float, default=0.35, help="moments with IoU < as negative")

        self.parser.add_argument("--train_path", type=str, default=None)
        self.parser.add_argument("--eval_path", type=str, default=None,
                                 help="Evaluating during training, for Dev set. If None, will only do training, "
                                      "anet_cap and charades_sta has no dev set, so None")
        self.parser.add_argument("--external_train_vr_res_path", type=str, default=None,
                                 help="if set, use external video retrieval results to guide "
                                      "inter-nvideo negative sampling. ")
        self.parser.add_argument("--init_ckpt_path", type=str, default=None,
                                 help="init model parameters from checkpoint. Use absolute path")
        self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None,
                                 help="if set, use external video retrieval results to guide evaluation. ")
        self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features")
        self.parser.add_argument("--word2idx_path", type=str,
                                 help="a dict, {word: word_idx, ...}, "
                                      "special tokens are {<pad>: 0, <unk>: 1, <eos>: 2}")
        self.parser.add_argument("--vocab_size", type=int, default=-1,
                                 help="Set automatically to len(word2idx)")
        self.parser.add_argument("--glove_path", type=str,
                                 help="path to file containing the GloVe embeddings for words in word2idx")
        self.parser.add_argument("--desc_bert_path", type=str, default=None)
        self.parser.add_argument("--sub_bert_path", type=str, default=None)
        self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature")
        self.parser.add_argument("--desc_feat_size", type=int, default=768)
        self.parser.add_argument("--ctx_mode", type=str,
                                 choices=["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"],
                                 help="which context to use. a combination of [video, sub, tef]")
        self.parser.add_argument("--corpus_path", type=str, default=None)
        self.parser.add_argument("--vid_feat_path", type=str, default="")
        self.parser.add_argument("--no_norm_vfeat", action="store_true",
                                 help="Do not do normalization on video feat, use it when using i3d_resnet concat feat")
        self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat")
        self.parser.add_argument("--clip_length", type=float, default=None,
                                 help="each video will be uniformly segmented into small clips, "
                                      "will automatically loaded from ProposalConfigs if None")
        self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature")

        self.parser.add_argument("--model_type", default="cal", choices=["cal", "mcn"])
        self.parser.add_argument("--embedding_size", type=int, default=768)
        self.parser.add_argument("--lstm_hidden_size", type=int, default=256)
        self.parser.add_argument("--visual_hidden_size", type=int, default=256)
        self.parser.add_argument("--output_size", type=int, default=256)

        # post processing
        self.parser.add_argument("--nms_thd", type=float, default=-1,
                                 help="additionally use non-maximum suppression "
                                      "(or non-minimum suppression for distance)"
                                      "to post-processing the predictions. "
                                      "-1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap,")
        self.parser.add_argument("--max_after_nms", type=int, default=100, help="Stores at max_after_nms for eval")
        self.parser.add_argument("--max_before_nms", type=int, default=300, help="Max before nms")
        self.parser.add_argument("--use_intermediate", action="store_true",
                                 help="Whether to use/save intermediate results to results directory."
                                      "Might want use this if we are going to ")

    def save_args(self, opt):
        """Dump ``opt`` as pretty-printed JSON into ``opt.results_dir``.

        Nothing is written when ``self`` is a ``TestOptions`` instance, so an
        evaluation run does not write an option file.
        """
        args = vars(opt)
        # Save settings
        if not isinstance(self, TestOptions):
            option_file_path = os.path.join(opt.results_dir, self.saved_option_filename)  # not yaml file indeed
            save_json(args, option_file_path, save_pretty=True)

    def parse(self):
        """Parse the command line and derive the run configuration.

        For a training run (``self`` is not a ``TestOptions``) this creates a
        timestamped results directory, zips a copy of the code into it and
        saves the options there. For an evaluation run the options saved in
        ``opt.model_dir`` overwrite the parsed ones, except for a small set of
        evaluation-time arguments.

        Returns:
            The populated ``argparse.Namespace`` (also stored on ``self.opt``).

        Raises:
            ValueError: if ``--exp_id`` is None for a training run.
            AssertionError: if ``ctx_mode``/``dset_name``/``vid_feat_size`` are inconsistent.
        """
        if not self.initialized:
            self.initialize()
        opt = self.parser.parse_args()

        if opt.debug:
            # replace the last component of results_root with "debug_results"
            opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ])
            opt.no_core_driver = True
            opt.num_workers = 0

        if isinstance(self, TestOptions):
            # modify model_dir to absolute path
            opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir)
            saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename))
            for arg in saved_options:  # use saved options to overwrite all BaseOptions args.
                if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name", "eval_path",
                               "use_intermediate", "external_inference_vr_res_path"]:
                    setattr(opt, arg, saved_options[arg])
            # opt.no_core_driver = True
        else:
            if opt.exp_id is None:
                raise ValueError("--exp_id is required for at a training option!")

            if opt.clip_length is None:
                opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"]
            opt.results_dir = os.path.join(opt.results_root,
                                           "-".join([opt.dset_name, opt.model_type, opt.ctx_mode, opt.exp_id,
                                                     time.strftime("%Y_%m_%d_%H_%M_%S")]))
            mkdirp(opt.results_dir)
            # save a copy of current code
            code_dir = os.path.dirname(os.path.realpath(__file__))
            code_zip_filename = os.path.join(opt.results_dir, "code.zip")
            make_zipfile(code_dir, code_zip_filename,
                         enclosing_dir="code",
                         exclude_dirs_substring="results",
                         exclude_dirs=["results", "debug_results", "__pycache__"],
                         exclude_extensions=[".pyc", ".ipynb", ".swap"])

        self.save_args(opt)

        if "sub" in opt.ctx_mode:
            assert opt.dset_name == "tvr", "sub is only supported for tvr dataset"

        if "video" in opt.ctx_mode and opt.vid_feat_size > 3000:  # 3072, the normalized concatenation of resnet+i3d
            assert opt.no_norm_vfeat

        opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename)
        opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename)
        opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename)
        opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir)
        # --device is an int flag on the command line; from here on opt.device is a torch.device
        opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu")
        opt.h5driver = None if opt.no_core_driver else "core"
        opt.pin_memory = not opt.no_pin_memory
        # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5.
        # Cap at 1 rather than forcing 1, so that num_workers=0 (set by --debug above,
        # or requested explicitly) is not silently overridden.
        if opt.no_core_driver:
            opt.num_workers = min(opt.num_workers, 1)

        # Display settings
        print("------------ Options -------------\n{}\n-------------------"
              .format({str(k): str(v) for k, v in sorted(vars(opt).items())}))
        self.opt = opt
        return opt


class TestOptions(BaseOptions):
    """add additional options for evaluating"""
    def initialize(self):
        """Register the shared arguments, then the evaluation-only ones."""
        BaseOptions.initialize(self)
        # also need to specify --eval_split_name
        self.parser.add_argument("--eval_id", type=str, help="evaluation id")
        self.parser.add_argument("--model_dir", type=str,
                                 help="dir contains the model file, will be converted to absolute path afterwards")
        # With nargs="+" a value given on the command line is parsed into a list, so the
        # default must be a list too. A bare string default would leave opt.tasks as a str,
        # and membership tests would become substring tests ("VR" in "SVMR" is True).
        self.parser.add_argument("--tasks", type=str, nargs="+", choices=["VCMR", "SVMR", "VR"], default=["SVMR"],
                                 help="Which tasks to run."
                                      "VCMR: Video Corpus Moment Retrieval;"
                                      "SVMR: Single Video Moment Retrieval;"
                                      "VR: regular Video Retrieval.")