import os
import time
import json
import pprint
import random
import importlib
from collections import defaultdict

import h5py
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange

import sys
sys.path.append('/Users/kevin/univtg')
from main.config import BaseOptions, setup_model
from main.dataset_qfvs import DatasetQFVS, prepare_batch_inputs_qfvs, start_end_collate_qfvs
from utils.basic_utils import set_seed, AverageMeter, dict_to_markdown, save_json, save_jsonl, load_json, load_pickle, l2_normalize_np_array
from utils.model_utils import count_parameters
from eval.qfvs import calculate_semantic_matching, load_videos_tag

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)


def eval_epoch(model, config, opt):
    model.eval()
    f1_sum = 0; p_sum = 0; r_sum = 0

    assert len(config['test_videos']) == 1
    video_id = config['test_videos'][0]
    embedding = load_pickle(f"./data/qfvs/txt_clip/{config['txt_feature']}.pkl")

    feat_type = config['vid_feature']
    feat = h5py.File(f'./data/qfvs/processed/P0{video_id}_{feat_type}.h5', 'r')
    features = torch.from_numpy(feat['features'][()])
    seg_len = torch.from_numpy(feat['seg_len'][()])

    # Some oracle summary filenames spell concepts differently than the
    # text-embedding keys; remap them before lookup.
    transfer = {"Cupglass": "Glass",
                "Musicalinstrument": "Instrument",
                "Petsanimal": "Animal"}

    with open(os.path.join('./plot', opt.dset_name, str(opt.qfvs_split) + '.jsonl'), 'w') as f_write:
        for _, _, files in os.walk("./data/qfvs/metadata/origin_data/Query-Focused_Summaries/Oracle_Summaries/P0" + str(video_id)):
            evaluation_num = len(files)

            # Valid-frame mask: frame k of segment j is real iff k < seg_len[j].
            mask_GT = torch.zeros(config["max_segment_num"], config["max_frame_num"], dtype=torch.bool).cuda()
            for j in range(len(seg_len)):
                for k in range(seg_len[j]):
                    mask_GT[j][k] = 1

            for file in files:
                summaries_GT = []
                with open("./data/qfvs/metadata/origin_data/Query-Focused_Summaries/Oracle_Summaries/P0" + str(video_id) + "/" + file, "r") as f:
                    for line in f.readlines():
                        summaries_GT.append(int(line.strip()))

                concept1, concept2 = file.split('_')[0:2]
                if concept1 in transfer:
                    concept1 = transfer[concept1]
                if concept2 in transfer:
                    concept2 = transfer[concept2]
                concept1 = l2_normalize_np_array(embedding[concept1])
                concept2 = l2_normalize_np_array(embedding[concept2])

                data = {
                    'features': features,
                    'seg_len': seg_len,
                    'tokens_pad1': torch.from_numpy(concept1),
                    'tokens_pad2': torch.from_numpy(concept2),
                    'mask_GT': mask_GT,
                }
                input1, input2, input_oracle, mask = prepare_batch_inputs_qfvs(
                    start_end_collate_qfvs([data]), config, eval=True)

                # Oracle summaries are 1-indexed; shift to 0-indexed shot ids.
                summaries_GT = [x - 1 for x in summaries_GT]
                video_shots_tag = load_videos_tag(mat_path="./eval/Tags.mat")

                if opt.f_loss_coef == 0:
                    output_type = 'saliency_scores'
                elif opt.s_loss_intra_coef == 0:
                    output_type = 'pred_logits'
                else:
                    if config['qfvs_score_ensemble'] > 0:
                        output_type = ['pred_logits', 'saliency_scores']
                    else:
                        output_type = 'pred_logits'
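                # Score the two single-concept queries and the oracle (joint)
                # query; when ensembling, sum the masked scores of both heads.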
                with torch.no_grad():
                    if not isinstance(output_type, list):
                        score1 = model(**input1)[output_type].squeeze()
                        score1 = score1.masked_select(mask_GT)
                        score2 = model(**input2)[output_type].squeeze()
                        score2 = score2.masked_select(mask_GT)
                        score = model(**input_oracle)[output_type].squeeze()
                        score = score.masked_select(mask_GT)
                    else:
                        n_valid = int(mask.sum().item())
                        score1 = torch.zeros(n_valid).cuda()
                        score2 = torch.zeros(n_valid).cuda()
                        score = torch.zeros(n_valid).cuda()
                        for output_t in output_type:
                            score1 += model(**input1)[output_t].squeeze().masked_select(mask_GT)
                            score2 += model(**input2)[output_t].squeeze().masked_select(mask_GT)
                            score += model(**input_oracle)[output_t].squeeze().masked_select(mask_GT)

                if config['qfvs_score_gather'] > 0:
                    score = score + score1 + score2

                # since video4's feature dim is greater than video_shots_tag, truncate.
                score = score[:min(score.shape[0], video_shots_tag[video_id - 1].shape[0])]

                _, top_index = score.topk(int(score.shape[0] * config["top_percent"]))

                c1, c2 = file.split('_')[0:2]
                if c1 in transfer:
                    c1 = transfer[c1]
                if c2 in transfer:
                    c2 = transfer[c2]

                p, r, f1 = calculate_semantic_matching(list(top_index.cpu().numpy()), summaries_GT,
                                                       video_shots_tag, video_id=video_id - 1)
                entry = {
                    'concept1': c1, 'concept2': c2,
                    'score': score.tolist(),
                    'top_percent': config["top_percent"],
                    'top_pred': top_index.tolist(),
                    'gt': summaries_GT,
                    'p': p, 'r': r, 'f1': f1,
                    'shots': video_shots_tag[video_id - 1].shape[0],
                }
                f_write.write(json.dumps(entry) + '\n')

                f1_sum += f1; r_sum += r; p_sum += p

    return {'F': round(100 * f1_sum / evaluation_num, 2),
            'R': round(100 * r_sum / evaluation_num, 2),
            'P': round(100 * p_sum / evaluation_num, 2)}


def idx2time(idx):
    # Each shot index covers a 5-second window; print its start/end as h m s.
    sec1, sec2 = idx * 5, (idx + 1) * 5
    h1 = sec1 // 3600
    m1 = (sec1 - h1 * 3600) // 60
    s1 = sec1 % 60
    h2 = sec2 // 3600
    m2 = (sec2 - h2 * 3600) // 60
    s2 = sec2 % 60
    print(h1, m1, s1, '\t', h2, m2, s2)


def train_epoch(model, criterion, train_loader, optimizer, opt, config, epoch_i, tb_writer):
    model.train()
    criterion.train()

    # init meters
    time_meters = defaultdict(AverageMeter)
    loss_meters = defaultdict(AverageMeter)

    timer_dataloading = time.time()
    loss_total = 0
    for batch_idx, batch in enumerate(tqdm(train_loader)):
        time_meters["dataloading_time"].update(time.time() - timer_dataloading)

        timer_start = time.time()
        model_input1, model_input2, model_input_oracle, \
            model_gt1, model_gt2, model_gt_oracle, \
            mask_GT = prepare_batch_inputs_qfvs(batch, config)
        time_meters["prepare_inputs_time"].update(time.time() - timer_start)

        timer_start = time.time()
        output1 = model(**model_input1)
        output2 = model(**model_input2)
        output_oracle = model(**model_input_oracle)

        loss_dict = {}
        loss_dict1 = criterion(output1, model_gt1, mask_GT)
        loss_dict2 = criterion(output2, model_gt2, mask_GT)
        loss_dict3 = criterion(output_oracle, model_gt_oracle, mask_GT)

        weight_dict = criterion.weight_dict
        if config['qfvs_loss_gather'] > 0:
            for k in loss_dict1.keys():
                loss_dict[k] = loss_dict1[k] + loss_dict2[k] + loss_dict3[k]
        else:
            loss_dict = loss_dict3

        losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
        loss_total += losses.item()
        time_meters["model_forward_time"].update(time.time() - timer_start)

        timer_start = time.time()
        optimizer.zero_grad()
        losses.backward()
        if opt.grad_clip > 0:
            nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)
        optimizer.step()
        time_meters["model_backward_time"].update(time.time() - timer_start)

        timer_dataloading = time.time()
    return round(loss_total / len(train_loader), 2)


# train in single domain.
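# `train` handles one leave-one-out split: it evaluates once before training,
# then alternates train/eval epochs and checkpoints on the best F-score.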
def train(model, criterion, optimizer, lr_scheduler, train_loader, opt, config):
    # if opt.device.type == "cuda":
    #     logger.info("CUDA enabled.")
    #     model.to(opt.device)

    tb_writer = SummaryWriter(opt.tensorboard_log_dir)
    tb_writer.add_text("hyperparameters", dict_to_markdown(vars(opt), max_str_len=None))
    opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n"
    opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str} [Metrics] {eval_metrics_str}\n"

    prev_best_score = {'Fscore': 0, 'Precision': 0, 'Recall': 0}
    if opt.start_epoch is None:
        start_epoch = -1 if opt.eval_init else 0
    else:
        start_epoch = opt.start_epoch

    val_score = eval_epoch(model, config, opt)
    tb_writer.add_scalar(f"Eval/QFVS-V{config['test_videos'][0]}-fscore", float(val_score['F']), 0)
    logger.info(f"[Epoch {0}] [Fscore: {val_score['F']} / {prev_best_score['Fscore']}]"
                f" [Precision: {val_score['P']} / {prev_best_score['Precision']}]"
                f" [Recall: {val_score['R']} / {prev_best_score['Recall']}]")

    for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"):
        if epoch_i > -1:
            loss_epoch = train_epoch(model, criterion, train_loader, optimizer, opt, config, epoch_i, tb_writer)
            lr_scheduler.step()

        eval_epoch_interval = opt.eval_epoch
        if opt.eval_path is not None and (epoch_i + 1) % eval_epoch_interval == 0:
            with torch.no_grad():
                val_score = eval_epoch(model, config, opt)

            tb_writer.add_scalar(f"Eval/QFVS-V{config['test_videos'][0]}-fscore", float(val_score['F']), epoch_i + 1)
            logger.info(f"[Epoch {epoch_i + 1}, Loss {loss_epoch}] [Fscore: {val_score['F']} / {prev_best_score['Fscore']}]"
                        f" [Precision: {val_score['P']} / {prev_best_score['Precision']}]"
                        f" [Recall: {val_score['R']} / {prev_best_score['Recall']}]")

            if prev_best_score['Fscore'] < val_score['F']:
                prev_best_score['Fscore'] = val_score['F']
                prev_best_score['Precision'] = val_score['P']
                prev_best_score['Recall'] = val_score['R']
                checkpoint = {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "epoch": epoch_i,
                    "opt": opt,
                }
                torch.save(checkpoint, opt.ckpt_filepath.replace(".ckpt", f"_V{config['test_videos'][0]}_best.ckpt"))

    tb_writer.close()
    return prev_best_score


def update_config(opt, config):
    # Copy the QFVS-specific options from the parsed args into the config dict.
    config["max_segment_num"] = opt.max_segment_num
    config["max_frame_num"] = opt.max_frame_num
    config["top_percent"] = opt.top_percent
    config["vid_feature"] = opt.qfvs_vid_feature
    config["txt_feature"] = opt.qfvs_txt_feature
    config["qfvs_dense_shot"] = opt.qfvs_dense_shot
    config["qfvs_score_ensemble"] = opt.qfvs_score_ensemble
    config["qfvs_score_gather"] = opt.qfvs_score_gather
    config["qfvs_loss_gather"] = opt.qfvs_loss_gather
    return config


def start_training():
    logger.info("Setup config, data and model...")
    opt = BaseOptions().parse()
    set_seed(opt.seed)

    # config = load_json("./main/config_qfvs.json")
    config = {}
    config = update_config(opt, config)

    tb_writer = SummaryWriter(opt.tensorboard_log_dir)

    # key -> test video; value -> training videos.
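    # Leave-one-out cross-validation over the four QFVS videos: each split
    # holds one video out for testing and trains on the other three.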
    qfvs_split = {
        1: [2, 3, 4],
        2: [1, 3, 4],
        3: [1, 2, 4],
        4: [1, 2, 3],
    }

    scores_videos = {}
    for test_id, splits in qfvs_split.items():
        if opt.qfvs_split != -1 and test_id != opt.qfvs_split:
            continue
        logger.info(f"Start Training {opt.dset_name}: {test_id}")
        config['train_videos'] = qfvs_split[test_id]
        config['test_videos'] = [test_id]
        train_dataset = DatasetQFVS(config)
        train_loader = DataLoader(train_dataset, batch_size=opt.bsz, collate_fn=start_end_collate_qfvs,
                                  shuffle=True, num_workers=opt.num_workers)

        model, criterion, optimizer, lr_scheduler = setup_model(opt)
        count_parameters(model)
        best_score = train(model, criterion, optimizer, lr_scheduler, train_loader, opt, config)
        scores_videos['V' + str(test_id)] = best_score

    # save the final results.
    avg_fscore = sum(v['Fscore'] for v in scores_videos.values()) / len(scores_videos)
    avg_precision = sum(v['Precision'] for v in scores_videos.values()) / len(scores_videos)
    avg_recall = sum(v['Recall'] for v in scores_videos.values()) / len(scores_videos)
    scores_videos['avg'] = {'Fscore': avg_fscore, 'Precision': avg_precision, 'Recall': avg_recall}

    save_metrics_path = os.path.join(opt.results_dir, f"best_{opt.dset_name}_{opt.eval_split_name}_preds_metrics.json")
    save_json(scores_videos, save_metrics_path, save_pretty=True, sort_keys=False)

    tb_writer.add_scalar("Eval/QFVS-avg-fscore", round(avg_fscore, 2), 1)
    tb_writer.add_text(f"Eval/QFVS-{opt.dset_name}", dict_to_markdown(scores_videos, max_str_len=None))
    tb_writer.close()

    print(scores_videos)
    return


if __name__ == '__main__':
    start_training()
    logger.info("\n\n\nFINISHED TRAINING!!!")