## Import

In [1]:
# Reload edited project modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
from fairseq import utils, tasks
from fairseq import checkpoint_utils
from utils.eval_utils import eval_step
from tasks.mm_tasks.caption import CaptionTask
from models.unival import UnIVALModel
from PIL import Image

import random
from torchvision.transforms import functional as F
from torchvision.transforms import InterpolationMode

from matplotlib import pyplot as plt

# turn on cuda if GPU is available
use_cuda = torch.cuda.is_available()
# fp16 inference is switched off here; only enable it when a GPU is available
use_fp16 = False
import os 

In [3]:
# Intended to register CaptionTask under the task name 'video_caption' (not RefCOCO).
# NOTE(review): called with two positional arguments, register_task returns its inner
# `register_task_cls` decorator (see this cell's output) instead of applying it to
# CaptionTask -- confirm that the task is actually registered elsewhere (e.g. on import).
tasks.register_task('video_caption', CaptionTask)

<function fairseq.tasks.register_task.<locals>.register_task_cls(cls)>

### Load model

In [80]:
# Pretrained video-captioning checkpoint and the weights file passed on as `video_model_path`
checkpoint_path = '/data/mshukor/logs/ofa/best_models/unival_video_caption_stage_1/checkpoint_best.pt'
video_model_path = '/data/mshukor/logs/ofa/best_models/resnext-101-kinetics.pth'

# Values handed to the loader as `arg_overrides`
overrides = dict(
    eval_cider=False,
    beam=5,
    max_len_b=22,
    no_repeat_ngram_size=3,
    seed=7,
    unnormalized=False,
    bpe_dir="utils/BPE",
    video_model_path=video_model_path,
)

# Load the model ensemble together with its config and task
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    utils.split_paths(checkpoint_path), arg_overrides=overrides
)

# Put every loaded model into inference mode
for model in models:
    model.eval()
    if use_fp16:
        # optional half precision
        model.half()
    if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
        # move to GPU unless pipeline model parallelism is configured
        model.cuda()
    model.prepare_for_inference_(cfg)

# Build the sequence generator from the generation section of the config
generator = task.build_generator(models, cfg.generation)

self.sample_patch_num 784
self.sample_audio_patch_num None
self.sample_video_patch_num None
self.with_cls False
Loading: all_resnext101
use bn: 
load pretrained_model /data/mshukor/logs/ofa/best_models/resnext-101-kinetics.pth
_IncompatibleKeys(missing_keys=[], unexpected_keys=['fc.weight', 'fc.bias'])
unival
getattr(args, "stop_on_max_len", False) False


### Preprocess

In [81]:
# Frame preprocessing: center-crop, scale pixel values, normalize channels
from torchvision import transforms

# One value per channel, consumed by transforms.Normalize below
mean = [0.5] * 3
std = [0.5] * 3

# Cast to float and divide by 255
type_transform = transforms.Lambda(lambda frames: frames.float().div(255.0))

# Crop to cfg.task.patch_frame_size, then rescale and normalize
patch_video_resize_transform = transforms.Compose(
    [
        transforms.CenterCrop(cfg.task.patch_frame_size),
        type_transform,
        transforms.Normalize(mean=mean, std=std),
    ]
)

# video process
from data.video_utils import VIDEO_READER_FUNCS

# Use the reader function stored under the 'decord' key to load video frames
video_reader = VIDEO_READER_FUNCS['decord'] 

def process_video(video_path, max_num_frames=16, num_frames=16, sample_type='rand',):
    """Read a video file and return it as a preprocessed, batched clip.

    Args:
        video_path: Path of the video file handed to ``video_reader``.
        max_num_frames: Forwarded to ``video_reader`` as ``max_num_frames``.
        num_frames: Forwarded to ``video_reader`` as its second positional argument.
        sample_type: Forwarded to ``video_reader`` as its third positional argument.

    Returns:
        The frames after ``patch_video_resize_transform``, with the first two
        axes swapped and a leading axis of size 1 prepended.
    """
    source = os.fspath(video_path)

    # Only the frames are needed; frame indices and duration are discarded.
    clip, _, _ = video_reader(source, num_frames, sample_type, max_num_frames=max_num_frames)

    clip = patch_video_resize_transform(clip)
    # swap the first two axes -> (C, T, h, w), then add the batch axis
    return clip.permute(1, 0, 2, 3).unsqueeze(0)
 

# Text preprocess
# Single-element LongTensors holding the source dictionary's BOS and EOS ids
bos_item = torch.LongTensor([task.src_dict.bos()])
eos_item = torch.LongTensor([task.src_dict.eos()])
# Padding id of the source dictionary
pad_idx = task.src_dict.pad()
def encode_text(text, length=None, append_bos=False, append_eos=False):
    """BPE-encode ``text`` and map it to a tensor of dictionary ids.

    Args:
        text: Raw string; it is passed through ``task.bpe.encode`` and then
            ``task.tgt_dict.encode_line`` (no new symbols added, no EOS added).
        length: When not ``None``, keep at most this many ids. The cut is made
            before any BOS/EOS id is attached.
        append_bos: Prepend ``bos_item`` to the ids.
        append_eos: Append ``eos_item`` to the ids.

    Returns:
        A 1-D long tensor of ids.
    """
    bpe_line = task.bpe.encode(text)
    token_ids = task.tgt_dict.encode_line(
        line=bpe_line, add_if_not_exist=False, append_eos=False
    ).long()

    if length is not None:
        token_ids = token_ids[:length]

    # Collect the pieces in order and concatenate only if something was added.
    pieces = [token_ids]
    if append_bos:
        pieces.insert(0, bos_item)
    if append_eos:
        pieces.append(eos_item)
    return token_ids if len(pieces) == 1 else torch.cat(pieces)

# Construct input for caption task
def construct_sample(video_path):
    """Assemble the single-example input dict for captioning one video.

    Args:
        video_path: Path of the video, forwarded to ``process_video``.

    Returns:
        A dict with an ``"id"`` array and a ``"net_input"`` dict holding the
        prompt tokens and their length, the processed video, an all-zero
        image, and one-element mask / type tensors.
    """
    task_cfg = cfg.task

    patch_video = process_video(
        video_path,
        max_num_frames=16,
        num_frames=task_cfg.num_frames,
        sample_type=task_cfg.sample_type,
    )

    # All-zero image.
    # NOTE(review): unlike patch_video, this tensor has no leading batch axis -- confirm intended.
    image_side = task_cfg.patch_image_size
    patch_image = torch.zeros((3, image_side, image_side))

    # Fixed prompt wrapped in BOS/EOS, with a batch axis added in front
    prompt_ids = encode_text(" what does the video describe?", append_bos=True, append_eos=True)
    src_tokens = prompt_ids.unsqueeze(0)
    # Number of non-padding tokens in each row
    src_lengths = torch.LongTensor([row.ne(pad_idx).long().sum() for row in src_tokens])

    net_input = {
        "src_tokens": src_tokens,
        "src_lengths": src_lengths,
        "patch_videos": patch_video,
        "patch_images": patch_image,
        "patch_masks": torch.tensor([True]),
        "patch_types": torch.tensor([1]),
    }
    return {"id": np.array(['42']), "net_input": net_input}
 
# Function to turn FP32 to FP16
def apply_half(t):
    """Return ``t`` cast to ``torch.half`` if it is float32; otherwise return ``t`` itself."""
    needs_cast = t.dtype is torch.float32
    return t.to(dtype=torch.half) if needs_cast else t

### Inference

In [157]:
# NOTE(review): save_dir is not used by any of the cells shown here
save_dir = '/home/mshukor/ofa_adastra'




# Example videos. The trailing comment on each line is presumably the caption obtained
# for that video (the last one matches the caption printed further down) -- verify.
# NOTE: this first assignment is overwritten by the later video_path assignment in this cell.
video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7019.mp4' # a man is sitting in a chair and talking
# video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7038.mp4' # a person is cooking something in a pan
# video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7021.mp4' # a group of people are playing baseball
# video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7068.mp4' # a man and a woman are talking to each other
# video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7017.mp4' # a person is playing a video game
# video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7014.mp4' # a girl is singing on the voice



# video_path = '/data/mshukor/data/video/msrvtt/examples/video1065.mp4'

# limitations (presumably an example where the caption is inaccurate or incomplete -- TODO confirm)
# This is the assignment in effect when construct_sample is called below.
video_path = '/data/mshukor/data/video/msrvtt/examples/test/video7055.mp4' # a man is driving a car


# Build the model input, then optionally move it to the GPU and cast float32 tensors to fp16
sample = construct_sample(video_path)
sample = utils.move_to_cuda(sample) if use_cuda else sample
sample = utils.apply_to_sample(apply_half, sample) if use_fp16 else sample

torch.Size([1, 3, 16, 384, 384])


In [158]:
from utils.eval_utils import eval_caption

# Run caption generation on the prepared sample without tracking gradients
with torch.no_grad():
 result, scores = eval_caption(task, generator, models, sample)

tensor([1], device='cuda:0')
torch.Size([1, 2048, 1, 12, 12])


In [159]:
# Caption stored in the first result entry
caption = result[0]['caption']
print(caption)

# Embed the input video in the cell output so it can be compared with the caption
from IPython.display import Video
Video(video_path, embed=True)



a man is driving a car
