import os

import clip
import numpy as np
import torch
from mmaction.datasets.transforms import (CenterCrop, DecordDecode,
                                           DecordInit, FormatShape, Resize)
from torchvision import transforms


def extract_clip_feature_single_video_fps(
        video_path: str,
        clip_ckpt_path: str = 'ViT-L-14.pt',
        device: str = 'cuda'):

    class SampleFrames1FPS(object):
        '''Sample frames at 1 fps.

        Required Keys:
            - total_frames
            - start_index
            - avg_fps

        Added Keys:
            - frame_interval
            - frame_inds
            - num_clips
        '''

        def transform(self, video_info: dict) -> dict:
            # np.arange(start, stop, step, dtype): one frame index per second
            video_info['frame_inds'] = np.arange(
                video_info['start_index'],
                video_info['total_frames'],
                video_info['avg_fps'],
                dtype=int)
            video_info['frame_interval'] = 1
            video_info['num_clips'] = len(video_info['frame_inds'])
            return video_info

    class SampleFrames5FPS(object):
        '''Sample frames at 5 fps (unused alternative to SampleFrames1FPS).

        Required Keys:
            - total_frames
            - start_index
            - avg_fps

        Added Keys:
            - frame_interval
            - frame_inds
            - num_clips
        '''

        def transform(self, video_info: dict) -> dict:
            video_info['frame_inds'] = np.arange(
                video_info['start_index'],
                video_info['total_frames'],
                video_info['avg_fps'] // 5,
                dtype=int)
            video_info['frame_interval'] = 1
            video_info['num_clips'] = len(video_info['frame_inds'])
            return video_info

    video_info = {'filename': video_path, 'start_index': 0}
    video_processors = [
        DecordInit(),
        SampleFrames1FPS(),
        DecordDecode(),
        Resize(scale=(-1, 224)),
        CenterCrop(crop_size=224),
        FormatShape(input_format='NCHW'),
    ]

    # decode the video into a stack of sampled frames
    for processor in video_processors:
        video_info = processor.transform(video_info)
    imgs = torch.from_numpy(video_info['imgs'])  # uint8 tensor, (num_frames, C, H, W)

    # uint8 -> float32, then normalize with the CLIP image statistics
    imgs_transforms = transforms.Compose([
        transforms.ConvertImageDtype(dtype=torch.float32),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
            inplace=False)
    ])
    imgs = imgs_transforms(imgs).to(device)

    # load the CLIP model (note: reloaded on every call)
    clip_model, _ = clip.load(clip_ckpt_path, device)

    # encode the frames into per-frame features
    with torch.no_grad():
        video_feat = clip_model.encode_image(imgs)
    return video_feat, video_info


if __name__ == '__main__':
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    video_names = [
        'cook.mp4', 'latex.mp4', 'nba.mp4', 'temple_of_heaven.mp4',
        'south_pole.mp4', 'tv_series.mp4', 'formula_one.mp4', 'make-up.mp4',
        'police.mp4'
    ]
    video_dir = '/mnt/petrelfs/wangyiqin/vid_cap/examples/videos/'
    out_dir = 'clip_features/20/'
    os.makedirs(out_dir, exist_ok=True)  # make sure the output directory exists

    for video_name in video_names:
        # the function returns (features, video_info); only the features are saved
        video_feat, _ = extract_clip_feature_single_video_fps(
            video_path=video_dir + video_name,
            clip_ckpt_path='ViT-L-14.pt',
            device=device)
        # move to CPU and convert to a NumPy array for saving
        video_feat = video_feat.cpu().numpy()
        np.save(out_dir + video_name[:-4] + '.npy', video_feat)
        print(video_feat.shape)
        print(video_name + ' DONE')
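

# The pipeline above hard-codes SampleFrames1FPS even though SampleFrames5FPS is
# also defined. Below is a minimal sketch of how the sampler could instead be
# built for an arbitrary rate; `build_frame_sampler` and its `target_fps`
# argument are illustrative assumptions, not part of the original script.
def build_frame_sampler(target_fps: int = 1):
    '''Return a transform that samples frames at roughly `target_fps`.'''

    class SampleFramesAtFPS(object):

        def transform(self, video_info: dict) -> dict:
            # step (in source frames) between two sampled frames, at least 1
            step = max(int(video_info['avg_fps'] // target_fps), 1)
            video_info['frame_inds'] = np.arange(
                video_info['start_index'],
                video_info['total_frames'],
                step,
                dtype=int)
            video_info['frame_interval'] = 1
            video_info['num_clips'] = len(video_info['frame_inds'])
            return video_info

    return SampleFramesAtFPS()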