"""Export L2-normalized CLIP text embeddings for a list of captions to a .npy file."""
import argparse
import json

import numpy as np
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='./pretrained_models/clip-vit-base-patch32-projection')
    parser.add_argument(
        '--text',
        type=str,
        default='data/captions/coco_class_captions.json')
    parser.add_argument('--out', type=str, default='output.npy')
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = CLIPTextModelWithProjection.from_pretrained(args.model)

    # Each entry in the captions JSON is a list; its first element is the caption text.
    with open(args.text) as f:
        data = json.load(f)
    texts = [x[0] for x in data]

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    # Tokenize all captions as one padded batch and run the text encoder
    # without building an autograd graph.
    inputs = tokenizer(text=texts, return_tensors='pt', padding=True)
    inputs = inputs.to(device)
    with torch.no_grad():
        text_outputs = model(**inputs)

    # L2-normalize the projected embeddings and flatten to (num_captions, embed_dim).
    txt_feats = text_outputs.text_embeds
    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])
    np.save(args.out, txt_feats.cpu().numpy())
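
# Minimal sketch of how the exported file might be consumed downstream.
# This is illustrative only; the variable names below are assumptions,
# not part of the original script:
#
#   embeds = np.load('output.npy')   # shape: (num_captions, embed_dim)
#   sims = embeds @ embeds.T         # cosine similarities, since rows are L2-normalized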