"""Export L2-normalized CLIP text embeddings for a list of captions to a .npy file."""
import argparse
import json

import numpy as np
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='./pretrained_models/clip-vit-base-patch32-projection')
    parser.add_argument(
        '--text',
        type=str,
        default='data/captions/coco_class_captions.json')
    parser.add_argument('--out', type=str, default='output.npy')
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = CLIPTextModelWithProjection.from_pretrained(args.model)

    # Each entry in the captions JSON is a list; its first element is the caption text.
    with open(args.text) as f:
        data = json.load(f)
    texts = [x[0] for x in data]

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    # Tokenize all captions as one padded batch and run the text encoder
    # without building an autograd graph.
    inputs = tokenizer(text=texts, return_tensors='pt', padding=True)
    inputs = inputs.to(device)
    with torch.no_grad():
        text_outputs = model(**inputs)

    # L2-normalize the projected embeddings and flatten to (num_captions, embed_dim).
    txt_feats = text_outputs.text_embeds
    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])
    np.save(args.out, txt_feats.cpu().numpy())
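
# Minimal sketch of how the exported file might be consumed downstream.
# This is illustrative only; the variable names below are assumptions,
# not part of the original script:
#
#   embeds = np.load('output.npy')   # shape: (num_captions, embed_dim)
#   sims = embeds @ embeds.T         # cosine similarities, since rows are L2-normalized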