"""Caption an image with a CoCa-BioGPT model via open_clip.

Loads the ``coca_biogpt_vitb16`` checkpoint, preprocesses a single image,
optionally repeats it into a batch, generates a caption with beam search,
and prints the decoded text for every batch element.
"""

import open_clip
import torch
from PIL import Image


def main(image_path: str = "example.png", batch_size: int = 1) -> None:
    """Run caption generation for *image_path*.

    Args:
        image_path: Path to the input image (converted to RGB).
        batch_size: Number of copies of the image to caption at once.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): `pretrained` is a relative checkpoint path — the weights
    # file must exist in the working directory.
    model, _, transform = open_clip.create_model_and_transforms(
        model_name="coca_biogpt_vitb16",
        pretrained="coca_biogpt_vitb16.pt",
    )
    model.to(device)
    model.eval()

    image = Image.open(image_path).convert("RGB")
    batch = transform(image).unsqueeze(0).to(device)
    # Duplicate the single preprocessed image into a batch of `batch_size`.
    batch = batch.repeat(batch_size, 1, 1, 1)
    print(batch.shape)

    tokenizer = open_clip.get_tokenizer("coca_biogpt_vitb16")
    # Guard the debug print the same way as the decode loop below:
    # only HF-backed tokenizers expose a nested `.tokenizer` attribute.
    if hasattr(tokenizer, "tokenizer"):
        print(tokenizer.tokenizer)

    with torch.no_grad():
        generated = model.generate(
            batch,
            pad_token_id=1,
            eos_token_id=2,
            sot_token_id=0,
            max_seq_len=256,
            seq_len=60,
            generation_type="beam_search",
        )
    print(generated)

    for i in range(batch_size):
        # HF-backed tokenizers decode via the nested tokenizer; otherwise
        # fall back to open_clip's module-level decoder.
        if hasattr(tokenizer, "tokenizer"):
            print(tokenizer.tokenizer.decode(generated[i]))
        else:
            print(open_clip.decode(generated[i]))


if __name__ == "__main__":
    main()