"""Create simple variations of an image and score them with CLIP.

CLIP is a contrastive image/text *encoder*: ``transformers.CLIPModel`` exposes
``get_image_features`` / ``get_text_features`` but has no ``encode_image`` or
``generate_images`` method, so it cannot synthesise images by itself.  This
script therefore produces the variations with torchvision augmentations and
uses CLIP only to report how close each variation stays to the source image.
Swap ``build_augmentation`` for a real generative model if true synthesis is
required.
"""

from pathlib import Path

import torch
from PIL import Image
from torchvision import transforms
from transformers import CLIPModel, CLIPProcessor

MODEL_NAME = "openai/clip-vit-base-patch32"
IMAGE_PATH = "path/to/image.jpg"
OUTPUT_DIR = Path("output")
NUM_IMAGES = 5  # number of different variations to generate
IMAGE_SIZE = (224, 224)


def build_augmentation(size=IMAGE_SIZE):
    """Return a random PIL-to-PIL transform that yields a mild variation."""
    return transforms.Compose(
        [
            transforms.RandomResizedCrop(size, scale=(0.7, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(
                brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02
            ),
        ]
    )


def embed_images(model, processor, images, device):
    """Return L2-normalised CLIP embeddings, one row per PIL image.

    Args:
        model: A loaded ``CLIPModel`` already moved to ``device``.
        processor: The matching ``CLIPProcessor``.
        images: A list of PIL images.
        device: Torch device string the model lives on.
    """
    # The processor applies the resize/crop/normalisation CLIP was trained
    # with; a bare ToTensor() would feed the model un-normalised pixels.
    inputs = processor(images=images, return_tensors="pt").to(device)
    with torch.no_grad():
        features = model.get_image_features(**inputs)
    # NOTE(review): some transformers releases may return an output object
    # instead of a bare tensor here; fall back to its pooled embedding.
    features = getattr(features, "pooler_output", features)
    return features / features.norm(dim=-1, keepdim=True)


def main():
    """Load the source image, write the variations, print their CLIP scores."""
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # load the CLIP model and its processor
    model = CLIPModel.from_pretrained(MODEL_NAME).to(device).eval()
    processor = CLIPProcessor.from_pretrained(MODEL_NAME)

    # load the image; convert to RGB so grayscale/RGBA/palette files work,
    # and close the file handle as soon as the pixels are in memory
    with Image.open(IMAGE_PATH) as handle:
        image = handle.convert("RGB")

    # resize the image
    image = transforms.Resize(IMAGE_SIZE)(image)

    # generate the variations
    augment = build_augmentation()
    variations = [augment(image) for _ in range(NUM_IMAGES)]

    # cosine similarity of every variation to the source image
    source_embedding = embed_images(model, processor, [image], device)
    variation_embeddings = embed_images(model, processor, variations, device)
    similarities = (variation_embeddings @ source_embedding.T).squeeze(1).tolist()

    # save the generated images
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for i, (variation, similarity) in enumerate(zip(variations, similarities)):
        target = OUTPUT_DIR / f"image_variation_{i}.jpg"
        variation.save(target)
        print(f"{target}: CLIP similarity to source = {similarity:.3f}")


if __name__ == "__main__":
    main()