"""Generate a Stable Diffusion image variation from a local image file."""

import os  # NOTE(review): no longer used after removing the CUDA debug flag; kept in case other chunks rely on it

import requests  # NOTE(review): unused in this chunk; kept — file may have more code elsewhere
import torch
from PIL import Image
from torchvision import transforms
from transformers import (  # NOTE(review): none of these are used in this chunk; kept for safety
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    BlipProcessor,
    BlipForConditionalGeneration,
)
from diffusers import (
    DiffusionPipeline,  # NOTE(review): unused after removing the dead duplicate load below
    StableDiffusionPipeline,  # NOTE(review): unused in this chunk
    StableDiffusionImageVariationPipeline,
)


def generate_image_caption(image_path, output_path="img1.jpg"):
    """Create an image *variation* of ``image_path`` and save it to ``output_path``.

    NOTE(review): despite its name, this function does NOT produce a text
    caption — it runs the lambdalabs image-variation diffusion model
    (image-to-image). The public name is kept so existing callers keep
    working; consider renaming at the call sites.

    Args:
        image_path: Path to the source image (any PIL-readable format).
        output_path: Destination file for the generated variation.
            Defaults to ``"img1.jpg"``, matching the original behavior.
    """
    # Everything runs on CPU; the original also set CUDA_LAUNCH_BLOCKING,
    # which is a CUDA-only debug flag and a no-op here — removed.
    device = torch.device("cpu")

    # Load the image-variation pipeline once. The original additionally
    # loaded a second, never-used DiffusionPipeline of the same model,
    # doubling a multi-gigabyte download — that dead load is removed.
    sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
        "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
    )
    sd_pipe = sd_pipe.to(device)

    # Preprocessing expected by the model: 224x224 bicubic resize with
    # antialias=False (per the model card), then scale to [-1, 1].
    img_transforms = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize(
                (224, 224),
                interpolation=transforms.InterpolationMode.BICUBIC,
                antialias=False,
            ),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )

    # Force RGB so grayscale/RGBA/palette inputs don't break the
    # 3-channel Normalize above (the original crashed on such files).
    with Image.open(image_path) as img:
        rgb = img.convert("RGB")
        img_tensor = img_transforms(rgb).to(device).unsqueeze(0)

    out = sd_pipe(img_tensor, guidance_scale=3)
    out["images"][0].save(output_path)


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers a model
    # download and a generation run. Raw string keeps the Windows path
    # byte-identical while avoiding backslash-escape surprises.
    generate_image_caption(r"C:\Master\First.jpg")