from base64 import b64encode import numpy import torch from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel from huggingface_hub import notebook_login # For video display: from IPython.display import HTML from matplotlib import pyplot as plt from pathlib import Path from PIL import Image from torch import autocast from torchvision import transforms as tfms from tqdm.auto import tqdm from transformers import CLIPTextModel, CLIPTokenizer, logging import os torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" #torch_device = "cpu" # Load the autoencoder model which will be used to decode the latents into image space. vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae") # Load the tokenizer and text encoder to tokenize and encode the text. tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") # The UNet model for generating the latents. unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet") # The noise scheduler scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) # To the GPU we go! vae = vae.to(torch_device) text_encoder = text_encoder.to(torch_device) unet = unet.to(torch_device); token_emb_layer = text_encoder.text_model.embeddings.token_embedding pos_emb_layer = text_encoder.text_model.embeddings.position_embedding position_ids = text_encoder.text_model.embeddings.position_ids[:, :77] position_embeddings = pos_emb_layer(position_ids)