#@title Imports and setup
import os

import torch
import gradio as gr
from PIL import Image

from diffusers import StableDiffusionImg2ImgPipeline
from huggingface_hub import hf_hub_download
from transformers import CLIPTextModel, CLIPTokenizer

MY_SECRET_TOKEN = os.environ.get('HF_TOKEN_SD')
YOUR_TOKEN = MY_SECRET_TOKEN
device = "cpu"

pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4" #@param {type:"string"}

#@title Load your concept here
#@markdown Enter the `repo_id` for a concept you like (you can find pre-learned concepts in the public [SD Concepts Library](https://huggingface.co/sd-concepts-library))
repo_id_embeds = "sd-concepts-library/mikako-methodi2i" #@param {type:"string"}


def image_grid(imgs, rows, cols):
    """Paste equally sized PIL images into a rows x cols contact sheet."""
    assert len(imgs) == rows * cols
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid


#@title Set up the Tokenizer and the Text Encoder
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="tokenizer",
    use_auth_token=YOUR_TOKEN,
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="text_encoder",
    use_auth_token=YOUR_TOKEN,
)


#@title Load the newly learned embeddings into CLIP
def load_learned_embed_in_clip(learned_embeds_path, text_encoder, tokenizer, token=None):
    loaded_learned_embeds = torch.load(learned_embeds_path, map_location="cpu")

    # separate token and the embeds
    trained_token = list(loaded_learned_embeds.keys())[0]
    embeds = loaded_learned_embeds[trained_token]

    # cast to dtype of text_encoder (.to() returns a new tensor, so reassign)
    dtype = text_encoder.get_input_embeddings().weight.dtype
    embeds = embeds.to(dtype)

    # add the token in tokenizer
    token = token if token is not None else trained_token
    num_added_tokens = tokenizer.add_tokens(token)
    if num_added_tokens == 0:
        raise ValueError(
            f"The tokenizer already contains the token {token}. "
            "Please pass a different `token` that is not already in the tokenizer."
        )

    # resize the token embeddings and assign the embeds to the new token id
    text_encoder.resize_token_embeddings(len(tokenizer))
    token_id = tokenizer.convert_tokens_to_ids(token)
    text_encoder.get_input_embeddings().weight.data[token_id] = embeds
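# The optional `token` argument re-registers the embedding under a custom
# placeholder instead of the one stored in learned_embeds.bin; the token name
# below is a hypothetical example, not something shipped with the concept repo:
#
#   load_learned_embed_in_clip(learned_embeds_path, text_encoder, tokenizer, token="<my-concept>")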
# download the trained embedding (learned_embeds.bin) from the concept repo
learned_embeds_path = hf_hub_download(repo_id=repo_id_embeds, filename="learned_embeds.bin")
load_learned_embed_in_clip(learned_embeds_path, text_encoder, tokenizer)


def crop_center(pil_img, crop_width, crop_height):
    """Crop the centered crop_width x crop_height region out of a PIL image."""
    img_width, img_height = pil_img.size
    return pil_img.crop((
        (img_width - crop_width) // 2,
        (img_height - crop_height) // 2,
        (img_width + crop_width) // 2,
        (img_height + crop_height) // 2,
    ))


#@title Run the Stable Diffusion pipeline
#@markdown Don't forget to use the placeholder token in your prompt
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=torch.float32,  # half precision is not usable for CPU inference
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    use_auth_token=YOUR_TOKEN,
).to(device)
pipe.enable_attention_slicing()


def inmm(init_image, prompt):
    # center-crop non-square inputs to a square, then resize to the model's 512x512
    w, h = init_image.size
    if w != h:
        side = min(w, h)
        init_image = crop_center(init_image, side, side)
    init_image = init_image.resize((512, 512))
    images = pipe([prompt], num_inference_steps=50, guidance_scale=7, init_image=init_image).images
    return images[0]


demo = gr.Interface(
    inmm,
    inputs=[
        gr.Image(shape=(512, 512), type="pil"),
        gr.Textbox(lines=2, placeholder="Describe the picture you want",
                   value="a heartwarming and calming landscape drawing in style"),
    ],
    outputs="image",
    examples=[
        ["a_img.png", "A heartwarming Canadian wheat field scene in style, some houses, silos, and a lake in the distance"],
        ["c_img.png", "A heartwarming landscape on the lake, scenery mirrored on the lake, in style"],
    ],
)
demo.launch()
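# NOTE: this script assumes the pre-1.0 diffusers img2img API (`init_image=`,
# `use_auth_token=`, output accessed via `.images`) and Gradio 3.x
# (`gr.Image(shape=...)`); on current releases the input image is passed as
# `image=` and `shape` is no longer a Gradio parameter. The example files
# a_img.png and c_img.png are expected to sit next to this script.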