Doron Adler
committed
Commit • 49c27fc
Parent(s): b32a65a
sd2-cartoon-blip
- README.md +84 -0
- feature_extractor/preprocessor_config.json +28 -0
- model_index.json +29 -0
- scheduler/scheduler_config.json +13 -0
- sd2-cartoon-blip-example.py +41 -0
- sd2-cartoon-blip.ckpt +3 -0
- sd2-cartoon-blip.yaml +67 -0
- text_encoder/config.json +25 -0
- text_encoder/pytorch_model.bin +3 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +24 -0
- tokenizer/tokenizer_config.json +34 -0
- tokenizer/vocab.json +0 -0
- unet/config.json +46 -0
- unet/diffusion_pytorch_model.bin +3 -0
- vae/config.json +30 -0
- vae/diffusion_pytorch_model.bin +3 -0
README.md
ADDED
@@ -0,0 +1,84 @@
---
license: creativeml-openrail-m
language:
- en
thumbnail: "https://huggingface.co/Norod78/sd2-cartoon-blip/raw/main/example/sd2-cartoon-blip-sample_tile-0.jpg"
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
datasets:
- Norod78/cartoon-blip-captions
inference: true
---

# Cartoon diffusion v2.0

*Stable Diffusion v2.0 fine-tuned on images from various cartoon shows.*

If you want more details on how to generate your own BLIP-captioned dataset, see this [colab](https://colab.research.google.com/gist/Norod/ee6ee3c4bf11c2d2be531d728ec30824/buildimagedatasetwithblipcaptionsanduploadtohf.ipynb).

Training was done using a slightly modified version of Hugging Face's text-to-image training [example script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py).

## About

Type in a text prompt to generate cartoony images.

## AUTOMATIC1111 webui checkpoint

The [main](https://huggingface.co/Norod78/sd2-cartoon-blip/tree/main) folder contains a .ckpt and a .yaml file; put both in the "stable-diffusion-webui/models/Stable-diffusion" folder of [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to generate images.
## Sample code

```py
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
import torch

# this will substitute the default PNDM scheduler for K-LMS
lms = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear"
)

guidance_scale = 8.5
steps = 50

cartoon_model_path = "Norod78/sd2-cartoon-blip"
cartoon_pipe = StableDiffusionPipeline.from_pretrained(cartoon_model_path, scheduler=lms, torch_dtype=torch.float16)
cartoon_pipe.to("cuda")

def generate(prompt, file_prefix, samples, seed=42):
    torch.manual_seed(seed)
    prompt += ", Very detailed, clean, high quality, sharp image"
    cartoon_images = cartoon_pipe([prompt] * samples, num_inference_steps=steps, guidance_scale=guidance_scale)["images"]
    for idx, image in enumerate(cartoon_images):
        image.save(f"{file_prefix}-{idx}-{seed}-sd2-cartoon-blip.jpg")

generate("An oil on canvas portrait of Snoop Dogg, Mark Ryden", "01_SnoopDog", 2, 777)
generate("A flemish baroque painting of Kermit from the muppet show", "02_KermitFlemishBaroque", 2, 42)
generate("Gal Gadot in Avatar", "03_GalGadotAvatar", 2, 777)
generate("Ninja turtles, Naoto Hattori", "04_TMNT", 2, 312)
generate("An anime town", "05_AnimeTown", 2, 777)
generate("Family guy taking selfies at the beach", "06_FamilyGuy", 2, 555)
generate("Pikachu as Rick and morty, Eric Wallis", "07_PikachuRnM", 2, 777)
generate("Pikachu as Spongebob, Eric Wallis", "08_PikachuSpongeBob", 2, 42)
generate("An oil painting of Miss. Piggy from the muppets as the Mona Lisa", "09_MsPiggyMonaLisa", 2, 42)
generate("Rick Sanchez in star wars, Dave Dorman", "10_RickStarWars", 2, 42)
generate("An paiting of Southpark with rainbow", "11_Southpark", 2, 777)
generate("An oil painting of Phineas and Pherb hamering on a new machine, Eric Wallis", "12_PhineasPherb", 2, 777)
generate("Bender, Saturno Butto", "13_Bender", 2, 777)
generate("A psychedelic image of Bojack Horseman", "14_Bojack", 2, 777)
generate("A movie poster for Gravity Falls Cthulhu stories", "15_GravityFalls", 2, 777)
generate("A vibrant oil painting portrait of She-Ra", "16_Shira", 2, 512)
```

![Images generated by this sample code](https://huggingface.co/Norod78/sd2-cartoon-blip/resolve/main/example/sd2-cartoon-blip-sample_tile-0.jpg)
![Images generated by this sample code](https://huggingface.co/Norod78/sd2-cartoon-blip/resolve/main/example/sd2-cartoon-blip-sample_tile-1.jpg)

## Dataset and Training

Fine-tuned from [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) for 25,000 iterations on [BLIP-captioned cartoon images](https://huggingface.co/datasets/Norod78/cartoon-blip-captions), using a single A5000 GPU in my home desktop computer.
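
The dataset is public, so you can inspect the exact images and captions used. A minimal sketch (the `image`/`text` column names are an assumption about this dataset's schema):

```py
from datasets import load_dataset

# Peek at the BLIP-captioned cartoon dataset used for fine-tuning
ds = load_dataset("Norod78/cartoon-blip-captions", split="train")
print(ds)     # columns and row count
print(ds[0])  # one record, expected to hold an image and its caption
```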

Trained by [@Norod78](https://twitter.com/Norod78)
feature_extractor/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
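This is the stock CLIP image preprocessing (resize so the shortest edge is 224, center-crop to 224x224, rescale by 1/255, then normalize). A small sketch of loading and applying it from this repo, assuming `transformers`' subfolder loading:

```py
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained(
    "Norod78/sd2-cartoon-blip", subfolder="feature_extractor"
)
# "input.jpg" is a placeholder path for any local image
pixel_values = processor(Image.open("input.jpg"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```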
model_index.json
ADDED
@@ -0,0 +1,29 @@
{
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.9.0",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "requires_safety_checker": false,
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}
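model_index.json is the pipeline manifest: each key names a component, and the pair says which library and class to load it from. A quick sketch showing that `from_pretrained` assembles the pipeline accordingly:

```py
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("Norod78/sd2-cartoon-blip")
# Component attributes mirror the keys in model_index.json
print(type(pipe.scheduler).__name__)  # PNDMScheduler (the repo default)
print(type(pipe.unet).__name__)       # UNet2DConditionModel
print(type(pipe.vae).__name__)        # AutoencoderKL
```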
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,13 @@
{
  "_class_name": "PNDMScheduler",
  "_diffusers_version": "0.9.0",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "num_train_timesteps": 1000,
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "trained_betas": null
}
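The sample code above builds its K-LMS scheduler by hand with these same beta values; a sketch of reading them from this config instead, so the numbers live in one place (diffusers schedulers can generally be constructed from a compatible scheduler's saved config):

```py
from diffusers import LMSDiscreteScheduler

# K-LMS scheduler initialized from the repo's scheduler config
lms = LMSDiscreteScheduler.from_pretrained(
    "Norod78/sd2-cartoon-blip", subfolder="scheduler"
)
```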
sd2-cartoon-blip-example.py
ADDED
@@ -0,0 +1,41 @@
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
import torch

# this will substitute the default PNDM scheduler for K-LMS
lms = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear"
)

guidance_scale = 8.5
steps = 50

cartoon_model_path = "Norod78/sd2-cartoon-blip"
cartoon_pipe = StableDiffusionPipeline.from_pretrained(cartoon_model_path, scheduler=lms, torch_dtype=torch.float16)
cartoon_pipe.to("cuda")

def generate(prompt, file_prefix, samples, seed=42):
    torch.manual_seed(seed)
    prompt += ", Very detailed, clean, high quality, sharp image"
    cartoon_images = cartoon_pipe([prompt] * samples, num_inference_steps=steps, guidance_scale=guidance_scale)["images"]
    for idx, image in enumerate(cartoon_images):
        image.save(f"{file_prefix}-{idx}-{seed}-sd2-cartoon-blip.jpg")

generate("An oil on canvas portrait of Snoop Dogg, Mark Ryden", "01_SnoopDog", 2, 777)
generate("A flemish baroque painting of Kermit from the muppet show", "02_KermitFlemishBaroque", 2, 42)
generate("Gal Gadot in Avatar", "03_GalGadotAvatar", 2, 777)
generate("Ninja turtles, Naoto Hattori", "04_TMNT", 2, 312)
generate("An anime town", "05_AnimeTown", 2, 777)
generate("Family guy taking selfies at the beach", "06_FamilyGuy", 2, 555)
generate("Pikachu as Rick and morty, Eric Wallis", "07_PikachuRnM", 2, 777)
generate("Pikachu as Spongebob, Eric Wallis", "08_PikachuSpongeBob", 2, 42)
generate("An oil painting of Miss. Piggy from the muppets as the Mona Lisa", "09_MsPiggyMonaLisa", 2, 42)
generate("Rick Sanchez in star wars, Dave Dorman", "10_RickStarWars", 2, 42)
generate("An paiting of Southpark with rainbow", "11_Southpark", 2, 777)
generate("An oil painting of Phineas and Pherb hamering on a new machine, Eric Wallis", "12_PhineasPherb", 2, 777)
generate("Bender, Saturno Butto", "13_Bender", 2, 777)
generate("A psychedelic image of Bojack Horseman", "14_Bojack", 2, 777)
generate("A movie poster for Gravity Falls Cthulhu stories", "15_GravityFalls", 2, 777)
generate("A vibrant oil painting portrait of She-Ra", "16_Shira", 2, 512)
sd2-cartoon-blip.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2cdc8bccd11094dfdeb53abc40ec0f618257fcdacb92245b77455ea03089a78a
size 2580353150
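The checkpoint is stored with Git LFS, so this pointer only records the blob's sha256 and size. A small sketch to check a downloaded copy against the pointer (the local path is whatever you saved the file as):

```py
import hashlib

# Stream the ~2.6 GB checkpoint through sha256 in 1 MiB chunks
h = hashlib.sha256()
with open("sd2-cartoon-blip.ckpt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == "2cdc8bccd11094dfdeb53abc40ec0f618257fcdacb92245b77455ea03089a78a")
```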
sd2-cartoon-blip.yaml
ADDED
@@ -0,0 +1,67 @@
model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
text_encoder/config.json
ADDED
@@ -0,0 +1,25 @@
{
  "_name_or_path": "sd2-cartoon-blip-20000itr",
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 23,
  "pad_token_id": 1,
  "projection_dim": 512,
  "torch_dtype": "float16",
  "transformers_version": "4.25.0.dev0",
  "vocab_size": 49408
}
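This is the SD2 OpenCLIP text encoder: 1024-dim hidden states (matching the UNet's `cross_attention_dim`) and 23 hidden layers, consistent with the YAML's "penultimate" layer setting. A sketch of loading it standalone:

```py
import torch
from transformers import CLIPTextModel

text_encoder = CLIPTextModel.from_pretrained(
    "Norod78/sd2-cartoon-blip", subfolder="text_encoder", torch_dtype=torch.float16
)
print(text_encoder.config.hidden_size)  # 1024
```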
text_encoder/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f988248177fc727cf066267b3ccb62e7a74b7e3c5b3efa8a701d563e0f0ea037
size 680901463
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "!",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,34 @@
{
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "do_lower_case": true,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "model_max_length": 77,
  "name_or_path": "sd2-cartoon-blip-20000itr",
  "pad_token": "<|endoftext|>",
  "special_tokens_map_file": "./special_tokens_map.json",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
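A sketch of tokenizing a prompt the way the pipeline does, padding to the 77-token `model_max_length` above:

```py
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("Norod78/sd2-cartoon-blip", subfolder="tokenizer")
ids = tokenizer(
    "An anime town",
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
).input_ids
print(ids.shape)  # torch.Size([1, 77])
```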
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
unet/config.json
ADDED
@@ -0,0 +1,46 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.9.0",
  "_name_or_path": "sd2-cartoon-blip-20000itr",
  "act_fn": "silu",
  "attention_head_dim": [
    5,
    10,
    20,
    20
  ],
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "center_input_sample": false,
  "cross_attention_dim": 1024,
  "down_block_types": [
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "DownBlock2D"
  ],
  "downsample_padding": 1,
  "dual_cross_attention": false,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": 2,
  "mid_block_scale_factor": 1,
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "num_class_embeds": null,
  "only_cross_attention": false,
  "out_channels": 4,
  "sample_size": 64,
  "up_block_types": [
    "UpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D"
  ],
  "use_linear_projection": true
}
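With `sample_size` 64 and the VAE's 8x spatial factor, this UNet targets 512x512 images. A minimal sketch of loading it on its own, e.g. for a custom sampling loop:

```py
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "Norod78/sd2-cartoon-blip", subfolder="unet", torch_dtype=torch.float16
)
print(unet.config.sample_size)  # 64 (latent resolution; 512 pixels after decoding)
```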
unet/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3e94b81d01af888f60bccd5a5270a6057120f09dcc33c6fd13e8a152a872ce7
size 3463923045
vae/config.json
ADDED
@@ -0,0 +1,30 @@
{
  "_class_name": "AutoencoderKL",
  "_diffusers_version": "0.9.0",
  "_name_or_path": "sd2-cartoon-blip-20000itr",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "in_channels": 3,
  "latent_channels": 4,
  "layers_per_block": 2,
  "norm_num_groups": 32,
  "out_channels": 3,
  "sample_size": 512,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ]
}
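A sketch of decoding latents with this VAE directly, dividing by the 0.18215 `scale_factor` from the YAML above (random latents here, just to show the shapes):

```py
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("Norod78/sd2-cartoon-blip", subfolder="vae")
latents = torch.randn(1, 4, 64, 64)  # shape the UNet denoises
with torch.no_grad():
    image = vae.decode(latents / 0.18215).sample
print(image.shape)  # torch.Size([1, 3, 512, 512])
```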
vae/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb128b1f37e0c381c440128b217d29613b3e08b9e4ea7f20466424145ba538b0
size 167402961