Upload folder using huggingface_hub
- .gitattributes +3 -0
- CITATION.cff +24 -0
- README.md +577 -0
- assets/4090_bs_1.png +0 -0
- assets/4090_bs_8.png +0 -0
- assets/a100_bs_1.png +0 -0
- assets/a100_bs_8.png +0 -0
- assets/collage_full.png +3 -0
- assets/collage_small.png +3 -0
- assets/glowing_256_1.png +0 -0
- assets/glowing_256_2.png +0 -0
- assets/glowing_256_3.png +0 -0
- assets/glowing_512_1.png +0 -0
- assets/glowing_512_2.png +0 -0
- assets/glowing_512_3.png +0 -0
- assets/image2image_256.png +0 -0
- assets/image2image_256_orig.png +0 -0
- assets/image2image_512.png +0 -0
- assets/image2image_512_orig.png +0 -0
- assets/inpainting_256.png +0 -0
- assets/inpainting_256_mask.png +0 -0
- assets/inpainting_256_orig.png +0 -0
- assets/inpainting_512.png +0 -0
- assets/inpainting_512_mask.png +0 -0
- assets/inpainting_512_orig.jpeg +0 -0
- assets/minecraft1.png +0 -0
- assets/minecraft2.png +0 -0
- assets/minecraft3.png +0 -0
- assets/noun1.png +0 -0
- assets/noun2.png +0 -0
- assets/noun3.png +0 -0
- assets/text2image_256.png +0 -0
- assets/text2image_512.png +0 -0
- model_index.json +24 -0
- scheduler/scheduler_config.json +6 -0
- text_encoder/config.json +24 -0
- text_encoder/model.fp16.safetensors +3 -0
- text_encoder/model.safetensors +3 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +24 -0
- tokenizer/tokenizer_config.json +38 -0
- tokenizer/vocab.json +0 -0
- training/A mushroom in [V] style.png +0 -0
- training/A woman working on a laptop in [V] style.jpg +3 -0
- training/generate_images.py +119 -0
- training/training.py +916 -0
- transformer/config.json +26 -0
- transformer/diffusion_pytorch_model.fp16.safetensors +3 -0
- transformer/diffusion_pytorch_model.safetensors +3 -0
- vqvae/config.json +39 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/collage_full.png filter=lfs diff=lfs merge=lfs -text
+assets/collage_small.png filter=lfs diff=lfs merge=lfs -text
+training/A[[:space:]]woman[[:space:]]working[[:space:]]on[[:space:]]a[[:space:]]laptop[[:space:]]in[[:space:]]\[V\][[:space:]]style.jpg filter=lfs diff=lfs merge=lfs -text
CITATION.cff
ADDED
@@ -0,0 +1,24 @@
cff-version: 1.2.0
title: 'Amused: An open MUSE model'
message: >-
  If you use this software, please cite it using the
  metadata from this file.
type: software
authors:
  - given-names: Suraj
    family-names: Patil
  - given-names: William
    family-names: Berman
  - given-names: Patrick
    family-names: von Platen
repository-code: 'https://github.com/huggingface/amused'
keywords:
  - deep-learning
  - pytorch
  - image-generation
  - text2image
  - image2image
  - language-modeling
  - masked-language-modeling
license: Apache-2.0
version: 0.12.1
README.md
ADDED
@@ -0,0 +1,577 @@
# amused

![collage](./assets/collage_small.png)
<sup><sub>Images cherry-picked from the 512 and 256 models. Images are degraded to load faster. See ./assets/collage_full.png for the originals.</sub></sup>

[[Paper - TODO]]()

| Model | Params |
|-------|--------|
| [amused-256](https://huggingface.co/huggingface/amused-256) | 603M |
| [amused-512](https://huggingface.co/huggingface/amused-512) | 608M |

Amused is a lightweight text-to-image model based on the [muse](https://arxiv.org/pdf/2301.00704.pdf) architecture. Amused is particularly useful in applications that require a lightweight and fast model, such as generating many images quickly at once.

Amused is a VQVAE token-based transformer that can generate an image in fewer forward passes than many diffusion models. In contrast with muse, it uses the smaller CLIP text encoder instead of T5. Due to its small parameter count and few-forward-pass generation process, amused can generate many images quickly. This benefit is seen particularly at larger batch sizes.
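As a quick check of the parameter counts in the table above, the pipeline's components can be inspected directly. The following is a minimal sketch (not part of the original repo) that loads the 256 checkpoint and sums parameters per component; the component names follow this repo's model_index.json.

```python
import torch
from diffusers import AmusedPipeline

# Minimal sketch (not from the original repo): count parameters per pipeline component.
pipe = AmusedPipeline.from_pretrained(
    "huggingface/amused-256", variant="fp16", torch_dtype=torch.float16
)

def count_params(module):
    return sum(p.numel() for p in module.parameters())

for name in ["transformer", "vqvae", "text_encoder"]:
    module = getattr(pipe, name)
    print(f"{name}: {count_params(module) / 1e6:.1f}M parameters")
```
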
## 1. Usage

### Text to image

#### 256x256 model

```python
import torch
from diffusers import AmusedPipeline

pipe = AmusedPipeline.from_pretrained(
    "huggingface/amused-256", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "cowboy"
image = pipe(prompt, generator=torch.Generator('cuda').manual_seed(8)).images[0]
image.save('text2image_256.png')
```

![text2image_256](./assets/text2image_256.png)

#### 512x512 model

```python
import torch
from diffusers import AmusedPipeline

pipe = AmusedPipeline.from_pretrained(
    "huggingface/amused-512", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "summer in the mountains"
image = pipe(prompt, generator=torch.Generator('cuda').manual_seed(2)).images[0]
image.save('text2image_512.png')
```

![text2image_512](./assets/text2image_512.png)

### Image to image

#### 256x256 model

```python
import torch
from diffusers import AmusedImg2ImgPipeline
from diffusers.utils import load_image

pipe = AmusedImg2ImgPipeline.from_pretrained(
    "huggingface/amused-256", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "apple watercolor"
input_image = (
    load_image(
        "https://raw.githubusercontent.com/huggingface/amused/main/assets/image2image_256_orig.png"
    )
    .resize((256, 256))
    .convert("RGB")
)

image = pipe(prompt, input_image, strength=0.7, generator=torch.Generator('cuda').manual_seed(3)).images[0]
image.save('image2image_256.png')
```

![image2image_256_orig](./assets/image2image_256_orig.png) ![image2image_256](./assets/image2image_256.png)

#### 512x512 model

```python
import torch
from diffusers import AmusedImg2ImgPipeline
from diffusers.utils import load_image

pipe = AmusedImg2ImgPipeline.from_pretrained(
    "huggingface/amused-512", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "winter mountains"
input_image = (
    load_image(
        "https://raw.githubusercontent.com/huggingface/amused/main/assets/image2image_512_orig.png"
    )
    .resize((512, 512))
    .convert("RGB")
)

image = pipe(prompt, input_image, generator=torch.Generator('cuda').manual_seed(15)).images[0]
image.save('image2image_512.png')
```

![image2image_512_orig](./assets/image2image_512_orig.png) ![image2image_512](./assets/image2image_512.png)

### Inpainting

#### 256x256 model

```python
import torch
from diffusers import AmusedInpaintPipeline
from diffusers.utils import load_image
from PIL import Image

pipe = AmusedInpaintPipeline.from_pretrained(
    "huggingface/amused-256", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "a man with glasses"
input_image = (
    load_image(
        "https://raw.githubusercontent.com/huggingface/amused/main/assets/inpainting_256_orig.png"
    )
    .resize((256, 256))
    .convert("RGB")
)
mask = (
    load_image(
        "https://raw.githubusercontent.com/huggingface/amused/main/assets/inpainting_256_mask.png"
    )
    .resize((256, 256))
    .convert("L")
)

for seed in range(20):
    image = pipe(prompt, input_image, mask, generator=torch.Generator('cuda').manual_seed(seed)).images[0]
    image.save(f'inpainting_256_{seed}.png')
```

![inpainting_256_orig](./assets/inpainting_256_orig.png) ![inpainting_256_mask](./assets/inpainting_256_mask.png) ![inpainting_256](./assets/inpainting_256.png)

#### 512x512 model

```python
import torch
from diffusers import AmusedInpaintPipeline
from diffusers.utils import load_image

pipe = AmusedInpaintPipeline.from_pretrained(
    "huggingface/amused-512", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "fall mountains"
input_image = (
    load_image(
        "https://raw.githubusercontent.com/huggingface/amused/main/assets/inpainting_512_orig.jpeg"
    )
    .resize((512, 512))
    .convert("RGB")
)
mask = (
    load_image(
        "https://raw.githubusercontent.com/huggingface/amused/main/assets/inpainting_512_mask.png"
    )
    .resize((512, 512))
    .convert("L")
)
image = pipe(prompt, input_image, mask, generator=torch.Generator('cuda').manual_seed(0)).images[0]
image.save('inpainting_512.png')
```

![inpainting_512_orig](./assets/inpainting_512_orig.jpeg)
![inpainting_512_mask](./assets/inpainting_512_mask.png)
![inpainting_512](./assets/inpainting_512.png)

## 2. Performance

Amused inherits performance benefits from the original [muse](https://arxiv.org/pdf/2301.00704.pdf).

1. Parallel decoding: The model follows a denoising schedule that aims to unmask some percent of tokens at each denoising step. At each step, all masked tokens are predicted, and some number of tokens that the network is most confident about are unmasked. Because multiple tokens are predicted at once, we can generate a full 256x256 or 512x512 image in around 12 steps. In comparison, an autoregressive model must predict a single token at a time. Note that a 256x256 image with the 16x downsampled VAE that muse uses will have 256 tokens.

2. Fewer sampling steps: Compared to many diffusion models, muse requires fewer sampling steps.

Additionally, amused uses CLIP as its text encoder instead of the larger T5 used by muse, and it is smaller overall, with ~600M params compared to the largest 3B-param muse model. Note that, being smaller, amused produces comparably lower quality results.
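To make the token arithmetic above concrete, here is a small illustrative sketch (added for illustration, not code from the repo): a 16x-downsampling VQ-VAE maps a 256x256 image to a 16x16 grid of 256 tokens, and a cosine masking schedule (this repo's scheduler config lists `"masking_schedule": "cosine"`) unmasks those tokens over roughly 12 steps. The exact schedule used by the scheduler may differ in detail.

```python
import math

# Illustrative sketch (assumption, not repo code): token counts and a cosine unmasking schedule.
resolution = 256
vae_downsample = 16            # the VQ-VAE downsamples 16x
grid = resolution // vae_downsample
num_tokens = grid * grid       # 16 * 16 = 256 latent tokens per image
num_steps = 12                 # roughly how many steps amused uses per image

print(f"{grid}x{grid} grid -> {num_tokens} tokens")

for step in range(1, num_steps + 1):
    # Fraction of tokens still masked after this step under a cosine schedule.
    masked_fraction = math.cos(math.pi / 2 * step / num_steps)
    unmasked = num_tokens - round(num_tokens * masked_fraction)
    print(f"step {step:2d}: ~{unmasked} tokens unmasked so far")
```
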
![a100_bs_1](./assets/a100_bs_1.png)
![a100_bs_8](./assets/a100_bs_8.png)
![4090_bs_1](./assets/4090_bs_1.png)
![4090_bs_8](./assets/4090_bs_8.png)

### Muse performance knobs

|                  | Uncompiled Transformer + regular attention (ms) | Uncompiled Transformer + flash attention (ms) | Compiled Transformer (ms) | Speed Up |
|------------------|--------------------------------------------------|-----------------------------------------------|---------------------------|----------|
| 256 Batch Size 1 | 594.7                                            | 507.7                                         | 212.1                     | 58%      |
| 512 Batch Size 1 | 637                                              | 547                                           | 249.9                     | 54%      |
| 256 Batch Size 8 | 719                                              | 628.6                                         | 427.8                     | 32%      |
| 512 Batch Size 8 | 1000                                             | 917.7                                         | 703.6                     | 23%      |

Flash attention is enabled by default in the diffusers codebase through torch `F.scaled_dot_product_attention`.
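The timings above come from the authors' benchmarks; a rough, hedged way to measure comparable numbers on your own hardware is to time the pipeline around CUDA synchronization points. This is a sketch, not the original benchmark script.

```python
import time
import torch
from diffusers import AmusedPipeline

# Rough timing sketch (not the original benchmark script).
pipe = AmusedPipeline.from_pretrained(
    "huggingface/amused-256", variant="fp16", torch_dtype=torch.float16
)
pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompts = ["cowboy"] * 8  # batch size 8

# Warmup run so compilation/caching does not skew the measurement.
pipe(prompts)

torch.cuda.synchronize()
start = time.perf_counter()
pipe(prompts)
torch.cuda.synchronize()
elapsed_ms = (time.perf_counter() - start) * 1000
print(f"batch of {len(prompts)}: {elapsed_ms:.1f} ms")
```
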
### torch.compile

To use torch.compile, simply wrap the transformer in torch.compile, i.e.

```python
pipe.transformer = torch.compile(pipe.transformer)
```

Full snippet:

```python
import torch
from diffusers import AmusedPipeline

pipe = AmusedPipeline.from_pretrained(
    "huggingface/amused-256", variant="fp16", torch_dtype=torch.float16
)

# HERE use torch.compile
pipe.transformer = torch.compile(pipe.transformer)

pipe.vqvae.to(torch.float32)  # vqvae is producing nans in fp16
pipe = pipe.to("cuda")

prompt = "cowboy"
image = pipe(prompt, generator=torch.Generator('cuda').manual_seed(8)).images[0]
image.save('text2image_256.png')
```

## 3. Training

Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8-bit optimizers, LoRA, and gradient accumulation, amused can be finetuned with as little as 5.5 GB of memory. Below are a set of examples for finetuning amused on some relatively simple datasets. These training recipes are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease the learning rates.

All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8-bit Adam + LoRA as it's about the same memory use as just using LoRA (bitsandbytes uses full-precision optimizer states for weights below a minimum size).
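In the tables below, the effective total batch size is the per-device batch size multiplied by the gradient accumulation steps. The sketch below is an added illustration in plain PyTorch (the actual training script relies on accelerate); it shows why accumulation trades compute time for memory at the same effective batch size.

```python
import torch

# Sketch of gradient accumulation (illustration only, not the training script itself):
# an effective batch of 8 can be run as 8 micro-batches of 1, using far less activation memory.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

micro_batch_size = 1
gradient_accumulation_steps = 8  # effective batch size = 1 * 8 = 8

optimizer.zero_grad()
for step in range(gradient_accumulation_steps):
    x = torch.randn(micro_batch_size, 16)
    loss = model(x).pow(2).mean()
    # Scale so the accumulated gradient matches one large batch of size 8.
    (loss / gradient_accumulation_steps).backward()
optimizer.step()
```
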
### Finetuning the 256 checkpoint

These examples finetune on this [nouns](https://huggingface.co/datasets/m1guelpf/nouns) dataset.

Example results:

![noun1](./assets/noun1.png) ![noun2](./assets/noun2.png) ![noun3](./assets/noun3.png)

#### Full finetuning

Batch size: 8, Learning rate: 1e-4, Gives decent results in 750-1000 steps

| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
|------------|-----------------------------|----------------------------|-------------|
| 8          | 1                           | 8                          | 19.7 GB     |
| 4          | 2                           | 8                          | 18.3 GB     |
| 1          | 8                           | 8                          | 17.9 GB     |

```sh
accelerate launch training/training.py \
    --output_dir <output path> \
    --train_batch_size <batch size> \
    --gradient_accumulation_steps <gradient accumulation steps> \
    --learning_rate 1e-4 \
    --pretrained_model_name_or_path huggingface/amused-256 \
    --instance_data_dataset 'm1guelpf/nouns' \
    --image_key image \
    --prompt_key text \
    --resolution 256 \
    --mixed_precision fp16 \
    --lr_scheduler constant \
    --validation_prompts \
        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
        'a pixel art character with square red glasses' \
        'a pixel art character' \
        'square red glasses on a pixel art character' \
        'square red glasses on a pixel art character with a baseball-shaped head' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 250 \
    --gradient_checkpointing
```

#### Full finetuning + 8 bit adam

Note that this training config keeps the batch size low and the learning rate high to get results fast with low resources. However, due to 8-bit Adam, it will diverge eventually. If you want to train for longer, you will have to increase the batch size and lower the learning rate.

Batch size: 16, Learning rate: 2e-5, Gives decent results in ~750 steps

| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
|------------|-----------------------------|----------------------------|-------------|
| 16         | 1                           | 16                         | 20.1 GB     |
| 8          | 2                           | 16                         | 15.6 GB     |
| 1          | 16                          | 16                         | 10.7 GB     |

```sh
accelerate launch training/training.py \
    --output_dir <output path> \
    --train_batch_size <batch size> \
    --gradient_accumulation_steps <gradient accumulation steps> \
    --learning_rate 2e-5 \
    --use_8bit_adam \
    --pretrained_model_name_or_path huggingface/amused-256 \
    --instance_data_dataset 'm1guelpf/nouns' \
    --image_key image \
    --prompt_key text \
    --resolution 256 \
    --mixed_precision fp16 \
    --lr_scheduler constant \
    --validation_prompts \
        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
        'a pixel art character with square red glasses' \
        'a pixel art character' \
        'square red glasses on a pixel art character' \
        'square red glasses on a pixel art character with a baseball-shaped head' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 250 \
    --gradient_checkpointing
```

#### Full finetuning + lora

Batch size: 16, Learning rate: 8e-4, Gives decent results in 1000-1250 steps

| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
|------------|-----------------------------|----------------------------|-------------|
| 16         | 1                           | 16                         | 14.1 GB     |
| 8          | 2                           | 16                         | 10.1 GB     |
| 1          | 16                          | 16                         | 6.5 GB      |

```sh
accelerate launch training/training.py \
    --output_dir <output path> \
    --train_batch_size <batch size> \
    --gradient_accumulation_steps <gradient accumulation steps> \
    --learning_rate 8e-4 \
    --use_lora \
    --pretrained_model_name_or_path huggingface/amused-256 \
    --instance_data_dataset 'm1guelpf/nouns' \
    --image_key image \
    --prompt_key text \
    --resolution 256 \
    --mixed_precision fp16 \
    --lr_scheduler constant \
    --validation_prompts \
        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
        'a pixel art character with square red glasses' \
        'a pixel art character' \
        'square red glasses on a pixel art character' \
        'square red glasses on a pixel art character with a baseball-shaped head' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 250 \
    --gradient_checkpointing
```

### Finetuning the 512 checkpoint

These examples finetune on this [minecraft](https://huggingface.co/monadical-labs/minecraft-preview) dataset.

Example results:

![minecraft1](./assets/minecraft1.png) ![minecraft2](./assets/minecraft2.png) ![minecraft3](./assets/minecraft3.png)

#### Full finetuning

Batch size: 8, Learning rate: 8e-5, Gives decent results in 500-1000 steps

| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
|------------|-----------------------------|----------------------------|-------------|
| 8          | 1                           | 8                          | 24.2 GB     |
| 4          | 2                           | 8                          | 19.7 GB     |
| 1          | 8                           | 8                          | 16.99 GB    |

```sh
accelerate launch training/training.py \
    --output_dir <output path> \
    --train_batch_size <batch size> \
    --gradient_accumulation_steps <gradient accumulation steps> \
    --learning_rate 8e-5 \
    --pretrained_model_name_or_path huggingface/amused-512 \
    --instance_data_dataset 'monadical-labs/minecraft-preview' \
    --prompt_prefix 'minecraft ' \
    --image_key image \
    --prompt_key text \
    --resolution 512 \
    --mixed_precision fp16 \
    --lr_scheduler constant \
    --validation_prompts \
        'minecraft Avatar' \
        'minecraft character' \
        'minecraft' \
        'minecraft president' \
        'minecraft pig' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 250 \
    --gradient_checkpointing
```

#### Full finetuning + 8 bit adam

Batch size: 8, Learning rate: 5e-6, Gives decent results in 500-1000 steps

| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
|------------|-----------------------------|----------------------------|-------------|
| 8          | 1                           | 8                          | 21.2 GB     |
| 4          | 2                           | 8                          | 13.3 GB     |
| 1          | 8                           | 8                          | 9.9 GB      |

```sh
accelerate launch training/training.py \
    --output_dir <output path> \
    --train_batch_size <batch size> \
    --gradient_accumulation_steps <gradient accumulation steps> \
    --learning_rate 5e-6 \
    --use_8bit_adam \
    --pretrained_model_name_or_path huggingface/amused-512 \
    --instance_data_dataset 'monadical-labs/minecraft-preview' \
    --prompt_prefix 'minecraft ' \
    --image_key image \
    --prompt_key text \
    --resolution 512 \
    --mixed_precision fp16 \
    --lr_scheduler constant \
    --validation_prompts \
        'minecraft Avatar' \
        'minecraft character' \
        'minecraft' \
        'minecraft president' \
        'minecraft pig' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 250 \
    --gradient_checkpointing
```

#### Full finetuning + lora

Batch size: 8, Learning rate: 1e-4, Gives decent results in 500-1000 steps

| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
|------------|-----------------------------|----------------------------|-------------|
| 8          | 1                           | 8                          | 12.7 GB     |
| 4          | 2                           | 8                          | 9.0 GB      |
| 1          | 8                           | 8                          | 5.6 GB      |

```sh
accelerate launch training/training.py \
    --output_dir <output path> \
    --train_batch_size <batch size> \
    --gradient_accumulation_steps <gradient accumulation steps> \
    --learning_rate 1e-4 \
    --use_lora \
    --pretrained_model_name_or_path huggingface/amused-512 \
    --instance_data_dataset 'monadical-labs/minecraft-preview' \
    --prompt_prefix 'minecraft ' \
    --image_key image \
    --prompt_key text \
    --resolution 512 \
    --mixed_precision fp16 \
    --lr_scheduler constant \
    --validation_prompts \
        'minecraft Avatar' \
        'minecraft character' \
        'minecraft' \
        'minecraft president' \
        'minecraft pig' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 250 \
    --gradient_checkpointing
```

### Styledrop

[Styledrop](https://arxiv.org/abs/2306.00983) is an efficient finetuning method for learning a new style from a small number of images. It has an optional first stage that generates additional, human-picked training samples, which can be used to augment the initial images. Our examples exclude this optional image selection stage and instead finetune on a single image.

This is our example style image:

![example](./training/A%20mushroom%20in%20[V]%20style.png)

#### 256

Example results:

![glowing_256_1](./assets/glowing_256_1.png) ![glowing_256_2](./assets/glowing_256_2.png) ![glowing_256_3](./assets/glowing_256_3.png)

Learning rate: 4e-4, Gives decent results in 1500-2000 steps

```sh
accelerate launch ./training/training.py \
    --output_dir <output path> \
    --mixed_precision fp16 \
    --report_to wandb \
    --use_lora \
    --pretrained_model_name_or_path huggingface/amused-256 \
    --train_batch_size 1 \
    --lr_scheduler constant \
    --learning_rate 4e-4 \
    --validation_prompts \
        'A chihuahua walking on the street in [V] style' \
        'A banana on the table in [V] style' \
        'A church on the street in [V] style' \
        'A tabby cat walking in the forest in [V] style' \
    --instance_data_image './training/A mushroom in [V] style.png' \
    --max_train_steps 10000 \
    --checkpointing_steps 500 \
    --validation_steps 100 \
    --resolution 256
```

#### 512

Learning rate: 1e-3, Lora alpha 1, Gives decent results in 1500-2000 steps

Example results:

![glowing_512_1](./assets/glowing_512_1.png) ![glowing_512_2](./assets/glowing_512_2.png) ![glowing_512_3](./assets/glowing_512_3.png)

```sh
accelerate launch ./training/training.py \
    --output_dir ../styledrop \
    --mixed_precision fp16 \
    --report_to wandb \
    --use_lora \
    --pretrained_model_name_or_path huggingface/amused-512 \
    --train_batch_size 1 \
    --lr_scheduler constant \
    --learning_rate 1e-3 \
    --validation_prompts \
        'A chihuahua walking on the street in [V] style' \
        'A banana on the table in [V] style' \
        'A church on the street in [V] style' \
        'A tabby cat walking in the forest in [V] style' \
    --instance_data_image './training/A mushroom in [V] style.png' \
    --max_train_steps 100000 \
    --checkpointing_steps 500 \
    --validation_steps 100 \
    --resolution 512 \
    --lora_alpha 1
```

## 4. Acknowledgements

TODO

## 5. Citation

```
@misc{patil-etal-2023-amused,
    author = {Suraj Patil and William Berman and Patrick von Platen},
    title = {Amused: An open MUSE model},
    year = {2023},
    publisher = {GitHub},
    journal = {GitHub repository},
    howpublished = {\url{https://github.com/huggingface/amused}}
}
```
assets/4090_bs_1.png
ADDED
assets/4090_bs_8.png
ADDED
assets/a100_bs_1.png
ADDED
assets/a100_bs_8.png
ADDED
assets/collage_full.png
ADDED
assets/collage_small.png
ADDED
assets/glowing_256_1.png
ADDED
assets/glowing_256_2.png
ADDED
assets/glowing_256_3.png
ADDED
assets/glowing_512_1.png
ADDED
assets/glowing_512_2.png
ADDED
assets/glowing_512_3.png
ADDED
assets/image2image_256.png
ADDED
assets/image2image_256_orig.png
ADDED
assets/image2image_512.png
ADDED
assets/image2image_512_orig.png
ADDED
assets/inpainting_256.png
ADDED
assets/inpainting_256_mask.png
ADDED
assets/inpainting_256_orig.png
ADDED
assets/inpainting_512.png
ADDED
assets/inpainting_512_mask.png
ADDED
assets/inpainting_512_orig.jpeg
ADDED
assets/minecraft1.png
ADDED
assets/minecraft2.png
ADDED
assets/minecraft3.png
ADDED
assets/noun1.png
ADDED
assets/noun2.png
ADDED
assets/noun3.png
ADDED
assets/text2image_256.png
ADDED
assets/text2image_512.png
ADDED
model_index.json
ADDED
@@ -0,0 +1,24 @@
{
  "_class_name": "AmusedPipeline",
  "_diffusers_version": "0.25.0.dev0",
  "scheduler": [
    "diffusers",
    "AmusedScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "transformer": [
    "diffusers",
    "UVit2DModel"
  ],
  "vqvae": [
    "diffusers",
    "VQModel"
  ]
}
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "_class_name": "AmusedScheduler",
  "_diffusers_version": "0.25.0.dev0",
  "mask_token_id": 8255,
  "masking_schedule": "cosine"
}
text_encoder/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "architectures": [
    "CLIPTextModelWithProjection"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "projection_dim": 768,
  "torch_dtype": "float32",
  "transformers_version": "4.34.1",
  "vocab_size": 49408
}
text_encoder/model.fp16.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:549d39f40a16f8ef48ed56da60cd25a467bd2c70866f4d49196829881b13b7b2
size 247323896
text_encoder/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3149fc6737da29cf39176b86ce19537aea6228082c21a8b4f87ae8ee81681a79
size 494625272
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "!",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,38 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "!",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "!",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
training/A mushroom in [V] style.png
ADDED
training/A woman working on a laptop in [V] style.jpg
ADDED
training/generate_images.py
ADDED
@@ -0,0 +1,119 @@
import argparse
import logging
import os

from diffusers import AmusedPipeline, UVit2DModel
from peft import PeftModel

logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--variant",
        type=str,
        default=None,
        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
    )
    parser.add_argument("--style_descriptor", type=str, default="[V]")
    parser.add_argument(
        "--load_transformer_from",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--load_transformer_lora_from",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument("--device", type=str, default='cuda')
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--write_images_to", type=str, required=True)
    args = parser.parse_args()
    return args


def main(args):
    prompts = [
        f"A chihuahua in {args.style_descriptor} style",
        f"A tabby cat in {args.style_descriptor} style",
        f"A portrait of chihuahua in {args.style_descriptor} style",
        f"An apple on the table in {args.style_descriptor} style",
        f"A banana on the table in {args.style_descriptor} style",
        f"A church on the street in {args.style_descriptor} style",
        f"A church in the mountain in {args.style_descriptor} style",
        f"A church in the field in {args.style_descriptor} style",
        f"A church on the beach in {args.style_descriptor} style",
        f"A chihuahua walking on the street in {args.style_descriptor} style",
        f"A tabby cat walking on the street in {args.style_descriptor} style",
        f"A portrait of tabby cat in {args.style_descriptor} style",
        f"An apple on the dish in {args.style_descriptor} style",
        f"A banana on the dish in {args.style_descriptor} style",
        f"A human walking on the street in {args.style_descriptor} style",
        f"A temple on the street in {args.style_descriptor} style",
        f"A temple in the mountain in {args.style_descriptor} style",
        f"A temple in the field in {args.style_descriptor} style",
        f"A temple on the beach in {args.style_descriptor} style",
        f"A chihuahua walking in the forest in {args.style_descriptor} style",
        f"A tabby cat walking in the forest in {args.style_descriptor} style",
        f"A portrait of human face in {args.style_descriptor} style",
        f"An apple on the ground in {args.style_descriptor} style",
        f"A banana on the ground in {args.style_descriptor} style",
        f"A human walking in the forest in {args.style_descriptor} style",
        f"A cabin on the street in {args.style_descriptor} style",
        f"A cabin in the mountain in {args.style_descriptor} style",
        f"A cabin in the field in {args.style_descriptor} style",
        f"A cabin on the beach in {args.style_descriptor} style",
    ]

    logger.warning(f"generating image for {prompts}")

    logger.warning("loading models")

    pipe_args = {}

    if args.load_transformer_from is not None:
        pipe_args["transformer"] = UVit2DModel.from_pretrained(args.load_transformer_from)

    pipe = AmusedPipeline.from_pretrained(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        revision=args.revision,
        variant=args.variant,
        **pipe_args,
    )

    if args.load_transformer_lora_from is not None:
        # Load the LoRA adapter from the LoRA path. (The original script passed
        # args.load_transformer_from here, which is the flag for a full transformer
        # checkpoint, not the LoRA adapter.)
        pipe.transformer = PeftModel.from_pretrained(
            pipe.transformer, os.path.join(args.load_transformer_lora_from), is_trainable=False
        )

    pipe.to(args.device)

    logger.warning("generating images")

    os.makedirs(args.write_images_to, exist_ok=True)

    for prompt_idx in range(0, len(prompts), args.batch_size):
        images = pipe(prompts[prompt_idx:prompt_idx + args.batch_size]).images

        for image_idx, image in enumerate(images):
            prompt = prompts[prompt_idx + image_idx]
            image.save(os.path.join(args.write_images_to, prompt + ".png"))


if __name__ == "__main__":
    main(parse_args())
training/training.py
ADDED
@@ -0,0 +1,916 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The HuggingFace Inc. team.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
from contextlib import nullcontext
|
17 |
+
import argparse
|
18 |
+
import copy
|
19 |
+
import logging
|
20 |
+
import math
|
21 |
+
import os
|
22 |
+
import shutil
|
23 |
+
from pathlib import Path
|
24 |
+
from datasets import load_dataset
|
25 |
+
|
26 |
+
import torch
|
27 |
+
import torch.nn.functional as F
|
28 |
+
from accelerate import Accelerator
|
29 |
+
from accelerate.logging import get_logger
|
30 |
+
from accelerate.utils import ProjectConfiguration, set_seed
|
31 |
+
from peft import LoraConfig, PeftModel, get_peft_model
|
32 |
+
from PIL import Image
|
33 |
+
from PIL.ImageOps import exif_transpose
|
34 |
+
from torch.utils.data import DataLoader, Dataset, default_collate
|
35 |
+
from torchvision import transforms
|
36 |
+
from transformers import (
|
37 |
+
CLIPTextModelWithProjection,
|
38 |
+
CLIPTokenizer,
|
39 |
+
)
|
40 |
+
|
41 |
+
import diffusers.optimization
|
42 |
+
from diffusers import AmusedPipeline, AmusedScheduler, EMAModel, UVit2DModel, VQModel
|
43 |
+
from diffusers.utils import is_wandb_available
|
44 |
+
|
45 |
+
|
46 |
+
if is_wandb_available():
|
47 |
+
import wandb
|
48 |
+
|
49 |
+
logger = get_logger(__name__, log_level="INFO")
|
50 |
+
|
51 |
+
|
52 |
+
def parse_args():
|
53 |
+
parser = argparse.ArgumentParser()
|
54 |
+
parser.add_argument(
|
55 |
+
"--pretrained_model_name_or_path",
|
56 |
+
type=str,
|
57 |
+
default=None,
|
58 |
+
required=True,
|
59 |
+
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
60 |
+
)
|
61 |
+
parser.add_argument(
|
62 |
+
"--revision",
|
63 |
+
type=str,
|
64 |
+
default=None,
|
65 |
+
required=False,
|
66 |
+
help="Revision of pretrained model identifier from huggingface.co/models.",
|
67 |
+
)
|
68 |
+
parser.add_argument(
|
69 |
+
"--variant",
|
70 |
+
type=str,
|
71 |
+
default=None,
|
72 |
+
help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
|
73 |
+
)
|
74 |
+
parser.add_argument(
|
75 |
+
"--instance_data_dataset",
|
76 |
+
type=str,
|
77 |
+
default=None,
|
78 |
+
required=False,
|
79 |
+
help="A Hugging Face dataset containing the training images",
|
80 |
+
)
|
81 |
+
parser.add_argument(
|
82 |
+
"--instance_data_dir",
|
83 |
+
type=str,
|
84 |
+
default=None,
|
85 |
+
required=False,
|
86 |
+
help="A folder containing the training data of instance images.",
|
87 |
+
)
|
88 |
+
parser.add_argument(
|
89 |
+
"--instance_data_image",
|
90 |
+
type=str,
|
91 |
+
default=None,
|
92 |
+
required=False,
|
93 |
+
help="A single training image"
|
94 |
+
)
|
95 |
+
parser.add_argument(
|
96 |
+
"--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
|
97 |
+
)
|
98 |
+
parser.add_argument(
|
99 |
+
"--dataloader_num_workers",
|
100 |
+
type=int,
|
101 |
+
default=0,
|
102 |
+
help=(
|
103 |
+
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
|
104 |
+
),
|
105 |
+
)
|
106 |
+
parser.add_argument(
|
107 |
+
"--allow_tf32",
|
108 |
+
action="store_true",
|
109 |
+
help=(
|
110 |
+
"Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
|
111 |
+
" https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
|
112 |
+
),
|
113 |
+
)
|
114 |
+
parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
|
115 |
+
parser.add_argument("--ema_decay", type=float, default=0.9999)
|
116 |
+
parser.add_argument("--ema_update_after_step", type=int, default=0)
|
117 |
+
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
118 |
+
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
119 |
+
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
120 |
+
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
|
121 |
+
parser.add_argument(
|
122 |
+
"--output_dir",
|
123 |
+
type=str,
|
124 |
+
default="muse_training",
|
125 |
+
help="The output directory where the model predictions and checkpoints will be written.",
|
126 |
+
)
|
127 |
+
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
|
128 |
+
parser.add_argument(
|
129 |
+
"--logging_dir",
|
130 |
+
type=str,
|
131 |
+
default="logs",
|
132 |
+
help=(
|
133 |
+
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
134 |
+
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
135 |
+
),
|
136 |
+
)
|
137 |
+
parser.add_argument(
|
138 |
+
"--max_train_steps",
|
139 |
+
type=int,
|
140 |
+
default=None,
|
141 |
+
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
142 |
+
)
|
143 |
+
parser.add_argument(
|
144 |
+
"--checkpointing_steps",
|
145 |
+
type=int,
|
146 |
+
default=500,
|
147 |
+
help=(
|
148 |
+
"Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
|
149 |
+
"In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
|
150 |
+
"Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
|
151 |
+
"See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
|
152 |
+
"instructions."
|
153 |
+
),
|
154 |
+
)
|
155 |
+
parser.add_argument(
|
156 |
+
"--logging_steps",
|
157 |
+
type=int,
|
158 |
+
default=50,
|
159 |
+
)
|
160 |
+
parser.add_argument(
|
161 |
+
"--checkpoints_total_limit",
|
162 |
+
type=int,
|
163 |
+
default=None,
|
164 |
+
help=(
|
165 |
+
"Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
|
166 |
+
" See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
|
167 |
+
" for more details"
|
168 |
+
),
|
169 |
+
)
|
170 |
+
parser.add_argument(
|
171 |
+
"--resume_from_checkpoint",
|
172 |
+
type=str,
|
173 |
+
default=None,
|
174 |
+
help=(
|
175 |
+
"Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.0003,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--validation_steps",
        type=int,
        default=100,
        help=(
            "Run validation every X steps. Validation consists of running each prompt in"
            " `args.validation_prompts` through the pipeline and logging the generated images."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system"
            " or the flag passed with the `accelerate.launch` command. Use this argument to override the"
            " accelerate config."
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument("--validation_prompts", type=str, nargs="*")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images; all images in the train/validation dataset will be resized to this"
            " resolution."
        ),
    )
    parser.add_argument("--split_vae_encode", type=int, required=False, default=None)
    parser.add_argument("--min_masking_rate", type=float, default=0.0)
    parser.add_argument("--cond_dropout_prob", type=float, default=0.0)
    parser.add_argument("--max_grad_norm", default=None, type=float, help="Max gradient norm.", required=False)
    parser.add_argument(
        "--use_lora",
        action="store_true",
        help="Fine-tune only low-rank (LoRA) adapters added to the transformer attention layers.",
    )
    parser.add_argument("--lora_r", default=16, type=int)
    parser.add_argument("--lora_alpha", default=32, type=int)
    parser.add_argument("--lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+")
    parser.add_argument("--train_text_encoder", action="store_true")
    parser.add_argument("--image_key", type=str, required=False)
    parser.add_argument("--prompt_key", type=str, required=False)
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument("--prompt_prefix", type=str, required=False, default=None)

    args = parser.parse_args()

    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")

    num_datasources = sum(
        [x is not None for x in [args.instance_data_dir, args.instance_data_image, args.instance_data_dataset]]
    )

    if num_datasources != 1:
        raise ValueError(
            "Provide one and only one of `--instance_data_dir`, `--instance_data_image`, or `--instance_data_dataset`"
        )

    if args.instance_data_dir is not None:
        if not os.path.exists(args.instance_data_dir):
            raise ValueError(f"Does not exist: `--instance_data_dir` {args.instance_data_dir}")

    if args.instance_data_image is not None:
        if not os.path.exists(args.instance_data_image):
            raise ValueError(f"Does not exist: `--instance_data_image` {args.instance_data_image}")

    if args.instance_data_dataset is not None and (args.image_key is None or args.prompt_key is None):
        raise ValueError("`--instance_data_dataset` requires setting `--image_key` and `--prompt_key`")

    return args

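# Example invocation (illustrative only, not part of the upstream script; the model id,
# file paths, prompt, and hyperparameter values below are placeholders):
#
#   accelerate launch training/training.py \
#       --pretrained_model_name_or_path amused/amused-512 \
#       --instance_data_image path/to/instance_image.png \
#       --output_dir ./amused-finetuned \
#       --train_batch_size 8 \
#       --learning_rate 4e-4 \
#       --use_lora \
#       --max_train_steps 1000 \
#       --checkpointing_steps 500 \
#       --validation_steps 100 \
#       --validation_prompts "a photo of a dog" \
#       --mixed_precision fp16 \
#       --report_to wandb
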
class InstanceDataRootDataset(Dataset):
    def __init__(
        self,
        instance_data_root,
        tokenizer,
        size=512,
    ):
        self.size = size
        self.tokenizer = tokenizer
        self.instance_images_path = list(Path(instance_data_root).iterdir())

    def __len__(self):
        return len(self.instance_images_path)

    def __getitem__(self, index):
        image_path = self.instance_images_path[index % len(self.instance_images_path)]
        instance_image = Image.open(image_path)
        rv = process_image(instance_image, self.size)

        prompt = os.path.splitext(os.path.basename(image_path))[0]
        rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt)[0]
        return rv

class InstanceDataImageDataset(Dataset):
    def __init__(
        self,
        instance_data_image,
        train_batch_size,
        size=512,
    ):
        self.value = process_image(Image.open(instance_data_image), size)
        self.train_batch_size = train_batch_size

    def __len__(self):
        # Needed so a full batch of the data can be returned. Otherwise will return
        # batches of size 1
        return self.train_batch_size

    def __getitem__(self, index):
        return self.value

class HuggingFaceDataset(Dataset):
    def __init__(
        self,
        hf_dataset,
        tokenizer,
        image_key,
        prompt_key,
        prompt_prefix=None,
        size=512,
    ):
        self.size = size
        self.image_key = image_key
        self.prompt_key = prompt_key
        self.tokenizer = tokenizer
        self.hf_dataset = hf_dataset
        self.prompt_prefix = prompt_prefix

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        item = self.hf_dataset[index]

        rv = process_image(item[self.image_key], self.size)

        prompt = item[self.prompt_key]

        if self.prompt_prefix is not None:
            prompt = self.prompt_prefix + prompt

        rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt)[0]

        return rv

def process_image(image, size):
    image = exif_transpose(image)

    if not image.mode == "RGB":
        image = image.convert("RGB")

    orig_height = image.height
    orig_width = image.width

    image = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)(image)

    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(size, size))
    image = transforms.functional.crop(image, c_top, c_left, size, size)

    image = transforms.ToTensor()(image)

    micro_conds = torch.tensor(
        [
            orig_width,
            orig_height,
            c_top,
            c_left,
            6.0
        ],
    )

    return {"image": image, "micro_conds": micro_conds}

@torch.no_grad()
def tokenize_prompt(tokenizer, prompt):
    return tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=77,
        return_tensors="pt",
    ).input_ids

def encode_prompt(text_encoder, input_ids):
    outputs = text_encoder(input_ids, return_dict=True, output_hidden_states=True)
    encoder_hidden_states = outputs.hidden_states[-2]
    cond_embeds = outputs[0]
    return encoder_hidden_states, cond_embeds

+
def main(args):
|
415 |
+
if args.allow_tf32:
|
416 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
417 |
+
|
418 |
+
logging_dir = Path(args.output_dir, args.logging_dir)
|
419 |
+
|
420 |
+
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
421 |
+
|
422 |
+
accelerator = Accelerator(
|
423 |
+
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
424 |
+
mixed_precision=args.mixed_precision,
|
425 |
+
log_with=args.report_to,
|
426 |
+
project_config=accelerator_project_config,
|
427 |
+
)
|
428 |
+
|
429 |
+
if accelerator.is_main_process:
|
430 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
431 |
+
|
432 |
+
# Make one log on every process with the configuration for debugging.
|
433 |
+
logging.basicConfig(
|
434 |
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
435 |
+
datefmt="%m/%d/%Y %H:%M:%S",
|
436 |
+
level=logging.INFO,
|
437 |
+
)
|
438 |
+
logger.info(accelerator.state, main_process_only=False)
|
439 |
+
|
440 |
+
if accelerator.is_main_process:
|
441 |
+
accelerator.init_trackers("amused", config=vars(copy.deepcopy(args)))
|
442 |
+
|
443 |
+
if args.seed is not None:
|
444 |
+
set_seed(args.seed)
|
445 |
+
|
446 |
+
resume_from_checkpoint = args.resume_from_checkpoint
|
447 |
+
if resume_from_checkpoint:
|
448 |
+
if resume_from_checkpoint == "latest":
|
449 |
+
# Get the most recent checkpoint
|
450 |
+
dirs = os.listdir(args.output_dir)
|
451 |
+
dirs = [d for d in dirs if d.startswith("checkpoint")]
|
452 |
+
dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
|
453 |
+
if len(dirs) > 0:
|
454 |
+
resume_from_checkpoint = os.path.join(args.output_dir, dirs[-1])
|
455 |
+
else:
|
456 |
+
resume_from_checkpoint = None
|
457 |
+
|
458 |
+
if resume_from_checkpoint is None:
|
459 |
+
accelerator.print(
|
460 |
+
f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
|
461 |
+
)
|
462 |
+
else:
|
463 |
+
accelerator.print(f"Resuming from checkpoint {resume_from_checkpoint}")
|
464 |
+
|
465 |
+
# TODO - will have to fix loading if training text encoder
|
466 |
+
text_encoder = CLIPTextModelWithProjection.from_pretrained(
|
467 |
+
args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
|
468 |
+
)
|
469 |
+
tokenizer = CLIPTokenizer.from_pretrained(
|
470 |
+
args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, variant=args.variant
|
471 |
+
)
|
472 |
+
vq_model = VQModel.from_pretrained(
|
473 |
+
args.pretrained_model_name_or_path, subfolder="vqvae", revision=args.revision, variant=args.variant
|
474 |
+
)
|
475 |
+
|
476 |
+
if args.train_text_encoder:
|
477 |
+
text_encoder.train()
|
478 |
+
text_encoder.requires_grad_(True)
|
479 |
+
else:
|
480 |
+
text_encoder.eval()
|
481 |
+
text_encoder.requires_grad_(False)
|
482 |
+
|
483 |
+
vq_model.requires_grad_(False)
|
484 |
+
|
485 |
+
if args.use_lora:
|
486 |
+
model = UVit2DModel.from_pretrained(
|
487 |
+
args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
|
488 |
+
)
|
489 |
+
|
490 |
+
if resume_from_checkpoint is not None:
|
491 |
+
model = PeftModel.from_pretrained(
|
492 |
+
model, os.path.join(resume_from_checkpoint, "transformer"), is_trainable=True
|
493 |
+
)
|
494 |
+
else:
|
495 |
+
lora_config = LoraConfig(
|
496 |
+
r=args.lora_r,
|
497 |
+
lora_alpha=args.lora_alpha,
|
498 |
+
target_modules=args.lora_target_modules,
|
499 |
+
)
|
500 |
+
model = get_peft_model(model, lora_config)
|
501 |
+
else:
|
502 |
+
if resume_from_checkpoint is not None:
|
503 |
+
model = UVit2DModel.from_pretrained(resume_from_checkpoint, subfolder="transformer")
|
504 |
+
else:
|
505 |
+
model = UVit2DModel.from_pretrained(
|
506 |
+
args.pretrained_model_name_or_path,
|
507 |
+
subfolder="transformer",
|
508 |
+
revision=args.revision,
|
509 |
+
variant=args.variant,
|
510 |
+
)
|
511 |
+
|
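    # With --use_lora, only the low-rank adapter weights injected into the attention projections
    # listed in --lora_target_modules (to_q/to_k/to_v by default) are trained; the base transformer
    # weights stay frozen inside the PeftModel wrapper.
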
    model.train()

    if args.gradient_checkpointing:
        model.enable_gradient_checkpointing()
        if args.train_text_encoder:
            text_encoder.gradient_checkpointing_enable()

    if args.use_ema:
        if resume_from_checkpoint is not None:
            ema = EMAModel.from_pretrained(os.path.join(resume_from_checkpoint, "ema_model"), model_cls=UVit2DModel)
        else:
            ema = EMAModel(
                model.parameters(),
                decay=args.ema_decay,
                update_after_step=args.ema_update_after_step,
                model_cls=UVit2DModel,
                model_config=model.config,
            )

    # TODO - this will save the lora weights in the peft format. We want to save in
    # diffusers format
    def save_model_hook(models, weights, output_dir):
        if accelerator.is_main_process:
            for model in models:
                if isinstance(model, UVit2DModel):
                    model.save_pretrained(os.path.join(output_dir, "transformer"))
                elif isinstance(model, CLIPTextModelWithProjection):
                    model.save_pretrained(os.path.join(output_dir, "text_encoder"))

                # pop the weights so accelerate does not save this model's state dict again
                weights.pop()

            if args.use_ema:
                ema.save_pretrained(os.path.join(output_dir, "ema_model"))

    def load_model_hook(models, input_dir):
        # All models are initially instantiated from the checkpoint and so
        # don't have to be loaded in the accelerate hook
        for _ in range(len(models)):
            models.pop()

    accelerator.register_load_state_pre_hook(load_model_hook)
    accelerator.register_save_state_pre_hook(save_model_hook)

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
        )

    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
            )

        optimizer_cls = bnb.optim.AdamW8bit
    else:
        optimizer_cls = torch.optim.AdamW

    # no decay on bias and layernorm and embedding
    no_decay = ["bias", "layer_norm.weight", "mlm_ln.weight", "embeddings.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.adam_weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # TODO - does not actually take text encoder parameters
    optimizer = optimizer_cls(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    logger.info("Creating dataloaders and lr_scheduler")

    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    if args.instance_data_dir is not None:
        dataset = InstanceDataRootDataset(
            instance_data_root=args.instance_data_dir,
            tokenizer=tokenizer,
            size=args.resolution,
        )
    elif args.instance_data_image is not None:
        dataset = InstanceDataImageDataset(
            instance_data_image=args.instance_data_image,
            train_batch_size=args.train_batch_size,
            size=args.resolution,
        )
    elif args.instance_data_dataset is not None:
        dataset = HuggingFaceDataset(
            hf_dataset=load_dataset(args.instance_data_dataset, split="train"),
            tokenizer=tokenizer,
            image_key=args.image_key,
            prompt_key=args.prompt_key,
            prompt_prefix=args.prompt_prefix,
            size=args.resolution,
        )
    else:
        assert False

    train_dataloader = DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        num_workers=args.dataloader_num_workers,
        collate_fn=default_collate,
    )
    train_dataloader.num_batches = len(train_dataloader)

    lr_scheduler = diffusers.optimization.get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_training_steps=args.max_train_steps * accelerator.num_processes,
        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
    )

    logger.info("Preparing model, optimizer and dataloaders")

    if args.train_text_encoder:
        model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
            model, optimizer, lr_scheduler, train_dataloader, text_encoder
        )
    else:
        model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare(
            model, optimizer, lr_scheduler, train_dataloader
        )

    train_dataloader.num_batches = len(train_dataloader)

    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    if not args.train_text_encoder:
        text_encoder.to(device=accelerator.device, dtype=weight_dtype)

    vq_model.to(device=accelerator.device)

    if args.use_ema:
        ema.to(accelerator.device)

    with nullcontext() if args.train_text_encoder else torch.no_grad():
        empty_embeds, empty_clip_embeds = encode_prompt(
            text_encoder, tokenize_prompt(tokenizer, "").to(text_encoder.device, non_blocking=True)
        )

    # There is a single image, we can just pre-encode the single prompt
    if args.instance_data_image is not None:
        prompt = os.path.splitext(os.path.basename(args.instance_data_image))[0]
        encoder_hidden_states, cond_embeds = encode_prompt(
            text_encoder, tokenize_prompt(tokenizer, prompt).to(text_encoder.device, non_blocking=True)
        )
        encoder_hidden_states = encoder_hidden_states.repeat(args.train_batch_size, 1, 1)
        cond_embeds = cond_embeds.repeat(args.train_batch_size, 1)

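    # empty_embeds / empty_clip_embeds are the encodings of the empty prompt "". They are reused
    # inside the training loop to replace the text conditioning with probability --cond_dropout_prob
    # (classifier-free-guidance style dropout). When training on a single instance image, its prompt
    # (taken from the file name) is pre-encoded once here instead of per step.
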
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
    # Afterwards we recalculate our number of training epochs.
    # Note: We are not doing epoch based training here, but just using this for book keeping and being able to
    # reuse the same training loop with other datasets/loaders.
    num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # Train!
    logger.info("***** Running training *****")
    logger.info(f" Num training steps = {args.max_train_steps}")
    logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")

    if resume_from_checkpoint is None:
        global_step = 0
        first_epoch = 0
    else:
        accelerator.load_state(resume_from_checkpoint)
        global_step = int(os.path.basename(resume_from_checkpoint).split("-")[1])
        first_epoch = global_step // num_update_steps_per_epoch

    # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to
    # reuse the same training loop with other datasets/loaders.
    for epoch in range(first_epoch, num_train_epochs):
        for batch in train_dataloader:
            with torch.no_grad():
                micro_conds = batch["micro_conds"].to(accelerator.device, non_blocking=True)
                pixel_values = batch["image"].to(accelerator.device, non_blocking=True)

                batch_size = pixel_values.shape[0]

                split_batch_size = args.split_vae_encode if args.split_vae_encode is not None else batch_size
                num_splits = math.ceil(batch_size / split_batch_size)
                image_tokens = []
                for i in range(num_splits):
                    start_idx = i * split_batch_size
                    end_idx = min((i + 1) * split_batch_size, batch_size)
                    bs = end_idx - start_idx
                    image_tokens.append(
                        vq_model.quantize(vq_model.encode(pixel_values[start_idx:end_idx]).latents)[2][2].reshape(
                            bs, -1
                        )
                    )
                image_tokens = torch.cat(image_tokens, dim=0)
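                # The frozen VQ-VAE turns each image into a grid of discrete codebook indices
                # (encoded in chunks of at most --split_vae_encode images to bound peak memory);
                # e.g. at 512x512 with this VQ-VAE the grid is 32x32 = 1024 tokens per image.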

                batch_size, seq_len = image_tokens.shape

                timesteps = torch.rand(batch_size, device=image_tokens.device)
                mask_prob = torch.cos(timesteps * math.pi * 0.5)
                mask_prob = mask_prob.clip(args.min_masking_rate)
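                # MUSE-style cosine masking schedule: with t ~ U(0, 1), mask_prob = cos(t * pi / 2)
                # so early timesteps mask almost every token and late ones almost none. For example,
                # t = 0.5 gives mask_prob ~= 0.71, i.e. roughly 724 of 1024 tokens masked at 512x512.
                # --min_masking_rate only clips the lower end of the schedule.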

                num_token_masked = (seq_len * mask_prob).round().clamp(min=1)
                batch_randperm = torch.rand(batch_size, seq_len, device=image_tokens.device).argsort(dim=-1)
                mask = batch_randperm < num_token_masked.unsqueeze(-1)

                mask_id = accelerator.unwrap_model(model).config.vocab_size - 1
                input_ids = torch.where(mask, mask_id, image_tokens)
                labels = torch.where(mask, image_tokens, -100)

                if args.cond_dropout_prob > 0.0:
                    assert encoder_hidden_states is not None

                    batch_size = encoder_hidden_states.shape[0]

                    mask = (
                        torch.zeros((batch_size, 1, 1), device=encoder_hidden_states.device).float().uniform_(0, 1)
                        < args.cond_dropout_prob
                    )

                    empty_embeds_ = empty_embeds.expand(batch_size, -1, -1)
                    encoder_hidden_states = torch.where(
                        (encoder_hidden_states * mask).bool(), encoder_hidden_states, empty_embeds_
                    )

                    empty_clip_embeds_ = empty_clip_embeds.expand(batch_size, -1)
                    cond_embeds = torch.where((cond_embeds * mask.squeeze(-1)).bool(), cond_embeds, empty_clip_embeds_)

                bs = input_ids.shape[0]
                vae_scale_factor = 2 ** (len(vq_model.config.block_out_channels) - 1)
                resolution = args.resolution // vae_scale_factor
                input_ids = input_ids.reshape(bs, resolution, resolution)

                if "prompt_input_ids" in batch:
                    with nullcontext() if args.train_text_encoder else torch.no_grad():
                        encoder_hidden_states, cond_embeds = encode_prompt(
                            text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True)
                        )

            # Train Step
            with accelerator.accumulate(model):
                codebook_size = accelerator.unwrap_model(model).config.codebook_size

                logits = (
                    model(
                        input_ids=input_ids,
                        encoder_hidden_states=encoder_hidden_states,
                        micro_conds=micro_conds,
                        pooled_text_emb=cond_embeds,
                    )
                    .reshape(bs, codebook_size, -1)
                    .permute(0, 2, 1)
                    .reshape(-1, codebook_size)
                )

                loss = F.cross_entropy(
                    logits,
                    labels.view(-1),
                    ignore_index=-100,
                    reduction="mean",
                )
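                # Labels are -100 everywhere except masked positions, so the cross-entropy above is
                # computed only on the tokens the model actually had to predict.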

                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                avg_masking_rate = accelerator.gather(mask_prob.repeat(args.train_batch_size)).mean()

                accelerator.backward(loss)

                if args.max_grad_norm is not None and accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                lr_scheduler.step()

                optimizer.zero_grad(set_to_none=True)

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                if args.use_ema:
                    ema.step(model.parameters())

                if (global_step + 1) % args.logging_steps == 0:
                    logs = {
                        "step_loss": avg_loss.item(),
                        "lr": lr_scheduler.get_last_lr()[0],
                        "avg_masking_rate": avg_masking_rate.item(),
                    }
                    accelerator.log(logs, step=global_step + 1)

                    logger.info(
                        f"Step: {global_step + 1} "
                        f"Loss: {avg_loss.item():0.4f} "
                        f"LR: {lr_scheduler.get_last_lr()[0]:0.6f}"
                    )

                if (global_step + 1) % args.checkpointing_steps == 0:
                    save_checkpoint(args, accelerator, global_step + 1)

                if (global_step + 1) % args.validation_steps == 0 and accelerator.is_main_process:
                    if args.use_ema:
                        ema.store(model.parameters())
                        ema.copy_to(model.parameters())

                    with torch.no_grad():
                        logger.info("Generating images...")

                        model.eval()

                        if args.train_text_encoder:
                            text_encoder.eval()

                        scheduler = AmusedScheduler.from_pretrained(
                            args.pretrained_model_name_or_path,
                            subfolder="scheduler",
                            revision=args.revision,
                            variant=args.variant,
                        )

                        pipe = AmusedPipeline(
                            transformer=accelerator.unwrap_model(model),
                            tokenizer=tokenizer,
                            text_encoder=text_encoder,
                            vqvae=vq_model,
                            scheduler=scheduler,
                        )

                        pil_images = pipe(prompt=args.validation_prompts).images
                        wandb_images = [
                            wandb.Image(image, caption=args.validation_prompts[i])
                            for i, image in enumerate(pil_images)
                        ]

                        wandb.log({"generated_images": wandb_images}, step=global_step + 1)
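                        # Note: validation images are logged through wandb directly, so this block
                        # assumes wandb is installed and active (e.g. --report_to wandb).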

                        model.train()

                        if args.train_text_encoder:
                            text_encoder.train()

                    if args.use_ema:
                        ema.restore(model.parameters())

                global_step += 1

                # Stop training if max steps is reached
                if global_step >= args.max_train_steps:
                    break
        # End for

    accelerator.wait_for_everyone()

    # Evaluate and save checkpoint at the end of training
    save_checkpoint(args, accelerator, global_step)

    # Save the final trained checkpoint
    if accelerator.is_main_process:
        model = accelerator.unwrap_model(model)
        if args.use_ema:
            ema.copy_to(model.parameters())
        model.save_pretrained(args.output_dir)

    accelerator.end_training()


def save_checkpoint(args, accelerator, global_step):
    output_dir = args.output_dir

    # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
    if accelerator.is_main_process and args.checkpoints_total_limit is not None:
        checkpoints = os.listdir(output_dir)
        checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
        checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))

        # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
        if len(checkpoints) >= args.checkpoints_total_limit:
            num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
            removing_checkpoints = checkpoints[0:num_to_remove]

            logger.info(
                f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
            )
            logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")

            for removing_checkpoint in removing_checkpoints:
                removing_checkpoint = os.path.join(output_dir, removing_checkpoint)
                shutil.rmtree(removing_checkpoint)

    save_path = Path(output_dir) / f"checkpoint-{global_step}"
    accelerator.save_state(save_path)
    logger.info(f"Saved state to {save_path}")


if __name__ == "__main__":
    main(parse_args())
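# Example of loading the weights written by model.save_pretrained(args.output_dir) above for
# inference (illustrative sketch for the full-finetune case; model id and paths are placeholders):
#
#   from diffusers import AmusedPipeline, UVit2DModel
#
#   transformer = UVit2DModel.from_pretrained("./amused-finetuned")
#   pipe = AmusedPipeline.from_pretrained("amused/amused-512", transformer=transformer)
#   image = pipe("a photo of a dog").images[0]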
transformer/config.json
ADDED
@@ -0,0 +1,26 @@
{
  "_class_name": "UVit2DModel",
  "_diffusers_version": "0.25.0.dev0",
  "attention_dropout": 0.0,
  "block_num_heads": 12,
  "block_out_channels": 768,
  "codebook_size": 8192,
  "cond_embed_dim": 768,
  "downsample": true,
  "encoder_hidden_size": 768,
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "in_channels": 768,
  "intermediate_size": 2816,
  "layer_norm_eps": 1e-06,
  "ln_elementwise_affine": true,
  "micro_cond_embed_dim": 1280,
  "micro_cond_encode_dim": 256,
  "num_attention_heads": 16,
  "num_hidden_layers": 22,
  "num_res_blocks": 3,
  "sample_size": 32,
  "upsample": true,
  "use_bias": false,
  "vocab_size": 8256
}
transformer/diffusion_pytorch_model.fp16.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7e64c3b198ac7337c3d4bde158dd542976b204b35f9d420b918f231e9e8c6ce
size 1216570696
transformer/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1cb06f34bbaae81d7fce1ae7165c3062fa76cdfcc163c32180bfc26013f70672
size 2433086672
vqvae/config.json
ADDED
@@ -0,0 +1,39 @@
{
  "_class_name": "VQModel",
  "_diffusers_version": "0.25.0.dev0",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    256,
    512,
    768
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "in_channels": 3,
  "latent_channels": 64,
  "layers_per_block": 2,
  "lookup_from_codebook": true,
  "mid_block_add_attention": false,
  "norm_num_groups": 32,
  "norm_type": "group",
  "num_vq_embeddings": 8192,
  "out_channels": 3,
  "sample_size": 32,
  "scaling_factor": 0.18215,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ],
  "vq_embed_dim": null,
  "force_upcast": true
}
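
How the configs above fit together (informal note, not part of the uploaded files): the VQ-VAE has five
encoder levels, so images are downsampled by 2^4 = 16 and a 512x512 input becomes a 32x32 grid of
codebook indices, matching both models' sample_size of 32. The transformer's vocab_size (8256) is larger
than the 8192-entry codebook; the training script above uses vocab_size - 1 = 8255 as the mask token id.
A minimal sketch for checking these numbers with diffusers (paths are placeholders and assume a local
clone of this repository):

    from diffusers import UVit2DModel, VQModel

    vqvae = VQModel.from_pretrained(".", subfolder="vqvae")
    transformer = UVit2DModel.from_pretrained(".", subfolder="transformer")

    scale = 2 ** (len(vqvae.config.block_out_channels) - 1)   # 16
    print(512 // scale)                                       # 32 -> 32 * 32 = 1024 tokens per image
    print(transformer.config.vocab_size - 1)                  # 8255, used as the mask token id in training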