zideliu committed on
Commit
28c6826
1 Parent(s): 3104f87

StyleDrop init

This view is limited to 50 files because it contains too many changes.
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ __pycache__
+ *.ckpt
+ assets/ckpts
+ __pycache__/
+ *.sh
Dockerfile ADDED
@@ -0,0 +1,57 @@
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+ ENV DEBIAN_FRONTEND=noninteractive
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # ffmpeg \
+     ffmpeg \
+     x264 \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:${PATH}
+ WORKDIR ${HOME}/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ ENV PYTHON_VERSION=3.8.16
+ RUN pyenv install ${PYTHON_VERSION} && \
+     pyenv global ${PYTHON_VERSION} && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel
+
+ RUN pip install --no-cache-dir -U torch==1.12.1 torchvision==0.13.1
+ COPY --chown=1000 requirements.txt /tmp/requirements.txt
+ RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
+
+ COPY --chown=1000 . ${HOME}/app
+ # RUN cd Tune-A-Video && patch -p1 < ../patch
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+ CMD ["python", "app.py"]
README copy.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: StyleDrop Pytorch
+ emoji: 📊
+ colorFrom: purple
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 3.35.2
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,264 @@
1
+ import os
2
+ import gradio as gr
3
+ import open_clip
4
+ import torch
5
+ import taming.models.vqgan
6
+ import ml_collections
7
+ import einops
8
+ import random
9
+ import pathlib
10
+ import subprocess
11
+ import shlex
12
+ import wget
13
+ # Model
14
+ from libs.muse import MUSE
15
+ import utils
16
+ import numpy as np
17
+ from PIL import Image
18
+ print("cuda available:",torch.cuda.is_available())
19
+ print("cuda device count:",torch.cuda.device_count())
20
+ print("cuda device name:",torch.cuda.get_device_name(0))
21
+ print(os.system("nvidia-smi"))
22
+ print(os.system("nvcc --version"))
23
+
24
+ empty_context = np.load("assets/contexts/empty_context.npy")
25
+
26
+ print("downloading cc3m-285000.ckpt")
27
+ os.makedirs("assets/ckpts/cc3m-285000.ckpt",exist_ok=True)
28
+ os.system("ls")
29
+ wget.download("https://huggingface.co/nzl-thu/MUSE/resolve/main/assets/ckpts/cc3m-285000.ckpt/lr_scheduler.pth","assets/ckpts/cc3m-285000.ckpt/lr_scheduler.pth")
30
+ wget.download("https://huggingface.co/nzl-thu/MUSE/resolve/main/assets/ckpts/cc3m-285000.ckpt/optimizer.pth","assets/ckpts/cc3m-285000.ckpt/optimizer.pth")
31
+ wget.download("https://huggingface.co/nzl-thu/MUSE/resolve/main/assets/ckpts/cc3m-285000.ckpt/nnet.pth","assets/ckpts/cc3m-285000.ckpt/nnet.pth")
32
+ wget.download("https://huggingface.co/nzl-thu/MUSE/resolve/main/assets/ckpts/cc3m-285000.ckpt/nnet_ema.pth","assets/ckpts/cc3m-285000.ckpt/nnet_ema.pth")
33
+ wget.download("https://huggingface.co/nzl-thu/MUSE/resolve/main/assets/ckpts/cc3m-285000.ckpt/step.pth","assets/ckpts/cc3m-285000.ckpt/step.pth")
34
+ wget.download("https://huggingface.co/zideliu/vqgan/resolve/main/vqgan_jax_strongaug.ckpt","assets/vqgan_jax_strongaug.ckpt")
35
+
36
+ def set_seed(seed: int):
37
+ random.seed(seed)
38
+ np.random.seed(seed)
39
+ torch.manual_seed(seed)
40
+ torch.cuda.manual_seed_all(seed)
41
+
42
+ def d(**kwargs):
43
+ """Helper of creating a config dict."""
44
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
45
+
46
+ def get_config():
47
+ config = ml_collections.ConfigDict()
48
+ config.seed = 1234
49
+ config.z_shape = (8, 16, 16)
50
+
51
+ config.autoencoder = d(
52
+ config_file='vq-f16-jax.yaml',
53
+ )
54
+ config.resume_root="assets/ckpts/cc3m-285000.ckpt"
55
+ config.adapter_path=None
56
+ config.optimizer = d(
57
+ name='adamw',
58
+ lr=0.0002,
59
+ weight_decay=0.03,
60
+ betas=(0.99, 0.99),
61
+ )
62
+ config.lr_scheduler = d(
63
+ name='customized',
64
+ warmup_steps=5000
65
+ )
66
+ config.nnet = d(
67
+ name='uvit_t2i_vq',
68
+ img_size=16,
69
+ codebook_size=1024,
70
+ in_chans=4,
71
+ embed_dim=1152,
72
+ depth=28,
73
+ num_heads=16,
74
+ mlp_ratio=4,
75
+ qkv_bias=False,
76
+ clip_dim=1280,
77
+ num_clip_token=77,
78
+ use_checkpoint=True,
79
+ skip=True,
80
+ d_prj=32,
81
+ is_shared=False
82
+ )
83
+ config.muse = d(
84
+ ignore_ind=-1,
85
+ smoothing=0.1,
86
+ gen_temp=4.5
87
+ )
88
+ config.sample = d(
89
+ sample_steps=36,
90
+ n_samples=50,
91
+ mini_batch_size=8,
92
+ cfg=True,
93
+ linear_inc_scale=True,
94
+ scale=10.,
95
+ path='',
96
+ lambdaA=2.0, # Stage I: 2.0; Stage II: TODO
97
+ lambdaB=5.0, # Stage I: 5.0; Stage II: TODO
98
+ )
99
+ return config
100
+
101
+ def cfg_nnet(x, context, scale=None,lambdaA=None,lambdaB=None):
102
+ _cond = nnet_ema(x, context=context)
103
+ _cond_w_adapter = nnet_ema(x,context=context,use_adapter=True)
104
+ _empty_context = torch.tensor(empty_context, device=device)
105
+ _empty_context = einops.repeat(_empty_context, 'L D -> B L D', B=x.size(0))
106
+ _uncond = nnet_ema(x, context=_empty_context)
107
+ res = _cond + scale * (_cond - _uncond)
108
+ if lambdaA is not None:
109
+ res = _cond_w_adapter + lambdaA*(_cond_w_adapter - _cond) + lambdaB*(_cond - _uncond)
110
+ return res
111
+
112
+ def unprocess(x):
113
+ x.clamp_(0., 1.)
114
+ return x
115
+
116
+ config = get_config()
117
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
118
+
119
+ # Load open_clip and vq model
120
+ prompt_model,_,_ = open_clip.create_model_and_transforms('ViT-bigG-14', 'laion2b_s39b_b160k')
121
+ prompt_model = prompt_model.to(device)
122
+ prompt_model.eval()
123
+ tokenizer = open_clip.get_tokenizer('ViT-bigG-14')
124
+
125
+ vq_model = taming.models.vqgan.get_model('vq-f16-jax.yaml')
126
+ vq_model.eval()
127
+ vq_model.requires_grad_(False)
128
+ vq_model.to(device)
129
+
130
+ ## config
131
+
132
+ muse = MUSE(codebook_size=vq_model.n_embed, device=device, **config.muse)
133
+
134
+ train_state = utils.initialize_train_state(config, device)
135
+ train_state.resume(ckpt_root=config.resume_root)
136
+ nnet_ema = train_state.nnet_ema
137
+ nnet_ema.eval()
138
+ nnet_ema.requires_grad_(False)
139
+ nnet_ema.to(device)
140
+ style_ref = {
141
+ "None":None,
142
+ "0102":"style_adapter/0102.pth",
143
+ "0103":"style_adapter/0103.pth",
144
+ "0106":"style_adapter/0106.pth",
145
+ "0108":"style_adapter/0108.pth",
146
+ "0301":"style_adapter/0301.pth",
147
+ "0305":"style_adapter/0305.pth",
148
+ }
149
+ style_postfix ={
150
+ "None":"",
151
+ "0102":" in watercolor painting style",
152
+ "0103":" in watercolor painting style",
153
+ "0106":" in line drawing style",
154
+ "0108":" in oil painting style",
155
+ "0301":" in 3d rendering style",
156
+ "0305":" in kid crayon drawing style",
157
+ }
158
+
159
+ def decode(_batch):
160
+ return vq_model.decode_code(_batch)
161
+
162
+ def process(prompt,num_samples,lambdaA,lambdaB,style,seed,sample_steps,image=None):
163
+ config.sample.lambdaA = lambdaA
164
+ config.sample.lambdaB = lambdaB
165
+ config.sample.sample_steps = sample_steps
166
+ print(style)
167
+ adapter_path = style_ref[style]
168
+ adapter_postfix = style_postfix[style]
169
+ print(f"load adapter path: {adapter_path}")
170
+ if adapter_path is not None:
171
+ nnet_ema.adapter.load_state_dict(torch.load(adapter_path))
172
+ else:
173
+ config.sample.lambdaA=None
174
+ config.sample.lambdaB=None
175
+ print("load adapter Done!")
176
+ # Encode prompt
177
+ prompt = prompt+adapter_postfix
178
+ text_tokens = tokenizer(prompt).to(device)
179
+ text_embedding = prompt_model.encode_text(text_tokens)
180
+ text_embedding = text_embedding.repeat(num_samples, 1, 1) # B 77 1280
181
+ print(text_embedding.shape)
182
+
183
+ print(f"lambdaA: {lambdaA}, lambdaB: {lambdaB}, sample_steps: {sample_steps}")
184
+ if seed==-1:
185
+ seed = random.randint(0,65535)
186
+ config.seed = seed
187
+ print(f"seed: {seed}")
188
+ set_seed(config.seed)
189
+ res = muse.generate(config,num_samples,cfg_nnet,decode,is_eval=True,context=text_embedding)
190
+ print(res.shape)
191
+ res = (res*255+0.5).clamp_(0,255).permute(0,2,3,1).to('cpu',torch.uint8).numpy()
192
+ im = [res[i] for i in range(num_samples)]
193
+ return im
194
+
195
+ block = gr.Blocks()
196
+ with block:
197
+ with gr.Row():
198
+ gr.Markdown("## StyleDrop based on Muse (Inference Only) ")
199
+ with gr.Row():
200
+ with gr.Column():
201
+ prompt = gr.Textbox(label="Prompt")
202
+ run_button = gr.Button(label="Run")
203
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
204
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=1234)
205
+ style = gr.Radio(choices=["0102","0103","0106","0108","0305","None"],type="value",value="None",label="Style")
206
+
207
+ with gr.Accordion("Advanced options",open=False):
208
+ lambdaA = gr.Slider(label="lambdaA", minimum=0.0, maximum=5.0, value=2.0, step=0.01)
209
+ lambdaB = gr.Slider(label="lambdaB", minimum=0.0, maximum=10.0, value=5.0, step=0.01)
210
+ sample_steps = gr.Slider(label="Sample steps", minimum=1, maximum=50, value=36, step=1)
211
+ image=gr.Image(value=None)
212
+ with gr.Column():
213
+ result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(columns=2, height='auto')
214
+
215
+ with gr.Row():
216
+ examples = [
217
+ [
218
+ "A banana on the table",
219
+ 1,2.0,5.0,"0103",1234,36,
220
+ "data/image_01_03.jpg",
221
+ ],
222
+ [
223
+
224
+ "A cow",
225
+ 1,2.0,5.0,"0102",1234,36,
226
+ "data/image_01_02.jpg",
227
+ ],
228
+ [
229
+
230
+ "A portrait of tabby cat",
231
+ 1,2.0,5.0,"0106",1234,36,
232
+ "data/image_01_06.jpg",
233
+ ],
234
+ [
235
+
236
+ "A church in the field",
237
+ 1,2.0,5.0,"0108",1234,36,
238
+ "data/image_01_08.jpg",
239
+ ],
240
+ [
241
+
242
+ "A Christmas tree",
243
+ 1,2.0,5.0,"0305",1234,36,
244
+ "data/image_03_05.jpg",
245
+ ]
246
+
247
+ ]
248
+ gr.Examples(examples=examples,
249
+ fn=process,
250
+ inputs=[
251
+ prompt,
252
+ num_samples,lambdaA,lambdaB,style,seed,sample_steps,image,
253
+ ],
254
+ outputs=result_gallery,
255
+ cache_examples=os.getenv('SYSTEM') == 'spaces'
256
+ )
257
+ ips = [prompt,num_samples,lambdaA,lambdaB,style,seed,sample_steps,image]
258
+ run_button.click(
259
+ fn=process,
260
+ inputs=ips,
261
+ outputs=[result_gallery]
262
+ )
263
+ block.queue().launch(share=False)
264
+
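The heart of app.py is the guidance combination in cfg_nnet: with no adapter loaded it reduces to plain classifier-free guidance, and with a style adapter it adds a second difference term weighted by lambdaA on top of the prompt-vs-unconditional term weighted by lambdaB. A minimal, self-contained sketch of just that combination (the function name and the toy tensor shapes below are illustrative, not part of the repo):

import torch

def combine_guidance(cond, cond_w_adapter, uncond,
                     scale=10.0, lambdaA=2.0, lambdaB=5.0, use_adapter=True):
    """Combine the three logit branches the way cfg_nnet in app.py does.

    cond:           logits conditioned on the text prompt (adapter disabled)
    cond_w_adapter: logits conditioned on the prompt with the style adapter enabled
    uncond:         logits for the empty-context prompt
    """
    if not use_adapter:
        # ordinary classifier-free guidance
        return cond + scale * (cond - uncond)
    # style guidance: push toward the adapter branch (lambdaA) while keeping
    # prompt guidance against the unconditional branch (lambdaB)
    return cond_w_adapter + lambdaA * (cond_w_adapter - cond) + lambdaB * (cond - uncond)

# toy tensors standing in for nnet_ema outputs of shape (B, L, codebook_size)
B, L, V = 2, 256, 1024
cond, cond_adapter, uncond = (torch.randn(B, L, V) for _ in range(3))
logits = combine_guidance(cond, cond_adapter, uncond)
print(logits.shape)  # torch.Size([2, 256, 1024])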
assets/contexts/empty_context.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf06c46310efa57d47e34e5221ffa757dc6c60e91c8758fcb1d19040ee61e9fc
+ size 394368
assets/fid_stats/fid_stats_cc3m_val.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84605eaad681c8fdb13c5f96f9bcc7a7d8648e4e03023f2498aec7deb3ea3179
+ size 33571316
assets/fid_stats/fid_stats_imagenet256_guided_diffusion.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:374aa549982adbfd595eaecc8a014eea6566156f8b227fc2d9052c0482bb4a2f
+ size 33571316
assets/pipeline.png ADDED
configs/cc3m_xl_vqf16_jax_2048bs_featset_CLIP_G.py ADDED
@@ -0,0 +1,92 @@
1
+ import ml_collections
2
+
3
+
4
+ def d(**kwargs):
5
+ """Helper of creating a config dict."""
6
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
7
+
8
+
9
+ def get_config():
10
+ config = ml_collections.ConfigDict()
11
+
12
+ config.seed = 1234
13
+ config.z_shape = (8, 16, 16)
14
+
15
+ config.autoencoder = d(
16
+ config_file='vq-f16-jax.yaml',
17
+ )
18
+
19
+ config.train = d(
20
+ n_steps=999999999,
21
+ batch_size=2048,
22
+ log_interval=10,
23
+ eval_interval=5000,
24
+ save_interval=5000,
25
+ fid_interval=50000,
26
+ num_workers=8,
27
+ resampled=False,
28
+ )
29
+
30
+ config.eval = d(
31
+ n_samples=10000,
32
+ sample_steps=18,
33
+ )
34
+
35
+ config.optimizer = d(
36
+ name='adamw',
37
+ lr=0.0002,
38
+ weight_decay=0.03,
39
+ betas=(0.99, 0.99),
40
+ )
41
+
42
+ config.lr_scheduler = d(
43
+ name='customized',
44
+ warmup_steps=5000
45
+ )
46
+
47
+ config.nnet = d(
48
+ name='uvit_t2i_vq',
49
+ img_size=16,
50
+ codebook_size=1024,
51
+ in_chans=4,
52
+ embed_dim=1152,
53
+ depth=28,
54
+ num_heads=16,
55
+ mlp_ratio=4,
56
+ qkv_bias=False,
57
+ clip_dim=1280,
58
+ num_clip_token=77,
59
+ use_checkpoint=True,
60
+ skip=True,
61
+ )
62
+
63
+ config.muse = d(
64
+ ignore_ind=-1,
65
+ smoothing=0.1,
66
+ gen_temp=4.5
67
+ )
68
+
69
+ config.dataset = d(
70
+ name='cc3m_web',
71
+ cfg=True,
72
+ p_uncond=0.15,
73
+ )
74
+
75
+ config.wds = d(
76
+ train_data='assets/datasets/cc3m/vq_f16_jax_clipG_cc3m_train_emb/{00000..03044}.tar',
77
+ val_data='assets/datasets/cc3m/vq_f16_jax_clipG_cc3m_val_emb/{00000..00012}.tar',
78
+ ctx_path='assets/contexts',
79
+ dist_eval=True,
80
+ )
81
+
82
+ config.sample = d(
83
+ sample_steps=18,
84
+ n_samples=30000,
85
+ mini_batch_size=2,
86
+ cfg=True,
87
+ linear_inc_scale=True,
88
+ scale=10.,
89
+ path='',
90
+ )
91
+
92
+ return config
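This config and the ones that follow are all built from the same d(...) helper, which simply wraps keyword arguments in an ml_collections.ConfigDict; downstream code then reads values by attribute access and can override them after construction. A small sketch of that pattern (the field values are copied from the config above; the override at the end is only an example):

import ml_collections

def d(**kwargs):
    """Same helper as in the configs: wrap kwargs in a ConfigDict."""
    return ml_collections.ConfigDict(initial_dictionary=kwargs)

cfg = ml_collections.ConfigDict()
cfg.z_shape = (8, 16, 16)
cfg.nnet = d(name='uvit_t2i_vq', embed_dim=1152, depth=28)

print(cfg.nnet.embed_dim)  # attribute access: 1152
cfg.nnet.depth = 12        # existing fields can be overridden after construction
print(cfg)                 # nested, printable config for logging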
configs/custom.py ADDED
@@ -0,0 +1,83 @@
1
+ import ml_collections
2
+
3
+
4
+ def d(**kwargs):
5
+ """Helper of creating a config dict."""
6
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
7
+
8
+
9
+ def get_config():
10
+ config = ml_collections.ConfigDict()
11
+
12
+
13
+ config.seed = 1234
14
+ config.z_shape = (8, 16, 16)
15
+
16
+ config.autoencoder = d(
17
+ config_file='vq-f16-jax.yaml',
18
+ )
19
+ config.data_path="data/one_style.json"
20
+ config.resume_root="assets/ckpts/cc3m-285000.ckpt"
21
+ config.adapter_path=None
22
+ config.sample_interval=True
23
+ config.train = d(
24
+ n_steps=1000,
25
+ batch_size=8,
26
+ log_interval=20,
27
+ eval_interval=100,
28
+ save_interval=100,
29
+ fid_interval=20000,
30
+ num_workers=8,
31
+ resampled=False,
32
+ )
33
+
34
+ config.optimizer = d(
35
+ name='adamw',
36
+ lr=0.0003,
37
+ weight_decay=0.03,
38
+ betas=(0.99, 0.99),
39
+ )
40
+
41
+ config.lr_scheduler = d(
42
+ name='customized',
43
+ warmup_steps=-1, # 5000
44
+ )
45
+
46
+ config.nnet = d(
47
+ name='uvit_t2i_vq',
48
+ img_size=16,
49
+ codebook_size=1024,
50
+ in_chans=4,
51
+ embed_dim=1152,
52
+ depth=28,
53
+ num_heads=16,
54
+ mlp_ratio=4,
55
+ qkv_bias=False,
56
+ clip_dim=1280,
57
+ num_clip_token=77,
58
+ use_checkpoint=False,
59
+ skip=True,
60
+ d_prj=32,# Stage I: 32; Stage II: TODO
61
+ is_shared=False, # Stage I: False; Stage II: False
62
+ )
63
+
64
+ config.muse = d(
65
+ ignore_ind=-1,
66
+ smoothing=0.1,
67
+ gen_temp=4.5
68
+ )
69
+
70
+
71
+ config.sample = d(
72
+ sample_steps=36,
73
+ n_samples=50,
74
+ mini_batch_size=8,
75
+ cfg=True,
76
+ linear_inc_scale=True,
77
+ scale=10.,
78
+ path='',
79
+ lambdaA=2.0, # Stage I: 2.0; Stage II: TODO
80
+ lambdaB=5.0, # Stage I: 5.0; Stage II: TODO
81
+ )
82
+
83
+ return config
configs/imagenet256_base_vq_jax.py ADDED
@@ -0,0 +1,84 @@
1
+ import ml_collections
2
+
3
+
4
+ def d(**kwargs):
5
+ """Helper of creating a config dict."""
6
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
7
+
8
+
9
+ def get_config():
10
+ config = ml_collections.ConfigDict()
11
+
12
+ config.seed = 1234
13
+ config.z_shape = (8, 16, 16)
14
+
15
+ config.autoencoder = d(
16
+ config_file='vq-f16-jax.yaml',
17
+ )
18
+
19
+ config.train = d(
20
+ n_steps=99999999,
21
+ batch_size=2048,
22
+ log_interval=10,
23
+ eval_interval=5000,
24
+ save_interval=5000,
25
+ fid_interval=50000,
26
+ )
27
+
28
+ config.eval = d(
29
+ n_samples=10000,
30
+ sample_steps=12,
31
+ )
32
+
33
+ config.optimizer = d(
34
+ name='adamw',
35
+ lr=0.0004,
36
+ weight_decay=0.03,
37
+ betas=(0.99, 0.99),
38
+ )
39
+
40
+ config.lr_scheduler = d(
41
+ name='customized',
42
+ warmup_steps=5000
43
+ )
44
+
45
+ config.nnet = d(
46
+ name='uvit_vq',
47
+ img_size=16,
48
+ codebook_size=1024,
49
+ in_chans=256,
50
+ patch_size=1,
51
+ embed_dim=768,
52
+ depth=12,
53
+ num_heads=12,
54
+ mlp_ratio=4,
55
+ qkv_bias=False,
56
+ num_classes=1001,
57
+ use_checkpoint=False,
58
+ skip=True,
59
+ )
60
+
61
+ config.muse = d(
62
+ ignore_ind=-1,
63
+ smoothing=0.1,
64
+ gen_temp=4.5
65
+ )
66
+
67
+ config.dataset = d(
68
+ name='imagenet256_features',
69
+ path='assets/datasets/imagenet256_vq_features/vq-f16-jax',
70
+ cfg=True,
71
+ p_uncond=0.15,
72
+ )
73
+
74
+ config.sample = d(
75
+ sample_steps=12,
76
+ n_samples=50000,
77
+ mini_batch_size=50,
78
+ cfg=True,
79
+ linear_inc_scale=True,
80
+ scale=3.,
81
+ path=''
82
+ )
83
+
84
+ return config
configs/vae_configs/vq-f16-jax.yaml ADDED
@@ -0,0 +1,42 @@
+ model:
+   base_learning_rate: 4.5e-6
+   target: taming.models.vqgan.VQModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     ddconfig:
+       double_z: False
+       z_channels: 256
+       resolution: 256
+       in_channels: 3
+       out_ch: 3
+       ch: 128
+       ch_mult: [ 1,1,2,2,4]  # num_down = len(ch_mult)-1
+       num_res_blocks: 2
+       attn_resolutions: [16]
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
+       params:
+         disc_conditional: False
+         disc_in_channels: 3
+         disc_start: 250001
+         disc_weight: 0.8
+         codebook_weight: 1.0
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 8
+     num_workers: 24
+     train:
+       target: taming.data.imagenet.ImageNetTrain
+       params:
+         config:
+           size: 256
+     validation:
+       target: taming.data.imagenet.ImageNetValidation
+       params:
+         config:
+           size: 256
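app.py builds the tokenizer from this config via taming.models.vqgan.get_model('vq-f16-jax.yaml'), a helper in the taming fork bundled with this repo. A rough, hedged equivalent using the upstream taming-transformers API would look like the sketch below; the paths, the OmegaConf loading step, and the state-dict handling are illustrative assumptions, not the repo's exact code path:

import torch
from omegaconf import OmegaConf
from taming.models.vqgan import VQModel

# illustrative path; the repo resolves the config name internally
cfg = OmegaConf.load("configs/vae_configs/vq-f16-jax.yaml")
model = VQModel(**cfg.model.params)

# vqgan_jax_strongaug.ckpt is downloaded by app.py; the key layout is assumed here
state = torch.load("assets/vqgan_jax_strongaug.ckpt", map_location="cpu")
model.load_state_dict(state.get("state_dict", state), strict=False)
model.eval().requires_grad_(False)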
custom/custom_dataset.py ADDED
@@ -0,0 +1,233 @@
1
+
2
+ from torch.utils.data import Dataset
3
+
4
+ import os
5
+ import numpy as np
6
+ import taming.models.vqgan
7
+ import open_clip
8
+ import random
9
+ from PIL import Image
10
+ import torch
11
+ import math
12
+ import json
13
+ import torchvision.transforms as transforms
14
+ torch.manual_seed(0)
15
+ np.random.seed(0)
16
+
17
+ class test_custom_dataset(Dataset):
18
+
19
+ def __init__(self, style: str = None):
20
+ self.empty_context = np.load("assets/contexts/empty_context.npy")
21
+ self.object=[
22
+ "A chihuahua ",
23
+ "A tabby cat ",
24
+ "A portrait of chihuahua ",
25
+ "An apple on the table ",
26
+ "A banana on the table ",
27
+ "A church on the street ",
28
+ "A church in the mountain ",
29
+ "A church in the field ",
30
+ "A church on the beach ",
31
+ "A chihuahua walking on the street ",
32
+ "A tabby cat walking on the street",
33
+ "A portrait of tabby cat ",
34
+ "An apple on the dish ",
35
+ "A banana on the dish ",
36
+ "A human walking on the street ",
37
+ "A temple on the street ",
38
+ "A temple in the mountain ",
39
+ "A temple in the field ",
40
+ "A temple on the beach ",
41
+ "A chihuahua walking in the forest ",
42
+ "A tabby cat walking in the forest ",
43
+ "A portrait of human face ",
44
+ "An apple on the ground ",
45
+ "A banana on the ground ",
46
+ "A human walking in the forest ",
47
+ "A cabin on the street ",
48
+ "A cabin in the mountain ",
49
+ "A cabin in the field ",
50
+ "A cabin on the beach ",
51
+ ]
52
+ self.style = [
53
+ "in 3d rendering style",
54
+ ]
55
+ if style is not None:
56
+ self.style = [style]
57
+
58
+ def __getitem__(self, index):
59
+ prompt = self.object[index]+self.style[0]
60
+
61
+ return prompt, prompt
62
+
63
+ def __len__(self):
64
+ return len(self.object)
65
+
66
+ def unpreprocess(self, v): # to B C H W and [0, 1]
67
+ v.clamp_(0., 1.)
68
+ return v
69
+
70
+ @property
71
+ def fid_stat(self):
72
+ return f'assets/fid_stats/fid_stats_cc3m_val.npz'
73
+
74
+
75
+ class train_custom_dataset(Dataset):
76
+
77
+ def __init__(self, train_file: str=None, ):
78
+
79
+ self.train_img = json.load(open(train_file, 'r'))
80
+ self.path_preffix = "/".join(train_file.split("/")[:-1])
81
+ self.prompt = []
82
+ self.image = []
83
+ self.style = []
84
+ for im in self.train_img.keys():
85
+ im_path = os.path.join(self.path_preffix, im)
86
+ self.object = self.train_img[im][0]
87
+ self.style = self.train_img[im][1]
88
+ im_prompt = self.object +" "+self.style
89
+ self.image.append(im_path)
90
+ self.prompt.append(im_prompt)
91
+ self.empty_context = np.load("assets/contexts/empty_context.npy")
92
+
93
+ self.transform = transforms.Compose([
94
+ transforms.Resize((256, 256)),
95
+ transforms.RandomHorizontalFlip(),
96
+ # transforms.RandomVerticalFlip(),
97
+ transforms.ToTensor(),
98
+ ])
99
+ print("-----------------"*3)
100
+ print("train dataset length: ", len(self.prompt))
101
+ print("train dataset length: ", len(self.image))
102
+ print(self.prompt[0])
103
+ print(self.image[0])
104
+ print("-----------------"*3)
105
+ def __getitem__(self, index):
106
+ prompt = self.prompt[0]
107
+ image = Image.open(self.image[0]).convert("RGB")
108
+ image = self.transform(image)
109
+
110
+ return image,prompt
111
+ # return dict(img=image_embedding, text=text_embedding)
112
+
113
+ def __len__(self):
114
+ return 24
115
+
116
+ def unpreprocess(self, v): # to B C H W and [0, 1]
117
+ v.clamp_(0., 1.)
118
+ return v
119
+
120
+ @property
121
+ def fid_stat(self):
122
+ return f'assets/fid_stats/fid_stats_cc3m_val.npz'
123
+
124
+
125
+
126
+
127
+
128
+ class Discriptor(Dataset):
129
+ def __init__(self,style: str=None):
130
+ self.object =[
131
+ # "A parrot ",
132
+ # "A bird ",
133
+ # "A chihuahua in the snow",
134
+ # "A towel ",
135
+ # "A number '1' ",
136
+ # "A number '2' ",
137
+ # "A number '3' ",
138
+ # "A number '6' ",
139
+ # "A letter 'L' ",
140
+ # "A letter 'Z' ",
141
+ # "A letter 'D' ",
142
+ # "A rabbit ",
143
+ # "A train ",
144
+ # "A table ",
145
+ # "A dish ",
146
+ # "A large boat ",
147
+ # "A puppy ",
148
+ # "A cup ",
149
+ # "A watermelon ",
150
+ # "An apple ",
151
+ # "A banana ",
152
+ # "A chair ",
153
+ # "A Welsh Corgi ",
154
+ # "A cat ",
155
+ # "A house ",
156
+ # "A flower ",
157
+ # "A sunflower ",
158
+ # "A car ",
159
+ # "A jeep car ",
160
+ # "A truck ",
161
+ # "A Posche car ",
162
+ # "A vase ",
163
+ # "A chihuahua ",
164
+ # "A tabby cat ",
165
+ "A portrait of chihuahua ",
166
+ "An apple on the table ",
167
+ "A banana on the table ",
168
+ "A human ",
169
+ "A church on the street ",
170
+ "A church in the mountain ",
171
+ "A church in the field ",
172
+ "A church on the beach ",
173
+ "A chihuahua walking on the street ",
174
+ "A tabby cat walking on the street",
175
+ "A portrait of tabby cat ",
176
+ "An apple on the dish ",
177
+ "A banana on the dish ",
178
+ "A human walking on the street ",
179
+ "A temple on the street ",
180
+ "A temple in the mountain ",
181
+ "A temple in the field ",
182
+ "A temple on the beach ",
183
+ "A chihuahua walking in the forest ",
184
+ "A tabby cat walking in the forest ",
185
+ "A portrait of human face ",
186
+ "An apple on the ground ",
187
+ "A banana on the ground ",
188
+ "A human walking in the forest ",
189
+ "A cabin on the street ",
190
+ "A cabin in the mountain ",
191
+ "A cabin in the field ",
192
+ "A cabin on the beach ",
193
+ "A letter 'A' ",
194
+ "A letter 'B' ",
195
+ "A letter 'C' ",
196
+ "A letter 'D' ",
197
+ "A letter 'E' ",
198
+ "A letter 'F' ",
199
+ "A letter 'G' ",
200
+ "A butterfly ",
201
+ " A baby penguin ",
202
+ "A bench ",
203
+ "A boat ",
204
+ "A cow ",
205
+ "A hat ",
206
+ "A piano ",
207
+ "A robot ",
208
+ "A christmas tree ",
209
+ "A dog ",
210
+ "A moose ",
211
+ ]
212
+
213
+ self.style =[
214
+ "in 3d rendering style",
215
+ ]
216
+ if style is not None:
217
+ self.style = [style]
218
+
219
+ def __getitem__(self, index):
220
+ prompt = self.object[index]+self.style[0]
221
+ return prompt
222
+
223
+ def __len__(self):
224
+ return len(self.object)
225
+
226
+ def unpreprocess(self, v): # to B C H W and [0, 1]
227
+ v.clamp_(0., 1.)
228
+ return v
229
+
230
+ @property
231
+ def fid_stat(self):
232
+ return f'assets/fid_stats/fid_stats_cc3m_val.npz'
233
+
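train_custom_dataset reads a JSON file mapping an image name to an [object, style] pair, builds the caption as object + " " + style, and then keeps returning the same (image, prompt) pair; __len__ is fixed at 24, so a single reference image fills a whole epoch. A short sketch of how it would be consumed with a standard PyTorch DataLoader (the batch size and worker count are arbitrary choices, and the import assumes the repo root is on the Python path):

from torch.utils.data import DataLoader
from custom.custom_dataset import train_custom_dataset

# one reference image + caption, as in configs/custom.py (data_path="data/one_style.json")
dataset = train_custom_dataset(train_file="data/one_style.json")
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2)

for images, prompts in loader:
    # images: (B, 3, 256, 256) tensors in [0, 1]
    # prompts: B identical strings, e.g. "A house in watercolor painting style"
    print(images.shape, prompts[0])
    break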
data/data.json ADDED
@@ -0,0 +1,22 @@
+ {
+     "image_01_01.jpg": ["A bay", "in watercolor painting style"],
+     "image_01_02.jpg": ["A house", "in watercolor painting style"],
+     "image_01_03.jpg": ["A cat", "in watercolor painting style"],
+     "image_01_04.jpg": ["Flowers", "in watercolor painting style"],
+     "image_01_05.jpg": ["A village", "in oil painting style"],
+     "image_01_06.jpg": ["A village", "in line drawing style"],
+     "image_01_07.jpg": ["A portrait of a person", "in oil painting style"],
+     "image_01_08.jpg": ["A portrait of a person wearing a hat", "in oil painting style"],
+     "image_02_01.jpg": ["A person drowning into the phone", "in cartoon line drawing style"],
+     "image_02_02.jpg": ["A woman walking a dog", "in flat cartoon illustration style"],
+     "image_02_03.jpg": ["A woman working on a laptop", "in flat cartoon illustration style"],
+     "image_02_04.jpg": ["A Christmas tree", "in sticker style"],
+     "image_02_05.jpg": ["A wave", "in abstract rainbow colored flowing smoke wave design"],
+     "image_02_06.jpg": ["A mushroom", "in glowing style"],
+     "image_03_01.jpg": ["Slice of watermelon and clouds in the background", "in 3d rendering style"],
+     "image_03_03.jpg": ["A thumbs up", "in glowing 3d rendering style"],
+     "image_03_04.jpg": ["A woman", "in 3d rendering style"],
+     "image_03_05.jpg": ["A bear", "in kid crayon drawing style"],
+     "image_03_07.jpg": ["A flower", "in melting golden 3d rendering style"],
+     "image_03_08.jpg": ["A Viking face with beard", "in wooden sculpture"]
+ }
data/image_01_01.jpg ADDED

Git LFS Details

  • SHA256: 7b467d766af07216c77d933abfbd8fbf97efc69604f6d98f57da207609f5322b
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
data/image_01_02.jpg ADDED

Git LFS Details

  • SHA256: 426033b83f52843be0552d4b94453ad07141b29c7f21e0555ec9e3304d73e8ad
  • Pointer size: 131 Bytes
  • Size of remote file: 177 kB
data/image_01_03.jpg ADDED

Git LFS Details

  • SHA256: 2335c5df1ee92c60229fb5198ba0ceb02dc157fb4c3aaa3e191466577cc80eae
  • Pointer size: 131 Bytes
  • Size of remote file: 663 kB
data/image_01_04.jpg ADDED

Git LFS Details

  • SHA256: 92a4544523e35cbe5a23b67820f2e6257c5703d8edced66a584b002ec1865c02
  • Pointer size: 130 Bytes
  • Size of remote file: 35.1 kB
data/image_01_05.jpg ADDED

Git LFS Details

  • SHA256: 4d06b8a46a2878a25573c618f912929beffc0441f5a8d3f2e9ac3ae3217df94f
  • Pointer size: 131 Bytes
  • Size of remote file: 251 kB
data/image_01_06.jpg ADDED

Git LFS Details

  • SHA256: d02c652a5836154ceab17aec342dea76d06c4f6a23c964c45244426bf87fd0af
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
data/image_01_07.jpg ADDED

Git LFS Details

  • SHA256: 688a5e48e1208de644f2163a2b44d46a54b1ce3627407bebcf1f389c58a34c46
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
data/image_01_08.jpg ADDED

Git LFS Details

  • SHA256: 47632bf3a07a6c7630d032d64371ae58fb08469900eff849ae52b256948b6930
  • Pointer size: 131 Bytes
  • Size of remote file: 626 kB
data/image_02_01.jpg ADDED

Git LFS Details

  • SHA256: 3e3550da99d36ec1568f313c45401a72a17c42ac32801a2c507ff7d85d874716
  • Pointer size: 130 Bytes
  • Size of remote file: 71.9 kB
data/image_02_02.jpg ADDED

Git LFS Details

  • SHA256: 9768bcda5ec0953f20a954542232d0a0d630e681ffe96c92d05d49d2f8a22183
  • Pointer size: 131 Bytes
  • Size of remote file: 465 kB
data/image_02_03.jpg ADDED

Git LFS Details

  • SHA256: f07fe073d140d6dc2d4af9609ba73ba4750f46aa2304d2ffc171989d8c4fba78
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
data/image_02_04.jpg ADDED

Git LFS Details

  • SHA256: 57e5dcf39366c4da8727fff4c48214151b4d427033402f28e91e1a5e5384eeb8
  • Pointer size: 131 Bytes
  • Size of remote file: 481 kB
data/image_02_05.jpg ADDED

Git LFS Details

  • SHA256: 42c439155b17df9bab951a56f9e88e46c7c0109d345fc07553f62d7ccefbbc05
  • Pointer size: 130 Bytes
  • Size of remote file: 65.8 kB
data/image_02_06.jpg ADDED

Git LFS Details

  • SHA256: efb5a021a7fb5fdcb6e6ed7f8aa282e6a9ae50177a9d8199f82bba748f54d172
  • Pointer size: 131 Bytes
  • Size of remote file: 176 kB
data/image_03_01.jpg ADDED

Git LFS Details

  • SHA256: b490adc5a556bd5d2f68ef3a28d0ca85fbc8b0d04212df2f19d8a10001eb09a8
  • Pointer size: 131 Bytes
  • Size of remote file: 140 kB
data/image_03_03.jpg ADDED

Git LFS Details

  • SHA256: a1cdc7fa8d2c8ac873140c4b9c06d0df911063a9a8535d429ad0ddd50e8e7175
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
data/image_03_04.jpg ADDED

Git LFS Details

  • SHA256: d1bce51718f7a09b4e647df9a0e95f19ec2a18678c6d1f057a798828365a4c64
  • Pointer size: 131 Bytes
  • Size of remote file: 213 kB
data/image_03_05.jpg ADDED

Git LFS Details

  • SHA256: 53e41c6832e722d45170958160ffc4a632da969dc84a98d9fd608620e183825b
  • Pointer size: 131 Bytes
  • Size of remote file: 532 kB
data/image_03_07.jpg ADDED

Git LFS Details

  • SHA256: d41a949ddb0d7683c27dfd9be52b0dce62f7492a443bc6bdfa4a0e038af949a4
  • Pointer size: 130 Bytes
  • Size of remote file: 80 kB
data/image_03_08.jpg ADDED

Git LFS Details

  • SHA256: 17c9388900a405ffbd387114965c61b008b235c900393a99feecae4bb02675b5
  • Pointer size: 131 Bytes
  • Size of remote file: 419 kB
data/one_style.json ADDED
@@ -0,0 +1,3 @@
+ {
+     "image_01_02.jpg": ["A house", "in watercolor painting style"]
+ }
libs/__init__.py ADDED
@@ -0,0 +1 @@
+ # codes from third party
libs/muse.py ADDED
@@ -0,0 +1,107 @@
1
+ import numpy as np
2
+ import torch
3
+ import math
4
+ from einops import rearrange
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def add_gumbel_noise(t, temperature, device):
9
+ return (t + torch.Tensor(temperature * np.random.gumbel(size=t.shape)).to(device))
10
+
11
+
12
+ class MUSE(object):
13
+ def __init__(self, codebook_size, device, ignore_ind=-1, smoothing=0., gen_temp=4.5):
14
+ self.mask_ind = codebook_size # for input masking
15
+ self.ignore_ind = ignore_ind # for ce loss, excluding visible
16
+ self.device = device
17
+ self.smoothing = smoothing
18
+ self.gen_temp = gen_temp
19
+
20
+ @staticmethod
21
+ def cosine_schedule(t):
22
+ return torch.cos(t * math.pi * 0.5)
23
+
24
+ def sample(self, x0):
25
+ N, L, device = *x0.shape, self.device
26
+ timesteps = torch.zeros((N,), device=device).float().uniform_(0, 1)
27
+ rand_mask_probs = self.cosine_schedule(timesteps) # cosine schedule
28
+ num_token_masked = (L * rand_mask_probs).round().clamp(min=1)
29
+ batch_randperm = torch.rand(N, L, device=device).argsort(dim=-1)
30
+ mask = batch_randperm < rearrange(num_token_masked, 'b -> b 1')
31
+ masked_ids = torch.where(mask, self.mask_ind, x0)
32
+ labels = torch.where(mask, x0, self.ignore_ind)
33
+ return labels, masked_ids
34
+
35
+ def loss(self, pred, label):
36
+ return F.cross_entropy(pred.transpose(1, 2), label.long(),
37
+ ignore_index=self.ignore_ind, label_smoothing=self.smoothing)
38
+
39
+ @torch.no_grad()
40
+ def generate(self, config, _n_samples, nnet, decode_fn, is_eval=False, **kwargs):
41
+ fmap_size, _sample_steps, device = config.z_shape[-1], config.sample.sample_steps, self.device
42
+
43
+ seq_len = fmap_size ** 2
44
+ ids = torch.full((_n_samples, seq_len), self.mask_ind, dtype=torch.long, device=device)
45
+ cfg_scale = 0.
46
+ for step in range(_sample_steps):
47
+ ratio = 1. * (step + 1) / _sample_steps
48
+ annealed_temp = self.gen_temp * (1 - ratio)
49
+ is_mask = (ids == self.mask_ind)
50
+ logits = nnet(ids, **kwargs, scale=cfg_scale)
51
+ # sampling & scoring
52
+ sampled_ids = add_gumbel_noise(logits, annealed_temp, device).argmax(dim=-1)
53
+ sampled_logits = torch.squeeze(
54
+ torch.gather(logits, dim=-1, index=torch.unsqueeze(sampled_ids, -1)), -1)
55
+ sampled_ids = torch.where(is_mask, sampled_ids, ids)
56
+ sampled_logits = torch.where(is_mask, sampled_logits, +np.inf).float()
57
+ # masking
58
+ mask_ratio = np.cos(ratio * math.pi * 0.5)
59
+ mask_len = torch.Tensor([np.floor(seq_len * mask_ratio)]).to(device)
60
+ mask_len = torch.maximum(torch.Tensor([1]).to(device),
61
+ torch.minimum(torch.sum(is_mask, dim=-1, keepdims=True) - 1,
62
+ mask_len))[0].squeeze()
63
+ confidence = add_gumbel_noise(sampled_logits, annealed_temp, device)
64
+ sorted_confidence, _ = torch.sort(confidence, axis=-1)
65
+ cut_off = sorted_confidence[:, mask_len.long() - 1:mask_len.long()]
66
+ masking = (confidence <= cut_off)
67
+ ids = torch.where(masking, self.mask_ind, sampled_ids)
68
+ cfg_scale = ratio * config.sample.scale
69
+
70
+ _z1 = rearrange(sampled_ids, 'b (i j) -> b i j', i=fmap_size, j=fmap_size)
71
+
72
+ # with adapter
73
+ ids = torch.full((_n_samples, seq_len), self.mask_ind, dtype=torch.long, device=device)
74
+ cfg_scale = 0.
75
+ lambdaA=0.
76
+ lambdaB=0.
77
+ for step in range(_sample_steps):
78
+ ratio = 1. * (step + 1) / _sample_steps
79
+ annealed_temp = self.gen_temp * (1 - ratio)
80
+ is_mask = (ids == self.mask_ind)
81
+ # 尝试使用 *ratio
82
+ logits = nnet(ids, **kwargs, scale=cfg_scale,lambdaA=lambdaA,lambdaB=lambdaB)
83
+ # sampling & scoring
84
+ sampled_ids = add_gumbel_noise(logits, annealed_temp, device).argmax(dim=-1)
85
+ sampled_logits = torch.squeeze(
86
+ torch.gather(logits, dim=-1, index=torch.unsqueeze(sampled_ids, -1)), -1)
87
+ sampled_ids = torch.where(is_mask, sampled_ids, ids)
88
+ sampled_logits = torch.where(is_mask, sampled_logits, +np.inf).float()
89
+ # masking
90
+ mask_ratio = np.cos(ratio * math.pi * 0.5)
91
+ mask_len = torch.Tensor([np.floor(seq_len * mask_ratio)]).to(device)
92
+ mask_len = torch.maximum(torch.Tensor([1]).to(device),
93
+ torch.minimum(torch.sum(is_mask, dim=-1, keepdims=True) - 1,
94
+ mask_len))[0].squeeze()
95
+ confidence = add_gumbel_noise(sampled_logits, annealed_temp, device)
96
+ sorted_confidence, _ = torch.sort(confidence, axis=-1)
97
+ cut_off = sorted_confidence[:, mask_len.long() - 1:mask_len.long()]
98
+ masking = (confidence <= cut_off)
99
+ ids = torch.where(masking, self.mask_ind, sampled_ids)
100
+ cfg_scale = ratio * config.sample.scale
101
+ lambdaA = config.sample.lambdaA
102
+ lambdaB = config.sample.lambdaB
103
+
104
+ _z2 = rearrange(sampled_ids, 'b (i j) -> b i j', i=fmap_size, j=fmap_size)
105
+ _z = _z2 if is_eval else torch.cat([_z1,_z2],dim=0)
106
+ out = decode_fn(_z)
107
+ return out
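The sampling loop in MUSE.generate re-masks tokens according to a cosine schedule: at step t of sample_steps, roughly cos(ratio * pi / 2) of the 256 VQ tokens stay masked, so early steps commit only a few high-confidence tokens and the rest are filled in later. A simplified numeric sketch of that schedule (the real loop additionally clamps the count between 1 and the number of still-masked tokens minus one):

import math
import numpy as np

def tokens_still_masked(step, sample_steps=36, seq_len=16 * 16):
    """How many of the 256 VQ tokens remain masked after a given step,
    following the cosine schedule in MUSE.generate."""
    ratio = (step + 1) / sample_steps
    return int(np.floor(seq_len * math.cos(ratio * math.pi * 0.5)))

for step in (0, 8, 17, 26, 35):
    print(step, tokens_still_masked(step))
# the count decays from ~255 down to 0 over the 36 steps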
libs/uvit_t2i_vq.py ADDED
@@ -0,0 +1,282 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
+ from loguru import logger
6
+
7
+ import timm
8
+ from timm.models.layers import trunc_normal_
9
+ from timm.models.vision_transformer import PatchEmbed, Mlp
10
+
11
+ assert timm.__version__ == "0.3.2" # version check
12
+ import einops
13
+ import torch.utils.checkpoint
14
+ import torch.nn.functional as F
15
+
16
+ try:
17
+ import xformers
18
+ import xformers.ops
19
+
20
+ XFORMERS_IS_AVAILBLE = True
21
+ print("xformers available, will use xformers attention")
22
+ except:
23
+ XFORMERS_IS_AVAILBLE = False
24
+ print("xformers not available, will use pytorch attention instead")
25
+
26
+ class BertEmbeddings(nn.Module):
27
+ """Construct the embeddings from word, position and token_type embeddings."""
28
+
29
+ def __init__(self, vocab_size, hidden_size, max_position_embeddings, dropout=0.1):
30
+ super().__init__()
31
+ self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
32
+ self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
33
+
34
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
35
+ # any TensorFlow checkpoint file
36
+ self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-6)
37
+ self.dropout = nn.Dropout(dropout)
38
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
39
+ self.register_buffer("position_ids", torch.arange(max_position_embeddings).expand((1, -1)))
40
+
41
+ torch.nn.init.normal_(self.word_embeddings.weight, std=.02)
42
+ torch.nn.init.normal_(self.position_embeddings.weight, std=.02)
43
+
44
+ def forward(
45
+ self, input_ids
46
+ ):
47
+ input_shape = input_ids.size()
48
+
49
+ seq_length = input_shape[1]
50
+
51
+ position_ids = self.position_ids[:, :seq_length]
52
+
53
+ inputs_embeds = self.word_embeddings(input_ids)
54
+
55
+ position_embeddings = self.position_embeddings(position_ids)
56
+ embeddings = inputs_embeds + position_embeddings
57
+
58
+ embeddings = self.LayerNorm(embeddings)
59
+ embeddings = self.dropout(embeddings)
60
+ return embeddings
61
+
62
+
63
+ class MlmLayer(nn.Module):
64
+
65
+ def __init__(self, feat_emb_dim, word_emb_dim, vocab_size):
66
+ super().__init__()
67
+ self.fc = nn.Linear(feat_emb_dim, word_emb_dim)
68
+ self.gelu = nn.GELU()
69
+ self.ln = nn.LayerNorm(word_emb_dim)
70
+ self.bias = nn.Parameter(torch.zeros(1, 1, vocab_size))
71
+
72
+ def forward(self, x, word_embeddings):
73
+ mlm_hidden = self.fc(x)
74
+ mlm_hidden = self.gelu(mlm_hidden)
75
+ mlm_hidden = self.ln(mlm_hidden)
76
+ word_embeddings = word_embeddings.transpose(0, 1)
77
+ logits = torch.matmul(mlm_hidden, word_embeddings)
78
+ logits = logits + self.bias
79
+ return logits
80
+
81
+
82
+ class Attention(nn.Module):
83
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
84
+ super().__init__()
85
+ self.num_heads = num_heads
86
+ head_dim = dim // num_heads
87
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
88
+ self.scale = qk_scale or head_dim ** -0.5
89
+
90
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
91
+ self.attn_drop = nn.Dropout(attn_drop)
92
+ self.proj = nn.Linear(dim, dim)
93
+ self.proj_drop = nn.Dropout(proj_drop)
94
+
95
+ def forward(self, x):
96
+ B, N, C = x.shape
97
+ if XFORMERS_IS_AVAILBLE:
98
+ qkv = self.qkv(x)
99
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads)
100
+ q, k, v = qkv[0], qkv[1], qkv[2] # B L H D
101
+ x = xformers.ops.memory_efficient_attention(q, k, v)
102
+ x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads)
103
+ else:
104
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
105
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
106
+
107
+ attn = (q @ k.transpose(-2, -1)) * self.scale
108
+ attn = attn.softmax(dim=-1)
109
+ attn = self.attn_drop(attn)
110
+
111
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
112
+
113
+ x = self.proj(x)
114
+ x = self.proj_drop(x)
115
+ return x
116
+
117
+ class Adapter(nn.Module):
118
+ def __init__(self, d_emb:int, d_prj:int,n_layer: int, is_shared: bool):
119
+ super().__init__()
120
+ self.D = d_emb
121
+ self.H = d_prj
122
+ self.L = n_layer
123
+ self.is_shared = is_shared
124
+ if self.is_shared:
125
+ self.DD = nn.Embedding(self.L,self.H)
126
+ self.DU = nn.Embedding(self.L,self.D)
127
+ self.WD = nn.Embedding(1,self.D*self.H)
128
+ self.WU = nn.Embedding(1,self.H*self.D)
129
+ else:
130
+ self.WD = nn.Embedding(self.L,self.D*self.H)
131
+ self.WU = nn.Embedding(self.L,self.H*self.D)
132
+ self.activate = nn.GELU()
133
+
134
+ self._init_weights()
135
+ def _init_weights(self):
136
+ for p in self.WU.parameters():
137
+ p.detach().zero_()
138
+ nn.init.trunc_normal_(self.WD.weight,mean=0,std=0.02)
139
+
140
+ if self.is_shared:
141
+ nn.init.trunc_normal_(self.DD.weight,mean=0,std=0.02)
142
+ for p in self.DU.parameters():
143
+ p.detach().zero_()
144
+
145
+ def forward(self, emb, layer):
146
+ idx = torch.arange(self.L).to(emb.device)
147
+ layer = torch.tensor(layer).to(emb.device)
148
+ if self.is_shared:
149
+ idx0 = torch.zeros_like(idx).to(emb.device)
150
+ dd = self.DD(idx).reshape(self.L, 1,self.H)
151
+ du = self.DU(idx).reshape(self.L, 1,self.D)
152
+ wd = self.WD(idx0).reshape(self.L, self.D,self.H) + dd
153
+ wu = self.WU(idx0).reshape(self.L, self.H,self.D) + du
154
+ else:
155
+ wd = self.WD(idx).reshape(self.L, self.D,self.H)
156
+ wu = self.WU(idx).reshape(self.L, self.H,self.D)
157
+
158
+ prj = torch.einsum('...d,dh->...h',emb,wd[layer])
159
+ prj = self.activate(prj)
160
+ prj = torch.einsum('...h,hd->...d',prj,wu[layer])
161
+ return emb + prj
162
+ class Block(nn.Module):
163
+
164
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
165
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False):
166
+ super().__init__()
167
+ self.norm1 = norm_layer(dim)
168
+ self.attn = Attention(
169
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale)
170
+ self.norm2 = norm_layer(dim)
171
+ mlp_hidden_dim = int(dim * mlp_ratio)
172
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
173
+ self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
174
+ self.use_checkpoint = use_checkpoint
175
+
176
+ def forward(self, x, skip=None, adapter=None, layer=None):
177
+ if self.use_checkpoint:
178
+ return torch.utils.checkpoint.checkpoint(self._forward, x, skip, adapter, layer)
179
+ else:
180
+ return self._forward(x, skip, adapter, layer)
181
+
182
+ def _forward(self, x, skip=None,adapter=None, layer=None):
183
+ if self.skip_linear is not None:
184
+ x = self.skip_linear(torch.cat([x, skip], dim=-1))
185
+
186
+ attn = self.attn(self.norm1(x))
187
+ if adapter is not None:
188
+ attn = adapter(attn, layer)
189
+
190
+ x = x + attn
191
+ x = x + self.mlp(self.norm2(x))
192
+ return x
193
+
194
+
195
+ class UViT(nn.Module):
196
+ def __init__(self, img_size=16, in_chans=8, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
197
+ qkv_bias=False, qk_scale=None, norm_layer=nn.LayerNorm, use_checkpoint=False,
198
+ clip_dim=768, num_clip_token=77, skip=True, codebook_size=1024,d_prj=4,is_shared=True):
199
+ super().__init__()
200
+ logger.debug(f'codebook size in nnet: {codebook_size}')
201
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
202
+ self.in_chans = in_chans
203
+ self.skip = skip
204
+
205
+ self.codebook_size = codebook_size
206
+ vocab_size = codebook_size + 1
207
+ self.time_embed = None
208
+ self.extras = num_clip_token
209
+ self.num_vis_tokens = int((img_size) ** 2)
210
+ self.token_emb = BertEmbeddings(vocab_size=vocab_size,
211
+ hidden_size=embed_dim,
212
+ max_position_embeddings=self.num_vis_tokens,
213
+ dropout=0.1)
214
+ print(f'num vis tokens: {self.num_vis_tokens}')
215
+
216
+ self.context_embed = nn.Linear(clip_dim, embed_dim)
217
+
218
+ self.in_blocks = nn.ModuleList([
219
+ Block(
220
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
221
+ norm_layer=norm_layer, use_checkpoint=use_checkpoint)
222
+ for _ in range(depth // 2)])
223
+
224
+ self.mid_block = Block(
225
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
226
+ norm_layer=norm_layer, use_checkpoint=use_checkpoint)
227
+
228
+ self.out_blocks = nn.ModuleList([
229
+ Block(
230
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
231
+ norm_layer=norm_layer, skip=skip, use_checkpoint=use_checkpoint)
232
+ for _ in range(depth // 2)])
233
+
234
+ self.norm = norm_layer(embed_dim)
235
+ self.mlm_layer = MlmLayer(feat_emb_dim=embed_dim, word_emb_dim=embed_dim, vocab_size=vocab_size)
236
+ self.adapter = Adapter(d_emb=embed_dim, d_prj=d_prj, n_layer=depth, is_shared=is_shared)
237
+ self.apply(self._init_weights)
238
+
239
+ def _init_weights(self, m):
240
+ if isinstance(m, nn.Linear):
241
+ trunc_normal_(m.weight, std=.02)
242
+ if isinstance(m, nn.Linear) and m.bias is not None:
243
+ nn.init.constant_(m.bias, 0)
244
+ elif isinstance(m, nn.LayerNorm):
245
+ nn.init.constant_(m.bias, 0)
246
+ nn.init.constant_(m.weight, 1.0)
247
+
248
+ @torch.jit.ignore # type: ignore
249
+ def no_weight_decay(self):
250
+ return {'pos_embed'}
251
+
252
+ def forward(self, masked_ids, context,use_adapter=False):
253
+ assert len(masked_ids.shape) == 2
254
+ x = self.token_emb(masked_ids)
255
+ context_token = self.context_embed(context.type_as(x))
256
+ x = torch.cat((context_token, x), dim=1)
257
+
258
+ layer=0
259
+
260
+ if self.skip:
261
+ skips = []
262
+ for blk in self.in_blocks:
263
+ # 将adapter放在attention之后
264
+ x = blk(x,adapter=self.adapter if use_adapter else None,layer=layer)
265
+ if self.skip:
266
+ skips.append(x)# type: ignore
267
+ layer+=1
268
+
269
+ x = self.mid_block(x)
270
+
271
+ for blk in self.out_blocks:
272
+ if self.skip:
273
+ x = blk(x, skips.pop(),adapter = self.adapter if use_adapter else None,layer=layer)# type: ignore
274
+ else:
275
+ x = blk(x,adapter = self.adapter if use_adapter else None,layer=layer)
276
+
277
+ x = self.norm(x)
278
+
279
+ word_embeddings = self.token_emb.word_embeddings.weight.data.detach()
280
+ x = self.mlm_layer(x, word_embeddings)
281
+ x = x[:, self.extras:, :self.codebook_size]
282
+ return x
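The Adapter above is the only module StyleDrop trains: for each transformer layer it learns a down-projection to d_prj dims and a zero-initialized up-projection back to the embedding width, applied residually to the attention output. A quick sketch, using the dimensions from configs/custom.py (embed_dim=1152, d_prj=32, depth=28, is_shared=False), to instantiate it and count its trainable parameters; it assumes the repo's environment (timm 0.3.2, loguru), since importing the module pulls those in:

import torch
from libs.uvit_t2i_vq import Adapter

# dimensions taken from configs/custom.py
adapter = Adapter(d_emb=1152, d_prj=32, n_layer=28, is_shared=False)

n_params = sum(p.numel() for p in adapter.parameters())
print(f"adapter parameters: {n_params / 1e6:.2f}M")  # 2 * 28 * 1152 * 32 ≈ 2.06M

# one adapter call: project a (B, L, D) activation down to 32 dims and back,
# with a residual connection, using the weights of layer 0
x = torch.randn(2, 333, 1152)   # 77 context tokens + 256 visual tokens
y = adapter(x, layer=0)
print(y.shape)                  # torch.Size([2, 333, 1152])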
libs/uvit_vq.py ADDED
@@ -0,0 +1,264 @@
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import math
6
+
7
+ from loguru import logger
8
+
9
+ import timm
10
+ from timm.models.layers import trunc_normal_
11
+ from timm.models.vision_transformer import PatchEmbed, Mlp
12
+
13
+ assert timm.__version__ == "0.3.2" # version check
14
+ import einops
15
+ import torch.utils.checkpoint
16
+ import torch.nn.functional as F
17
+
18
+ try:
19
+ import xformers
20
+ import xformers.ops
21
+
22
+ XFORMERS_IS_AVAILBLE = True
23
+ except:
24
+ XFORMERS_IS_AVAILBLE = False
25
+
26
+
27
+ class BertEmbeddings(nn.Module):
28
+ """Construct the embeddings from word, position and token_type embeddings."""
29
+
30
+ def __init__(self, vocab_size, hidden_size, max_position_embeddings, dropout=0.1):
31
+ super().__init__()
32
+ self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
33
+ self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
34
+
35
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
36
+ # any TensorFlow checkpoint file
37
+ self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-6)
38
+ self.dropout = nn.Dropout(dropout)
39
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
40
+ self.register_buffer("position_ids", torch.arange(max_position_embeddings).expand((1, -1)))
41
+
42
+ torch.nn.init.normal_(self.word_embeddings.weight, std=.02)
43
+ torch.nn.init.normal_(self.position_embeddings.weight, std=.02)
44
+
45
+ def forward(
46
+ self, input_ids
47
+ ):
48
+ input_shape = input_ids.size()
49
+
50
+ seq_length = input_shape[1]
51
+
52
+ position_ids = self.position_ids[:, :seq_length]
53
+
54
+ inputs_embeds = self.word_embeddings(input_ids)
55
+
56
+ position_embeddings = self.position_embeddings(position_ids)
57
+ embeddings = inputs_embeds + position_embeddings
58
+
59
+ embeddings = self.LayerNorm(embeddings)
60
+ embeddings = self.dropout(embeddings)
61
+ return embeddings
62
+
63
+
64
+ class MlmLayer(nn.Module):
65
+
66
+ def __init__(self, feat_emb_dim, word_emb_dim, vocab_size):
67
+ super().__init__()
68
+ self.fc = nn.Linear(feat_emb_dim, word_emb_dim)
69
+ self.gelu = nn.GELU()
70
+ self.ln = nn.LayerNorm(word_emb_dim)
71
+ self.bias = nn.Parameter(torch.zeros(1, 1, vocab_size))
72
+
73
+ def forward(self, x, word_embeddings):
74
+ mlm_hidden = self.fc(x)
75
+ mlm_hidden = self.gelu(mlm_hidden)
76
+ mlm_hidden = self.ln(mlm_hidden)
77
+ word_embeddings = word_embeddings.transpose(0, 1)
78
+ logits = torch.matmul(mlm_hidden, word_embeddings)
79
+ logits = logits + self.bias
80
+ return logits
81
+
82
+
83
+ def patchify(imgs, patch_size):
84
+ x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
85
+ return x
86
+
87
+
88
+ def unpatchify(x, channels=3, flatten=False):
89
+ patch_size = int((x.shape[2] // channels) ** 0.5)
90
+ h = w = int(x.shape[1] ** .5)
91
+ assert h * w == x.shape[1] and patch_size ** 2 * channels == x.shape[2]
92
+ if flatten:
93
+ x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B (h p1 w p2) C', h=h, p1=patch_size, p2=patch_size)
94
+ else:
95
+ x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size)
96
+ return x
97
+
98
+
99
+ class Attention(nn.Module):
100
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
101
+ super().__init__()
102
+ self.num_heads = num_heads
103
+ head_dim = dim // num_heads
104
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
105
+ self.scale = qk_scale or head_dim ** -0.5
106
+
107
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
108
+ self.attn_drop = nn.Dropout(attn_drop)
109
+ self.proj = nn.Linear(dim, dim)
110
+ self.proj_drop = nn.Dropout(proj_drop)
111
+
112
+ def forward(self, x):
113
+ B, N, C = x.shape
114
+ if XFORMERS_IS_AVAILBLE:
115
+ qkv = self.qkv(x)
116
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads)
117
+ q, k, v = qkv[0], qkv[1], qkv[2] # B L H D
118
+ x = xformers.ops.memory_efficient_attention(q, k, v)
119
+ x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads)
120
+ else:
121
+         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+         q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+         attn = (q @ k.transpose(-2, -1)) * self.scale
+         attn = attn.softmax(dim=-1)
+         attn = self.attn_drop(attn)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class Block(nn.Module):
+
+     def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
+                  act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False):
+         super().__init__()
+         self.norm1 = norm_layer(dim)
+         self.attn = Attention(
+             dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale)
+         self.norm2 = norm_layer(dim)
+         mlp_hidden_dim = int(dim * mlp_ratio)
+         self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
+         self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
+         self.use_checkpoint = use_checkpoint
+
+     def forward(self, x, skip=None):
+         if self.use_checkpoint:
+             return torch.utils.checkpoint.checkpoint(self._forward, x, skip)
+         else:
+             return self._forward(x, skip)
+
+     def _forward(self, x, skip=None):
+         if self.skip_linear is not None:
+             x = self.skip_linear(torch.cat([x, skip], dim=-1))
+         x = x + self.attn(self.norm1(x))
+         x = x + self.mlp(self.norm2(x))
+         return x
+
+
+ class UViT(nn.Module):
+     def __init__(self, img_size=16, patch_size=1, in_chans=8, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
+                  qkv_bias=False, qk_scale=None, norm_layer=nn.LayerNorm, num_classes=-1,
+                  use_checkpoint=False, skip=True, codebook_size=1024):
+         super().__init__()
+         self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+         self.num_classes = num_classes
+         self.in_chans = in_chans
+         self.skip = skip
+
+         logger.debug(f'codebook size in nnet: {codebook_size}')
+         self.codebook_size = codebook_size
+         if num_classes > 0:
+             self.extras = 1
+             vocab_size = codebook_size + num_classes + 1
+         else:
+             self.extras = 0
+             vocab_size = codebook_size + 1
+
+         self.token_emb = BertEmbeddings(vocab_size=vocab_size,
+                                         hidden_size=embed_dim,
+                                         max_position_embeddings=int(img_size ** 2) + self.extras,
+                                         dropout=0.1)
+         logger.debug(f'token emb weight shape: {self.token_emb.word_embeddings.weight.shape}')
+
+         if patch_size != 1:  # downsamp
+             self.patch_embed = PatchEmbed(
+                 img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim, input_shape='bhwc')
+             logger.debug(f'patch emb weight shape: {self.patch_embed.proj.weight.shape}')
+             self.decoder_pred = nn.Linear(embed_dim, patch_size ** 2 * embed_dim, bias=True)
+         else:
+             self.patch_embed = None
+             self.decoder_pred = None
+
+         self.in_blocks = nn.ModuleList([
+             Block(
+                 dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                 norm_layer=norm_layer, use_checkpoint=use_checkpoint)
+             for _ in range(depth // 2)])
+
+         self.mid_block = Block(
+             dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+             norm_layer=norm_layer, use_checkpoint=use_checkpoint)
+
+         self.out_blocks = nn.ModuleList([
+             Block(
+                 dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                 norm_layer=norm_layer, skip=skip, use_checkpoint=use_checkpoint)
+             for _ in range(depth // 2)])
+
+         self.norm = norm_layer(embed_dim)
+         self.mlm_layer = MlmLayer(feat_emb_dim=embed_dim, word_emb_dim=embed_dim, vocab_size=vocab_size)
+
+         self.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=.02)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     @torch.jit.ignore
+     def no_weight_decay(self):
+         return {'pos_embed'}
+
+     def forward(self, x, context=None):
+         assert len(x.shape) == 2
+         if context is not None:
+             context = context + self.codebook_size + 1  # shift, mask token is self.codebook_size
+             x = torch.cat((context, x), dim=1)
+         x = self.token_emb(x.long())
+         if self.patch_embed is not None:
+             featmap_downsampled = self.patch_embed(
+                 x[:, self.extras:].reshape(-1, *self.patch_embed.img_size, self.embed_dim)).reshape(x.shape[0], -1, self.embed_dim)
+             x = torch.cat((x[:, :self.extras], featmap_downsampled), dim=1)
+
+         if self.skip:
+             skips = []
+         for blk in self.in_blocks:
+             x = blk(x)
+             if self.skip:
+                 skips.append(x)
+
+         x = self.mid_block(x)
+
+         for blk in self.out_blocks:
+             if self.skip:
+                 x = blk(x, skips.pop())
+             else:
+                 x = blk(x)
+
+         x = self.norm(x)
+         if self.decoder_pred is not None:
+             featmap_upsampled = unpatchify(self.decoder_pred(x[:, self.extras:]), self.embed_dim, flatten=True)
+             x = torch.cat((x[:, :self.extras], featmap_upsampled), dim=1)
+         word_embeddings = self.token_emb.word_embeddings.weight.data.detach()
+         x = self.mlm_layer(x, word_embeddings)
+         x = x[:, self.extras:, :self.codebook_size]
+         return x
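
A minimal smoke test of the UViT defined above may help orientation. This is a sketch only: the import path is hypothetical, and the reduced depth/width values are illustrative rather than the training configuration. With the default num_classes=-1, the model takes a flat grid of VQ token indices (id codebook_size acting as the mask token) and returns per-position logits over the codebook:

    # Sketch: exercise the UViT above on random VQ token indices.
    import torch
    from libs.muse_uvit import UViT  # hypothetical module path for the file above

    nnet = UViT(img_size=16, patch_size=1, embed_dim=256, depth=4, num_heads=4, codebook_size=1024)
    tokens = torch.randint(0, 1024 + 1, (2, 16 * 16))   # id 1024 plays the role of the mask token
    logits = nnet(tokens)                                # -> torch.Size([2, 256, 1024]), scores over the codebook
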
open_clip/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from .coca_model import CoCa
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss
+ from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
+     convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
+ from .openai import load_openai_model, list_openai_models
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
+     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
+ from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
+ from .tokenizer import SimpleTokenizer, tokenize, decode
+ from .transform import image_transform, AugmentationCfg
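
The re-exports above make the vendored package usable as a drop-in open_clip. A minimal sketch follows; the model name, pretrained tag, and image path are placeholders, and note that in this vendored copy CLIP.encode_text returns per-token features because the EOT pooling line in model.py is commented out:

    import torch
    from PIL import Image
    import open_clip

    model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained='openai')
    tokenizer = open_clip.get_tokenizer('ViT-B-16')

    image = preprocess(Image.open('style.jpg')).unsqueeze(0)        # placeholder image
    text = tokenizer(['a house in watercolor painting style'])

    with torch.no_grad():
        image_features = model.encode_image(image, normalize=True)  # [1, embed_dim]
        text_tokens = model.encode_text(text, normalize=True)       # [1, 77, width] in this vendored copy
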
open_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+ size 1356917
open_clip/coca_model.py ADDED
@@ -0,0 +1,458 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from dataclasses import dataclass
8
+
9
+ from .transformer import (
10
+ LayerNormFp32,
11
+ LayerNorm,
12
+ QuickGELU,
13
+ MultimodalTransformer,
14
+ )
15
+ from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
16
+
17
+ try:
18
+ from transformers import (
19
+ BeamSearchScorer,
20
+ LogitsProcessorList,
21
+ TopPLogitsWarper,
22
+ TopKLogitsWarper,
23
+ RepetitionPenaltyLogitsProcessor,
24
+ MinLengthLogitsProcessor,
25
+ MaxLengthCriteria,
26
+ StoppingCriteriaList
27
+ )
28
+
29
+ GENERATION_TYPES = {
30
+ "top_k": TopKLogitsWarper,
31
+ "top_p": TopPLogitsWarper,
32
+ "beam_search": "beam_search"
33
+ }
34
+ _has_transformers = True
35
+ except ImportError as e:
36
+ GENERATION_TYPES = {
37
+ "top_k": None,
38
+ "top_p": None,
39
+ "beam_search": "beam_search"
40
+ }
41
+ _has_transformers = False
42
+
43
+
44
+ @dataclass
45
+ class MultimodalCfg(CLIPTextCfg):
46
+ mlp_ratio: int = 4
47
+ dim_head: int = 64
48
+ heads: int = 8
49
+ n_queries: int = 256
50
+ attn_pooler_heads: int = 8
51
+
52
+
53
+ def _build_text_decoder_tower(
54
+ embed_dim,
55
+ multimodal_cfg,
56
+ quick_gelu: bool = False,
57
+ cast_dtype: Optional[torch.dtype] = None,
58
+ ):
59
+ multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
60
+ act_layer = QuickGELU if quick_gelu else nn.GELU
61
+ norm_layer = (
62
+ LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
63
+ )
64
+
65
+ decoder = MultimodalTransformer(
66
+ context_length=multimodal_cfg.context_length,
67
+ width=multimodal_cfg.width,
68
+ heads=multimodal_cfg.heads,
69
+ layers=multimodal_cfg.layers,
70
+ ls_init_value=multimodal_cfg.ls_init_value,
71
+ output_dim=embed_dim,
72
+ act_layer=act_layer,
73
+ norm_layer=norm_layer,
74
+ )
75
+
76
+ return decoder
77
+
78
+
79
+ class CoCa(nn.Module):
80
+ def __init__(
81
+ self,
82
+ embed_dim,
83
+ multimodal_cfg: MultimodalCfg,
84
+ text_cfg: CLIPTextCfg,
85
+ vision_cfg: CLIPVisionCfg,
86
+ quick_gelu: bool = False,
87
+ cast_dtype: Optional[torch.dtype] = None,
88
+ pad_id: int = 0,
89
+ ):
90
+ super().__init__()
91
+ multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
92
+ text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
93
+ vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg
94
+
95
+ self.text = _build_text_tower(
96
+ embed_dim=embed_dim,
97
+ text_cfg=text_cfg,
98
+ quick_gelu=quick_gelu,
99
+ cast_dtype=cast_dtype,
100
+ )
101
+
102
+ vocab_size = (
103
+ text_cfg.vocab_size # for hf models
104
+ if hasattr(text_cfg, "hf_model_name") and text_cfg.hf_model_name is not None
105
+ else text_cfg.vocab_size
106
+ )
107
+
108
+ self.visual = _build_vision_tower(
109
+ embed_dim=embed_dim,
110
+ vision_cfg=vision_cfg,
111
+ quick_gelu=quick_gelu,
112
+ cast_dtype=cast_dtype,
113
+ )
114
+
115
+ self.text_decoder = _build_text_decoder_tower(
116
+ vocab_size,
117
+ multimodal_cfg=multimodal_cfg,
118
+ quick_gelu=quick_gelu,
119
+ cast_dtype=cast_dtype,
120
+ )
121
+
122
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
123
+ self.pad_id = pad_id
124
+
125
+ @torch.jit.ignore
126
+ def set_grad_checkpointing(self, enable=True):
127
+ self.visual.set_grad_checkpointing(enable)
128
+ self.text.set_grad_checkpointing(enable)
129
+ self.text_decoder.set_grad_checkpointing(enable)
130
+
131
+ def _encode_image(self, images, normalize=True):
132
+ image_latent, tokens_embs = self.visual(images)
133
+ image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
134
+ return image_latent, tokens_embs
135
+
136
+ def _encode_text(self, text, normalize=True, embed_cls=True):
137
+ text = text[:, :-1] if embed_cls else text # make space for CLS token
138
+ text_latent, token_emb = self.text(text)
139
+ text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
140
+ return text_latent, token_emb
141
+
142
+ def encode_image(self, images, normalize=True):
143
+ image_latent, _ = self._encode_image(images, normalize=normalize)
144
+ return image_latent
145
+
146
+ def encode_text(self, text, normalize=True, embed_cls=True):
147
+ text_latent, _ = self._encode_text(text, normalize=normalize, embed_cls=embed_cls)
148
+ return text_latent
149
+
150
+ def forward(self, image, text, embed_cls=True, image_latent=None, image_embs=None):
151
+ text_latent, token_embs = self._encode_text(text, embed_cls=embed_cls)
152
+ if image_latent is None or image_embs is None:
153
+ image_latent, image_embs = self._encode_image(image)
154
+
155
+ # TODO: add assertion to avoid bugs?
156
+ labels = text[:, -token_embs.shape[1]:]
157
+
158
+ logits = self.text_decoder(image_embs, token_embs)
159
+ return {
160
+ "image_features": image_latent,
161
+ "text_features": text_latent,
162
+ "logits": logits,
163
+ "labels": labels,
164
+ "logit_scale": self.logit_scale.exp()
165
+ }
166
+
167
+ def generate(
168
+ self,
169
+ image,
170
+ text=None,
171
+ seq_len=30,
172
+ max_seq_len=77,
173
+ temperature=1.,
174
+ generation_type="beam_search",
175
+ top_p=0.1, # keep tokens in the 1 - top_p quantile
176
+ top_k=1, # keeps the top_k most probable tokens
177
+ pad_token_id=None,
178
+ eos_token_id=None,
179
+ sot_token_id=None,
180
+ num_beams=6,
181
+ num_beam_groups=3,
182
+ min_seq_len=5,
183
+ stopping_criteria=None,
184
+ repetition_penalty=1.0,
185
+ fixed_output_length=False # if True output.shape == (batch_size, seq_len)
186
+ ):
187
+ # taking many ideas and components from HuggingFace GenerationMixin
188
+ # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
189
+ assert _has_transformers, "Please install transformers for generate functionality. `pip install transformers`."
190
+ assert seq_len > min_seq_len, "seq_len must be larger than min_seq_len"
191
+
192
+ with torch.no_grad():
193
+ sot_token_id = 49406 if sot_token_id is None else sot_token_id
194
+ eos_token_id = 49407 if eos_token_id is None else eos_token_id
195
+ pad_token_id = self.pad_id if pad_token_id is None else pad_token_id
196
+ logit_processor = LogitsProcessorList(
197
+ [
198
+ MinLengthLogitsProcessor(min_seq_len, eos_token_id),
199
+ RepetitionPenaltyLogitsProcessor(repetition_penalty),
200
+ ]
201
+ )
202
+
203
+ if stopping_criteria is None:
204
+ stopping_criteria = [MaxLengthCriteria(max_length=seq_len)]
205
+
206
+ stopping_criteria = StoppingCriteriaList(
207
+ stopping_criteria
208
+ )
209
+
210
+ device = image.device
211
+
212
+ if generation_type == "beam_search":
213
+ output = self._generate_beamsearch(
214
+ image_inputs = image,
215
+ pad_token_id=pad_token_id,
216
+ eos_token_id=eos_token_id,
217
+ sot_token_id=sot_token_id,
218
+ num_beams=num_beams,
219
+ num_beam_groups=num_beam_groups,
220
+ min_seq_len=min_seq_len,
221
+ stopping_criteria=stopping_criteria,
222
+ logit_processor=logit_processor,
223
+ )
224
+ if fixed_output_length and output.shape[1] < seq_len:
225
+ return torch.cat(
226
+ (output, torch.ones(output.shape[0], seq_len-output.shape[1], device=device, dtype=output.dtype) * self.pad_id),
227
+ dim=1
228
+ )
229
+ return output
230
+
231
+ elif generation_type == "top_p":
232
+ logit_warper = GENERATION_TYPES[generation_type](top_p)
233
+ elif generation_type == "top_k":
234
+ logit_warper = GENERATION_TYPES[generation_type](top_k)
235
+ else:
236
+ raise ValueError(
237
+ f"generation_type has to be one of "
238
+ f"{'| ' + ' | '.join(list(GENERATION_TYPES.keys())) + ' |'}."
239
+ )
240
+
241
+ image_latent, image_embs = self._encode_image(image)
242
+
243
+ if text is None:
244
+ text = torch.ones((image.shape[0], 1), device=device, dtype=torch.long) * sot_token_id
245
+
246
+ was_training = self.training
247
+ num_dims = len(text.shape)
248
+
249
+ if num_dims == 1:
250
+ text = text[None, :]
251
+
252
+ cur_len = text.shape[1]
253
+ self.eval()
254
+ out = text
255
+
256
+ while True:
257
+ x = out[:, -max_seq_len:]
258
+ cur_len = x.shape[1]
259
+ logits = self(image, x, image_latent=image_latent, image_embs=image_embs, embed_cls=False)["logits"][:, -1]
260
+ mask = (out[:, -1] == eos_token_id) | (out[:, -1] == pad_token_id)
261
+ sample = torch.ones((out.shape[0], 1), device=device, dtype=torch.long) * pad_token_id
262
+
263
+ if mask.all():
264
+ if not fixed_output_length:
265
+ break
266
+ else:
267
+ logits = logits[~mask, :]
268
+ filtered_logits = logit_processor(x[~mask, :], logits)
269
+ filtered_logits = logit_warper(x[~mask, :], filtered_logits)
270
+ probs = F.softmax(filtered_logits / temperature, dim=-1)
271
+
272
+ if (cur_len + 1 == seq_len):
273
+ sample[~mask, :] = torch.ones((sum(~mask), 1), device=device, dtype=torch.long) * eos_token_id
274
+ else:
275
+ sample[~mask, :] = torch.multinomial(probs, 1)
276
+
277
+ out = torch.cat((out, sample), dim=-1)
278
+
279
+ cur_len += 1
280
+
281
+ if stopping_criteria(out, None):
282
+ break
283
+
284
+ if num_dims == 1:
285
+ out = out.squeeze(0)
286
+
287
+ self.train(was_training)
288
+ return out
289
+
290
+ def _generate_beamsearch(
291
+ self,
292
+ image_inputs,
293
+ pad_token_id=None,
294
+ eos_token_id=None,
295
+ sot_token_id=None,
296
+ num_beams=6,
297
+ num_beam_groups=3,
298
+ min_seq_len=5,
299
+ stopping_criteria=None,
300
+ logit_processor=None,
301
+ logit_warper=None,
302
+ ):
303
+ device = image_inputs.device
304
+ batch_size = image_inputs.shape[0]
305
+ image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
306
+ image_latent, image_embs = self._encode_image(image_inputs)
307
+
308
+ input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
309
+ input_ids = input_ids * sot_token_id
310
+ beam_scorer = BeamSearchScorer(
311
+ batch_size=batch_size,
312
+ num_beams=num_beams,
313
+ device=device,
314
+ num_beam_groups=num_beam_groups,
315
+ )
316
+ # instantiate logits processors
317
+ logits_processor = (
318
+ LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
319
+ if logit_processor is None
320
+ else logit_processor
321
+ )
322
+
323
+ batch_size = len(beam_scorer._beam_hyps)
324
+ num_beams = beam_scorer.num_beams
325
+ num_beam_groups = beam_scorer.num_beam_groups
326
+ num_sub_beams = num_beams // num_beam_groups
327
+ batch_beam_size, cur_len = input_ids.shape
328
+ beam_indices = None
329
+
330
+ if num_beams * batch_size != batch_beam_size:
331
+ raise ValueError(
332
+ f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
333
+ )
334
+
335
+ beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
336
+ # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in
337
+ # the same group don't produce the same tokens every time.
338
+ beam_scores[:, ::num_sub_beams] = 0
339
+ beam_scores = beam_scores.view((batch_size * num_beams,))
340
+
341
+ while True:
342
+
343
+ # predicted tokens in cur_len step
344
+ current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)
345
+
346
+ # indices which will form the beams in the next time step
347
+ reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)
348
+
349
+ # do one decoder step on all beams of all sentences in batch
350
+ model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
351
+ outputs = self(
352
+ model_inputs['images'],
353
+ model_inputs['text'],
354
+ embed_cls=False,
355
+ image_latent=image_latent,
356
+ image_embs=image_embs
357
+ )
358
+
359
+ for beam_group_idx in range(num_beam_groups):
360
+ group_start_idx = beam_group_idx * num_sub_beams
361
+ group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
362
+ group_size = group_end_idx - group_start_idx
363
+
364
+ # indices of beams of current group among all sentences in batch
365
+ batch_group_indices = []
366
+
367
+ for batch_idx in range(batch_size):
368
+ batch_group_indices.extend(
369
+ [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
370
+ )
371
+ group_input_ids = input_ids[batch_group_indices]
372
+
373
+ # select outputs of beams of current group only
374
+ next_token_logits = outputs['logits'][batch_group_indices, -1, :]
375
+ vocab_size = next_token_logits.shape[-1]
376
+
377
+ next_token_scores_processed = logits_processor(
378
+ group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
379
+ )
380
+ next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
381
+ next_token_scores = next_token_scores.expand_as(next_token_scores_processed)
382
+
383
+ # reshape for beam search
384
+ next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)
385
+
386
+ next_token_scores, next_tokens = torch.topk(
387
+ next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
388
+ )
389
+
390
+ next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
391
+ next_tokens = next_tokens % vocab_size
392
+
393
+ # stateless
394
+ process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
395
+ beam_outputs = beam_scorer.process(
396
+ group_input_ids,
397
+ next_token_scores,
398
+ next_tokens,
399
+ next_indices,
400
+ pad_token_id=pad_token_id,
401
+ eos_token_id=eos_token_id,
402
+ beam_indices=process_beam_indices,
403
+ )
404
+ beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
405
+ beam_next_tokens = beam_outputs["next_beam_tokens"]
406
+ beam_idx = beam_outputs["next_beam_indices"]
407
+
408
+ input_ids[batch_group_indices] = group_input_ids[beam_idx]
409
+ group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
410
+ current_tokens[batch_group_indices] = group_input_ids[:, -1]
411
+
412
+ # (beam_idx // group_size) -> batch_idx
413
+ # (beam_idx % group_size) -> offset of idx inside the group
414
+ reordering_indices[batch_group_indices] = (
415
+ num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
416
+ )
417
+
418
+ input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)
419
+
420
+ # increase cur_len
421
+ cur_len = cur_len + 1
422
+ if beam_scorer.is_done or stopping_criteria(input_ids, None):
423
+ break
424
+
425
+ final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
426
+ sequence_outputs = beam_scorer.finalize(
427
+ input_ids,
428
+ beam_scores,
429
+ next_tokens,
430
+ next_indices,
431
+ pad_token_id=pad_token_id,
432
+ eos_token_id=eos_token_id,
433
+ max_length=stopping_criteria.max_length,
434
+ beam_indices=final_beam_indices,
435
+ )
436
+ return sequence_outputs['sequences']
437
+
438
+
439
+ def prepare_inputs_for_generation(input_ids, image_inputs, past=None, **kwargs):
440
+ if past:
441
+ input_ids = input_ids[:, -1].unsqueeze(-1)
442
+
443
+ attention_mask = kwargs.get("attention_mask", None)
444
+ position_ids = kwargs.get("position_ids", None)
445
+
446
+ if attention_mask is not None and position_ids is None:
447
+ # create position_ids on the fly for batch generation
448
+ position_ids = attention_mask.long().cumsum(-1) - 1
449
+ position_ids.masked_fill_(attention_mask == 0, 1)
450
+ else:
451
+ position_ids = None
452
+ return {
453
+ "text": input_ids,
454
+ "images": image_inputs,
455
+ "past_key_values": past,
456
+ "position_ids": position_ids,
457
+ "attention_mask": attention_mask,
458
+ }
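
A hedged sketch of the CoCa captioning path implemented above: generate() requires the transformers package for its logits processors and beam search scorer, and the model name, pretrained tag, and image path below are illustrative placeholders.

    import torch
    from PIL import Image
    import open_clip

    model, _, preprocess = open_clip.create_model_and_transforms(
        'coca_ViT-B-32', pretrained='laion2B-s13B-b90k')        # hypothetical pretrained tag
    tokenizer = open_clip.get_tokenizer('coca_ViT-B-32')

    image = preprocess(Image.open('photo.jpg')).unsqueeze(0)     # placeholder image
    with torch.no_grad():
        tokens = model.generate(image, generation_type='beam_search', num_beams=6, seq_len=30)
    print(open_clip.decode(tokens[0]))                           # decode is re-exported in __init__.py
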
open_clip/constants.py ADDED
@@ -0,0 +1,2 @@
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
open_clip/factory.py ADDED
@@ -0,0 +1,366 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import re
6
+ from copy import deepcopy
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Optional, Tuple, Union
9
+
10
+ import torch
11
+
12
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13
+ from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
14
+ resize_pos_embed, get_cast_dtype
15
+ from .coca_model import CoCa
16
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss
17
+ from .openai import load_openai_model
18
+ from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model, download_pretrained_from_hf
19
+ from .transform import image_transform, AugmentationCfg
20
+ from .tokenizer import HFTokenizer, tokenize
21
+
22
+
23
+ HF_HUB_PREFIX = 'hf-hub:'
24
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
25
+ _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs
26
+
27
+
28
+ def _natural_key(string_):
29
+ return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
30
+
31
+
32
+ def _rescan_model_configs():
33
+ global _MODEL_CONFIGS
34
+
35
+ config_ext = ('.json',)
36
+ config_files = []
37
+ for config_path in _MODEL_CONFIG_PATHS:
38
+ if config_path.is_file() and config_path.suffix in config_ext:
39
+ config_files.append(config_path)
40
+ elif config_path.is_dir():
41
+ for ext in config_ext:
42
+ config_files.extend(config_path.glob(f'*{ext}'))
43
+
44
+ for cf in config_files:
45
+ with open(cf, 'r') as f:
46
+ model_cfg = json.load(f)
47
+ if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
48
+ _MODEL_CONFIGS[cf.stem] = model_cfg
49
+
50
+ _MODEL_CONFIGS = {k: v for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))}
51
+
52
+
53
+ _rescan_model_configs() # initial populate of model config registry
54
+
55
+
56
+ def list_models():
57
+ """ enumerate available model architectures based on config files """
58
+ return list(_MODEL_CONFIGS.keys())
59
+
60
+
61
+ def add_model_config(path):
62
+ """ add model config path or file and update registry """
63
+ if not isinstance(path, Path):
64
+ path = Path(path)
65
+ _MODEL_CONFIG_PATHS.append(path)
66
+ _rescan_model_configs()
67
+
68
+
69
+ def get_model_config(model_name):
70
+ if model_name in _MODEL_CONFIGS:
71
+ return deepcopy(_MODEL_CONFIGS[model_name])
72
+ else:
73
+ return None
74
+
75
+
76
+ def get_tokenizer(model_name):
77
+ if model_name.startswith(HF_HUB_PREFIX):
78
+ tokenizer = HFTokenizer(model_name[len(HF_HUB_PREFIX):])
79
+ else:
80
+ config = get_model_config(model_name)
81
+ tokenizer = HFTokenizer(
82
+ config['text_cfg']['hf_tokenizer_name']) if 'hf_tokenizer_name' in config['text_cfg'] else tokenize
83
+ return tokenizer
84
+
85
+
86
+ def load_state_dict(checkpoint_path: str, map_location='cpu'):
87
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
88
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
89
+ state_dict = checkpoint['state_dict']
90
+ else:
91
+ state_dict = checkpoint
92
+ if next(iter(state_dict.items()))[0].startswith('module'):
93
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
94
+ return state_dict
95
+
96
+
97
+ def load_checkpoint(model, checkpoint_path, strict=True):
98
+ state_dict = load_state_dict(checkpoint_path)
99
+ # detect old format and make compatible with new format
100
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
101
+ state_dict = convert_to_custom_text_state_dict(state_dict)
102
+ resize_pos_embed(state_dict, model)
103
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
104
+ return incompatible_keys
105
+
106
+
107
+ def create_model(
108
+ model_name: str,
109
+ pretrained: Optional[str] = None,
110
+ precision: str = 'fp32',
111
+ device: Union[str, torch.device] = 'cpu',
112
+ jit: bool = False,
113
+ force_quick_gelu: bool = False,
114
+ force_custom_text: bool = False,
115
+ force_patch_dropout: Optional[float] = None,
116
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
117
+ pretrained_image: bool = False,
118
+ pretrained_hf: bool = True,
119
+ cache_dir: Optional[str] = None,
120
+ output_dict: Optional[bool] = None,
121
+ require_pretrained: bool = False,
122
+ ):
123
+ has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX)
124
+ if has_hf_hub_prefix:
125
+ model_id = model_name[len(HF_HUB_PREFIX):]
126
+ checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
127
+ config_path = download_pretrained_from_hf(model_id, filename='open_clip_config.json', cache_dir=cache_dir)
128
+
129
+ with open(config_path, 'r', encoding='utf-8') as f:
130
+ config = json.load(f)
131
+ pretrained_cfg = config['preprocess_cfg']
132
+ model_cfg = config['model_cfg']
133
+ else:
134
+ model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names
135
+ checkpoint_path = None
136
+ pretrained_cfg = {}
137
+ model_cfg = None
138
+
139
+ if isinstance(device, str):
140
+ device = torch.device(device)
141
+
142
+ if pretrained and pretrained.lower() == 'openai':
143
+ logging.info(f'Loading pretrained {model_name} from OpenAI.')
144
+ model = load_openai_model(
145
+ model_name,
146
+ precision=precision,
147
+ device=device,
148
+ jit=jit,
149
+ cache_dir=cache_dir,
150
+ )
151
+
152
+ # to always output dict even if it is clip
153
+ if output_dict and hasattr(model, "output_dict"):
154
+ model.output_dict = True
155
+ else:
156
+ model_cfg = model_cfg or get_model_config(model_name)
157
+ if model_cfg is not None:
158
+ logging.info(f'Loaded {model_name} model config.')
159
+ else:
160
+ logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
161
+ raise RuntimeError(f'Model config for {model_name} not found.')
162
+
163
+ if force_quick_gelu:
164
+ # override for use of QuickGELU on non-OpenAI transformer models
165
+ model_cfg["quick_gelu"] = True
166
+
167
+ if force_patch_dropout is not None:
168
+ # override the default patch dropout value
169
+ model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout
170
+
171
+ if force_image_size is not None:
172
+ # override model config's image size
173
+ model_cfg["vision_cfg"]["image_size"] = force_image_size
174
+
175
+ if pretrained_image:
176
+ if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
177
+ # pretrained weight loading for timm models set via vision_cfg
178
+ model_cfg['vision_cfg']['timm_model_pretrained'] = True
179
+ else:
180
+ assert False, 'pretrained image towers currently only supported for timm models'
181
+
182
+ cast_dtype = get_cast_dtype(precision)
183
+ is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {})
184
+ custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model
185
+
186
+ if custom_text:
187
+ if is_hf_model:
188
+ model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
189
+ if "coca" in model_name:
190
+ model = CoCa(**model_cfg, cast_dtype=cast_dtype)
191
+ else:
192
+ model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
193
+ else:
194
+ model = CLIP(**model_cfg, cast_dtype=cast_dtype)
195
+
196
+ pretrained_loaded = False
197
+ if pretrained:
198
+ checkpoint_path = ''
199
+ pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
200
+ if pretrained_cfg:
201
+ checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
202
+ elif os.path.exists(pretrained):
203
+ checkpoint_path = pretrained
204
+
205
+ if checkpoint_path:
206
+ logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
207
+ load_checkpoint(model, checkpoint_path)
208
+ else:
209
+ error_str = (
210
+ f'Pretrained weights ({pretrained}) not found for model {model_name}.'
211
+ f'Available pretrained tags ({list_pretrained_tags_by_model(model_name)}.')
212
+ logging.warning(error_str)
213
+ raise RuntimeError(error_str)
214
+ pretrained_loaded = True
215
+ elif has_hf_hub_prefix:
216
+ logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
217
+ load_checkpoint(model, checkpoint_path)
218
+ pretrained_loaded = True
219
+
220
+ if require_pretrained and not pretrained_loaded:
221
+ # callers of create_model_from_pretrained always expect pretrained weights
222
+ raise RuntimeError(
223
+ f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.')
224
+
225
+ model.to(device=device)
226
+ if precision in ("fp16", "bf16"):
227
+ convert_weights_to_lp(model, dtype=torch.bfloat16 if precision == 'bf16' else torch.float16)
228
+
229
+ # set image / mean metadata from pretrained_cfg if available, or use default
230
+ model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
231
+ model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD
232
+
233
+ # to always output dict even if it is clip
234
+ if output_dict and hasattr(model, "output_dict"):
235
+ model.output_dict = True
236
+
237
+ if jit:
238
+ model = torch.jit.script(model)
239
+
240
+ return model
241
+
242
+
243
+ def create_loss(args):
244
+ if args.distill:
245
+ return DistillClipLoss(
246
+ local_loss=args.local_loss,
247
+ gather_with_grad=args.gather_with_grad,
248
+ cache_labels=True,
249
+ rank=args.rank,
250
+ world_size=args.world_size,
251
+ use_horovod=args.horovod,
252
+ )
253
+ elif "coca" in args.model.lower():
254
+ return CoCaLoss(
255
+ caption_loss_weight=args.coca_caption_loss_weight,
256
+ clip_loss_weight=args.coca_contrastive_loss_weight,
257
+ local_loss=args.local_loss,
258
+ gather_with_grad=args.gather_with_grad,
259
+ cache_labels=True,
260
+ rank=args.rank,
261
+ world_size=args.world_size,
262
+ use_horovod=args.horovod,
263
+ )
264
+ return ClipLoss(
265
+ local_loss=args.local_loss,
266
+ gather_with_grad=args.gather_with_grad,
267
+ cache_labels=True,
268
+ rank=args.rank,
269
+ world_size=args.world_size,
270
+ use_horovod=args.horovod,
271
+ )
272
+
273
+
274
+ def create_model_and_transforms(
275
+ model_name: str,
276
+ pretrained: Optional[str] = None,
277
+ precision: str = 'fp32',
278
+ device: Union[str, torch.device] = 'cpu',
279
+ jit: bool = False,
280
+ force_quick_gelu: bool = False,
281
+ force_custom_text: bool = False,
282
+ force_patch_dropout: Optional[float] = None,
283
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
284
+ pretrained_image: bool = False,
285
+ pretrained_hf: bool = True,
286
+ image_mean: Optional[Tuple[float, ...]] = None,
287
+ image_std: Optional[Tuple[float, ...]] = None,
288
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
289
+ cache_dir: Optional[str] = None,
290
+ output_dict: Optional[bool] = None,
291
+ ):
292
+ model = create_model(
293
+ model_name,
294
+ pretrained,
295
+ precision=precision,
296
+ device=device,
297
+ jit=jit,
298
+ force_quick_gelu=force_quick_gelu,
299
+ force_custom_text=force_custom_text,
300
+ force_patch_dropout=force_patch_dropout,
301
+ force_image_size=force_image_size,
302
+ pretrained_image=pretrained_image,
303
+ pretrained_hf=pretrained_hf,
304
+ cache_dir=cache_dir,
305
+ output_dict=output_dict,
306
+ )
307
+
308
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
309
+ image_std = image_std or getattr(model.visual, 'image_std', None)
310
+ preprocess_train = image_transform(
311
+ model.visual.image_size,
312
+ is_train=True,
313
+ mean=image_mean,
314
+ std=image_std,
315
+ aug_cfg=aug_cfg,
316
+ )
317
+ preprocess_val = image_transform(
318
+ model.visual.image_size,
319
+ is_train=False,
320
+ mean=image_mean,
321
+ std=image_std,
322
+ )
323
+
324
+ return model, preprocess_train, preprocess_val
325
+
326
+
327
+ def create_model_from_pretrained(
328
+ model_name: str,
329
+ pretrained: Optional[str] = None,
330
+ precision: str = 'fp32',
331
+ device: Union[str, torch.device] = 'cpu',
332
+ jit: bool = False,
333
+ force_quick_gelu: bool = False,
334
+ force_custom_text: bool = False,
335
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
336
+ return_transform: bool = True,
337
+ image_mean: Optional[Tuple[float, ...]] = None,
338
+ image_std: Optional[Tuple[float, ...]] = None,
339
+ cache_dir: Optional[str] = None,
340
+ ):
341
+ model = create_model(
342
+ model_name,
343
+ pretrained,
344
+ precision=precision,
345
+ device=device,
346
+ jit=jit,
347
+ force_quick_gelu=force_quick_gelu,
348
+ force_custom_text=force_custom_text,
349
+ force_image_size=force_image_size,
350
+ cache_dir=cache_dir,
351
+ require_pretrained=True,
352
+ )
353
+
354
+ if not return_transform:
355
+ return model
356
+
357
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
358
+ image_std = image_std or getattr(model.visual, 'image_std', None)
359
+ preprocess = image_transform(
360
+ model.visual.image_size,
361
+ is_train=False,
362
+ mean=image_mean,
363
+ std=image_std,
364
+ )
365
+
366
+ return model, preprocess
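
Two smaller code paths from factory.py above, sketched with placeholder paths: add_model_config() appends a directory of *.json architecture configs and rescans the registry, and passing an existing filesystem path as pretrained makes create_model() load it through load_checkpoint().

    import open_clip

    # Register an extra config directory, then list what is available.
    open_clip.add_model_config('my_configs/')                   # placeholder directory
    print(open_clip.list_models()[:5])

    # A local checkpoint path falls into the os.path.exists(pretrained) branch of create_model().
    model = open_clip.create_model('ViT-B-16', pretrained='checkpoints/finetuned.pt', precision='fp32')
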
open_clip/generation_utils.py ADDED
File without changes
open_clip/hf_configs.py ADDED
@@ -0,0 +1,45 @@
+ # HF architecture dict:
+ arch_dict = {
+     # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
+     "roberta": {
+         "config_names": {
+             "context_length": "max_position_embeddings",
+             "vocab_size": "vocab_size",
+             "width": "hidden_size",
+             "heads": "num_attention_heads",
+             "layers": "num_hidden_layers",
+             "layer_attr": "layer",
+             "token_embeddings_attr": "embeddings"
+         },
+         "pooler": "mean_pooler",
+     },
+     # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
+     "xlm-roberta": {
+         "config_names": {
+             "context_length": "max_position_embeddings",
+             "vocab_size": "vocab_size",
+             "width": "hidden_size",
+             "heads": "num_attention_heads",
+             "layers": "num_hidden_layers",
+             "layer_attr": "layer",
+             "token_embeddings_attr": "embeddings"
+         },
+         "pooler": "mean_pooler",
+     },
+     # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
+     "mt5": {
+         "config_names": {
+             # unlimited seqlen
+             # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
+             # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
+             "context_length": "",
+             "vocab_size": "vocab_size",
+             "width": "d_model",
+             "heads": "num_heads",
+             "layers": "num_layers",
+             "layer_attr": "block",
+             "token_embeddings_attr": "embed_tokens"
+         },
+         "pooler": "mean_pooler",
+     },
+ }
open_clip/hf_model.py ADDED
@@ -0,0 +1,176 @@
1
+ """ huggingface model adapter
2
+
3
+ Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in CLIP model.
4
+ """
5
+
6
+ import re
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch import TensorType
11
+
12
+ try:
13
+ import transformers
14
+ from transformers import AutoModel, AutoTokenizer, AutoConfig, PretrainedConfig
15
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
16
+ BaseModelOutputWithPoolingAndCrossAttentions
17
+ except ImportError as e:
18
+ transformers = None
19
+
20
+
21
+ class BaseModelOutput:
22
+ pass
23
+
24
+
25
+ class PretrainedConfig:
26
+ pass
27
+
28
+ from .hf_configs import arch_dict
29
+
30
+
31
+ # utils
32
+ def _camel2snake(s):
33
+ return re.sub(r'(?<!^)(?=[A-Z])', '_', s).lower()
34
+
35
+
36
+ # TODO: ?last - for gpt-like models
37
+ _POOLERS = {}
38
+
39
+
40
+ def register_pooler(cls):
41
+ """Decorator registering pooler class"""
42
+ _POOLERS[_camel2snake(cls.__name__)] = cls
43
+ return cls
44
+
45
+
46
+ @register_pooler
47
+ class MeanPooler(nn.Module):
48
+ """Mean pooling"""
49
+
50
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
51
+ masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
52
+ return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
53
+
54
+
55
+ @register_pooler
56
+ class MaxPooler(nn.Module):
57
+ """Max pooling"""
58
+
59
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
60
+ masked_output = x.last_hidden_state.masked_fill(attention_mask.unsqueeze(-1), -torch.inf)
61
+ return masked_output.max(1).values
62
+
63
+
64
+ @register_pooler
65
+ class ClsPooler(nn.Module):
66
+ """CLS token pooling"""
67
+
68
+ def __init__(self, use_pooler_output=True):
69
+ super().__init__()
70
+ self.cls_token_position = 0
71
+ self.use_pooler_output = use_pooler_output
72
+
73
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
74
+ if (self.use_pooler_output and
75
+ isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions)) and
76
+ (x.pooler_output is not None)
77
+ ):
78
+ return x.pooler_output
79
+
80
+ return x.last_hidden_state[:, self.cls_token_position, :]
81
+
82
+
83
+ class HFTextEncoder(nn.Module):
84
+ """HuggingFace model adapter"""
85
+ output_tokens: torch.jit.Final[bool]
86
+
87
+ def __init__(
88
+ self,
89
+ model_name_or_path: str,
90
+ output_dim: int,
91
+ config: PretrainedConfig = None,
92
+ pooler_type: str = None,
93
+ proj: str = None,
94
+ pretrained: bool = True,
95
+ output_tokens: bool = False,
96
+ ):
97
+ super().__init__()
98
+ self.output_tokens = output_tokens
99
+ self.output_dim = output_dim
100
+
101
+ # TODO: find better way to get this information
102
+ uses_transformer_pooler = (pooler_type == "cls_pooler")
103
+
104
+ if transformers is None:
105
+ raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
106
+ if config is None:
107
+ self.config = AutoConfig.from_pretrained(model_name_or_path)
108
+ create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
109
+ AutoModel.from_config, self.config)
110
+ # TODO: do all model configs have this attribute? PretrainedConfig does so yes??
111
+ if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
112
+ self.transformer = create_func(model_args)
113
+ self.transformer = self.transformer.encoder
114
+ else:
115
+ self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
116
+ else:
117
+ self.config = config
118
+ self.transformer = AutoModel.from_config(config)
119
+ if pooler_type is None: # get default arch pooler
120
+ pooler_type = (arch_dict[self.config.model_type]["pooler"])
121
+
122
+ self.pooler = _POOLERS[pooler_type]()
123
+
124
+ d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
125
+ if (d_model == output_dim) and (proj is None): # do we always need a proj?
126
+ self.proj = nn.Identity()
127
+ elif proj == 'linear':
128
+ self.proj = nn.Linear(d_model, output_dim, bias=False)
129
+ elif proj == 'mlp':
130
+ hidden_size = (d_model + output_dim) // 2
131
+ self.proj = nn.Sequential(
132
+ nn.Linear(d_model, hidden_size, bias=False),
133
+ nn.GELU(),
134
+ nn.Linear(hidden_size, output_dim, bias=False),
135
+ )
136
+
137
+ def forward(self, x: TensorType):
138
+ attn_mask = (x != self.config.pad_token_id).long()
139
+ out = self.transformer(input_ids=x, attention_mask=attn_mask)
140
+ pooled_out = self.pooler(out, attn_mask)
141
+ projected = self.proj(pooled_out)
142
+
143
+ seq_len = out.last_hidden_state.shape[1]
144
+ tokens = (
145
+ out.last_hidden_state[:, torch.arange(seq_len) != self.pooler.cls_token_position, :]
146
+ if type(self.pooler) == ClsPooler
147
+ else out.last_hidden_state
148
+ )
149
+
150
+ if self.output_tokens:
151
+ return projected, tokens
152
+ return projected
153
+
154
+ def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
155
+ if not unlocked_layers: # full freezing
156
+ for n, p in self.transformer.named_parameters():
157
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
158
+ return
159
+
160
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
161
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
162
+ print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
163
+ embeddings = getattr(
164
+ self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
165
+ modules = [embeddings, *layer_list][:-unlocked_layers]
166
+ # freeze layers
167
+ for module in modules:
168
+ for n, p in module.named_parameters():
169
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
170
+
171
+ @torch.jit.ignore
172
+ def set_grad_checkpointing(self, enable=True):
173
+ self.transformer.gradient_checkpointing_enable()
174
+
175
+ def init_parameters(self):
176
+ pass
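
A sketch of using HFTextEncoder on its own as a text tower (requires transformers; the backbone name, projection, and pooler choices here are illustrative assumptions, not the repository's configuration):

    import torch
    from transformers import AutoTokenizer
    from open_clip.hf_model import HFTextEncoder

    encoder = HFTextEncoder('roberta-base', output_dim=512, proj='mlp', pooler_type='mean_pooler')
    hf_tok = AutoTokenizer.from_pretrained('roberta-base')

    batch = hf_tok(['a melting golden 3d rendering style'], padding='max_length', max_length=64, return_tensors='pt')
    with torch.no_grad():
        features = encoder(batch['input_ids'])   # forward() derives the attention mask from pad_token_id
    print(features.shape)                        # torch.Size([1, 512])
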
open_clip/loss.py ADDED
@@ -0,0 +1,212 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+
5
+ try:
6
+ import torch.distributed.nn
7
+ from torch import distributed as dist
8
+
9
+ has_distributed = True
10
+ except ImportError:
11
+ has_distributed = False
12
+
13
+ try:
14
+ import horovod.torch as hvd
15
+ except ImportError:
16
+ hvd = None
17
+
18
+
19
+ def gather_features(
20
+ image_features,
21
+ text_features,
22
+ local_loss=False,
23
+ gather_with_grad=False,
24
+ rank=0,
25
+ world_size=1,
26
+ use_horovod=False
27
+ ):
28
+ assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
29
+ if use_horovod:
30
+ assert hvd is not None, 'Please install horovod'
31
+ if gather_with_grad:
32
+ all_image_features = hvd.allgather(image_features)
33
+ all_text_features = hvd.allgather(text_features)
34
+ else:
35
+ with torch.no_grad():
36
+ all_image_features = hvd.allgather(image_features)
37
+ all_text_features = hvd.allgather(text_features)
38
+ if not local_loss:
39
+ # ensure grads for local rank when all_* features don't have a gradient
40
+ gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
41
+ gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
42
+ gathered_image_features[rank] = image_features
43
+ gathered_text_features[rank] = text_features
44
+ all_image_features = torch.cat(gathered_image_features, dim=0)
45
+ all_text_features = torch.cat(gathered_text_features, dim=0)
46
+ else:
47
+ # We gather tensors from all gpus
48
+ if gather_with_grad:
49
+ all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
50
+ all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
51
+ else:
52
+ gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
53
+ gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
54
+ dist.all_gather(gathered_image_features, image_features)
55
+ dist.all_gather(gathered_text_features, text_features)
56
+ if not local_loss:
57
+ # ensure grads for local rank when all_* features don't have a gradient
58
+ gathered_image_features[rank] = image_features
59
+ gathered_text_features[rank] = text_features
60
+ all_image_features = torch.cat(gathered_image_features, dim=0)
61
+ all_text_features = torch.cat(gathered_text_features, dim=0)
62
+
63
+ return all_image_features, all_text_features
64
+
65
+
66
+ class ClipLoss(nn.Module):
67
+
68
+ def __init__(
69
+ self,
70
+ local_loss=False,
71
+ gather_with_grad=False,
72
+ cache_labels=False,
73
+ rank=0,
74
+ world_size=1,
75
+ use_horovod=False,
76
+ ):
77
+ super().__init__()
78
+ self.local_loss = local_loss
79
+ self.gather_with_grad = gather_with_grad
80
+ self.cache_labels = cache_labels
81
+ self.rank = rank
82
+ self.world_size = world_size
83
+ self.use_horovod = use_horovod
84
+
85
+ # cache state
86
+ self.prev_num_logits = 0
87
+ self.labels = {}
88
+
89
+ def get_ground_truth(self, device, num_logits) -> torch.Tensor:
90
+ # calculated ground-truth and cache if enabled
91
+ if self.prev_num_logits != num_logits or device not in self.labels:
92
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
93
+ if self.world_size > 1 and self.local_loss:
94
+ labels = labels + num_logits * self.rank
95
+ if self.cache_labels:
96
+ self.labels[device] = labels
97
+ self.prev_num_logits = num_logits
98
+ else:
99
+ labels = self.labels[device]
100
+ return labels
101
+
102
+ def get_logits(self, image_features, text_features, logit_scale):
103
+ if self.world_size > 1:
104
+ all_image_features, all_text_features = gather_features(
105
+ image_features, text_features,
106
+ self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)
107
+
108
+ if self.local_loss:
109
+ logits_per_image = logit_scale * image_features @ all_text_features.T
110
+ logits_per_text = logit_scale * text_features @ all_image_features.T
111
+ else:
112
+ logits_per_image = logit_scale * all_image_features @ all_text_features.T
113
+ logits_per_text = logits_per_image.T
114
+ else:
115
+ logits_per_image = logit_scale * image_features @ text_features.T
116
+ logits_per_text = logit_scale * text_features @ image_features.T
117
+
118
+ return logits_per_image, logits_per_text
119
+
120
+ def forward(self, image_features, text_features, logit_scale, output_dict=False):
121
+ device = image_features.device
122
+ logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
123
+
124
+ labels = self.get_ground_truth(device, logits_per_image.shape[0])
125
+
126
+ total_loss = (
127
+ F.cross_entropy(logits_per_image, labels) +
128
+ F.cross_entropy(logits_per_text, labels)
129
+ ) / 2
130
+
131
+ return {"contrastive_loss": total_loss} if output_dict else total_loss
132
+
133
+
134
+ class CoCaLoss(ClipLoss):
135
+ def __init__(
136
+ self,
137
+ caption_loss_weight,
138
+ clip_loss_weight,
139
+ pad_id=0, # pad_token for open_clip custom tokenizer
140
+ local_loss=False,
141
+ gather_with_grad=False,
142
+ cache_labels=False,
143
+ rank=0,
144
+ world_size=1,
145
+ use_horovod=False,
146
+ ):
147
+ super().__init__(
148
+ local_loss=local_loss,
149
+ gather_with_grad=gather_with_grad,
150
+ cache_labels=cache_labels,
151
+ rank=rank,
152
+ world_size=world_size,
153
+ use_horovod=use_horovod
154
+ )
155
+
156
+ self.clip_loss_weight = clip_loss_weight
157
+ self.caption_loss_weight = caption_loss_weight
158
+ self.caption_loss = nn.CrossEntropyLoss(ignore_index=pad_id)
159
+
160
+ def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
161
+ clip_loss = super().forward(image_features, text_features, logit_scale)
162
+ clip_loss = self.clip_loss_weight * clip_loss
163
+
164
+ caption_loss = self.caption_loss(
165
+ logits.permute(0, 2, 1),
166
+ labels,
167
+ )
168
+ caption_loss = caption_loss * self.caption_loss_weight
169
+
170
+ if output_dict:
171
+ return {"contrastive_loss": clip_loss, "caption_loss": caption_loss}
172
+
173
+ return clip_loss, caption_loss
174
+
175
+
176
+ class DistillClipLoss(ClipLoss):
177
+
178
+ def dist_loss(self, teacher_logits, student_logits):
179
+ return -(teacher_logits.softmax(dim=1) * student_logits.log_softmax(dim=1)).sum(dim=1).mean(dim=0)
180
+
181
+ def forward(
182
+ self,
183
+ image_features,
184
+ text_features,
185
+ logit_scale,
186
+ dist_image_features,
187
+ dist_text_features,
188
+ dist_logit_scale,
189
+ output_dict=False,
190
+ ):
191
+ logits_per_image, logits_per_text = \
192
+ self.get_logits(image_features, text_features, logit_scale)
193
+
194
+ dist_logits_per_image, dist_logits_per_text = \
195
+ self.get_logits(dist_image_features, dist_text_features, dist_logit_scale)
196
+
197
+ labels = self.get_ground_truth(image_features.device, logits_per_image.shape[0])
198
+
199
+ contrastive_loss = (
200
+ F.cross_entropy(logits_per_image, labels) +
201
+ F.cross_entropy(logits_per_text, labels)
202
+ ) / 2
203
+
204
+ distill_loss = (
205
+ self.dist_loss(dist_logits_per_image, logits_per_image) +
206
+ self.dist_loss(dist_logits_per_text, logits_per_text)
207
+ ) / 2
208
+
209
+ if output_dict:
210
+ return {"contrastive_loss": contrastive_loss, "distill_loss": distill_loss}
211
+
212
+ return contrastive_loss, distill_loss
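
A single-process sketch of ClipLoss as defined above: with the default world_size=1 the all-gather path is never taken, and the feature tensors below are random stand-ins purely for illustration.

    import torch
    import torch.nn.functional as F
    from open_clip.loss import ClipLoss

    loss_fn = ClipLoss()                                   # defaults: local_loss=False, world_size=1
    image_features = F.normalize(torch.randn(8, 512), dim=-1)
    text_features = F.normalize(torch.randn(8, 512), dim=-1)
    logit_scale = torch.ones([]).mul(2.659).exp()          # exp(log(1/0.07)) ~= 14.3, as in the models above

    loss = loss_fn(image_features, text_features, logit_scale)
    losses = loss_fn(image_features, text_features, logit_scale, output_dict=True)  # {"contrastive_loss": tensor}
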
open_clip/model.py ADDED
@@ -0,0 +1,445 @@
1
+ """ CLIP Model
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ from dataclasses import dataclass
6
+ import logging
7
+ import math
8
+ from typing import Optional, Tuple, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import nn
14
+ from torch.utils.checkpoint import checkpoint
15
+
16
+ from .hf_model import HFTextEncoder
17
+ from .modified_resnet import ModifiedResNet
18
+ from .timm_model import TimmModel
19
+ from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer
20
+ from .utils import to_2tuple
21
+
22
+
23
+ @dataclass
24
+ class CLIPVisionCfg:
25
+ layers: Union[Tuple[int, int, int, int], int] = 12
26
+ width: int = 768
27
+ head_width: int = 64
28
+ mlp_ratio: float = 4.0
29
+ patch_size: int = 16
30
+ image_size: Union[Tuple[int, int], int] = 224
31
+ ls_init_value: Optional[float] = None # layer scale initial value
32
+ patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
33
+ input_patchnorm: bool = False # whether to use dual patchnorm - would only apply the input layernorm on each patch, as post-layernorm already exist in original clip vit design
34
+ global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
35
+ attentional_pool: bool = False # whether to use attentional pooler in the last embedding layer
36
+ n_queries: int = 256 # n_queries for attentional pooler
37
+ attn_pooler_heads: int = 8 # n heads for attentional_pooling
38
+ timm_model_name: str = None # a valid model name overrides layers, width, patch_size
39
+ timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
40
+ timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
41
+ timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
42
+ timm_proj_bias: bool = False # enable bias final projection
43
+ timm_drop: float = 0. # head dropout
44
+ timm_drop_path: Optional[float] = None # backbone stochastic depth
45
+ output_tokens: bool = False
46
+
47
+
48
+ @dataclass
49
+ class CLIPTextCfg:
50
+ context_length: int = 77
51
+ vocab_size: int = 49408
52
+ width: int = 512
53
+ heads: int = 8
54
+ layers: int = 12
55
+ ls_init_value: Optional[float] = None # layer scale initial value
56
+ hf_model_name: str = None
57
+ hf_tokenizer_name: str = None
58
+ hf_model_pretrained: bool = True
59
+ proj: str = 'mlp'
60
+ pooler_type: str = 'mean_pooler'
61
+ embed_cls: bool = False
62
+ pad_id: int = 0
63
+ output_tokens: bool = False
64
+
65
+
66
+ def get_cast_dtype(precision: str):
67
+ cast_dtype = None
68
+ if precision == 'bf16':
69
+ cast_dtype = torch.bfloat16
70
+ elif precision == 'fp16':
71
+ cast_dtype = torch.float16
72
+ return cast_dtype
73
+
74
+
75
+ def _build_vision_tower(
76
+ embed_dim: int,
77
+ vision_cfg: CLIPVisionCfg,
78
+ quick_gelu: bool = False,
79
+ cast_dtype: Optional[torch.dtype] = None
80
+ ):
81
+ if isinstance(vision_cfg, dict):
82
+ vision_cfg = CLIPVisionCfg(**vision_cfg)
83
+
84
+ # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
85
+ # memory efficient in recent PyTorch releases (>= 1.10).
86
+ # NOTE: timm models always use native GELU regardless of quick_gelu flag.
87
+ act_layer = QuickGELU if quick_gelu else nn.GELU
88
+
89
+ if vision_cfg.timm_model_name:
90
+ visual = TimmModel(
91
+ vision_cfg.timm_model_name,
92
+ pretrained=vision_cfg.timm_model_pretrained,
93
+ pool=vision_cfg.timm_pool,
94
+ proj=vision_cfg.timm_proj,
95
+ proj_bias=vision_cfg.timm_proj_bias,
96
+ drop=vision_cfg.timm_drop,
97
+ drop_path=vision_cfg.timm_drop_path,
98
+ embed_dim=embed_dim,
99
+ image_size=vision_cfg.image_size,
100
+ )
101
+ act_layer = nn.GELU # so that text transformer doesn't use QuickGELU w/ timm models
102
+ elif isinstance(vision_cfg.layers, (tuple, list)):
103
+ vision_heads = vision_cfg.width * 32 // vision_cfg.head_width
104
+ visual = ModifiedResNet(
105
+ layers=vision_cfg.layers,
106
+ output_dim=embed_dim,
107
+ heads=vision_heads,
108
+ image_size=vision_cfg.image_size,
109
+ width=vision_cfg.width,
110
+ )
111
+ else:
112
+ vision_heads = vision_cfg.width // vision_cfg.head_width
113
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
114
+ visual = VisionTransformer(
115
+ image_size=vision_cfg.image_size,
116
+ patch_size=vision_cfg.patch_size,
117
+ width=vision_cfg.width,
118
+ layers=vision_cfg.layers,
119
+ heads=vision_heads,
120
+ mlp_ratio=vision_cfg.mlp_ratio,
121
+ ls_init_value=vision_cfg.ls_init_value,
122
+ patch_dropout=vision_cfg.patch_dropout,
123
+ input_patchnorm=vision_cfg.input_patchnorm,
124
+ global_average_pool=vision_cfg.global_average_pool,
125
+ attentional_pool=vision_cfg.attentional_pool,
126
+ n_queries=vision_cfg.n_queries,
127
+ attn_pooler_heads=vision_cfg.attn_pooler_heads,
128
+ output_tokens=vision_cfg.output_tokens,
129
+ output_dim=embed_dim,
130
+ act_layer=act_layer,
131
+ norm_layer=norm_layer,
132
+ )
133
+
134
+ return visual
135
+
136
+
137
+ def _build_text_tower(
+         embed_dim: int,
+         text_cfg: CLIPTextCfg,
+         quick_gelu: bool = False,
+         cast_dtype: Optional[torch.dtype] = None,
+ ):
+     if isinstance(text_cfg, dict):
+         text_cfg = CLIPTextCfg(**text_cfg)
+
+     if text_cfg.hf_model_name:
+         text = HFTextEncoder(
+             text_cfg.hf_model_name,
+             output_dim=embed_dim,
+             proj=text_cfg.proj,
+             pooler_type=text_cfg.pooler_type,
+             pretrained=text_cfg.hf_model_pretrained,
+             output_tokens=text_cfg.output_tokens,
+         )
+     else:
+         act_layer = QuickGELU if quick_gelu else nn.GELU
+         norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
+
+         text = TextTransformer(
+             context_length=text_cfg.context_length,
+             vocab_size=text_cfg.vocab_size,
+             width=text_cfg.width,
+             heads=text_cfg.heads,
+             layers=text_cfg.layers,
+             ls_init_value=text_cfg.ls_init_value,
+             output_dim=embed_dim,
+             embed_cls=text_cfg.embed_cls,
+             output_tokens=text_cfg.output_tokens,
+             pad_id=text_cfg.pad_id,
+             act_layer=act_layer,
+             norm_layer=norm_layer,
+         )
+     return text
+
+
+ class CLIP(nn.Module):
+     output_dict: torch.jit.Final[bool]
+
+     def __init__(
+             self,
+             embed_dim: int,
+             vision_cfg: CLIPVisionCfg,
+             text_cfg: CLIPTextCfg,
+             quick_gelu: bool = False,
+             cast_dtype: Optional[torch.dtype] = None,
+             output_dict: bool = False,
+     ):
+         super().__init__()
+         self.output_dict = output_dict
+         self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
+
+         text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
+         self.transformer = text.transformer
+         self.vocab_size = text.vocab_size
+         self.token_embedding = text.token_embedding
+         self.positional_embedding = text.positional_embedding
+         self.ln_final = text.ln_final
+         self.text_projection = text.text_projection
+         self.register_buffer('attn_mask', text.attn_mask, persistent=False)
+
+         self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+     def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
+         # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
+         self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
+
+     @torch.jit.ignore
+     def set_grad_checkpointing(self, enable=True):
+         self.visual.set_grad_checkpointing(enable)
+         self.transformer.grad_checkpointing = enable
+
+     def encode_image(self, image, normalize: bool = False):
+         features = self.visual(image)
+         return F.normalize(features, dim=-1) if normalize else features
+
+     def encode_text(self, text, normalize: bool = False):
+         cast_dtype = self.transformer.get_cast_dtype()
+
+         x = self.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]
+
+         x = x + self.positional_embedding.to(cast_dtype)
+         x = x.permute(1, 0, 2)  # NLD -> LND
+         x = self.transformer(x, attn_mask=self.attn_mask)
+         x = x.permute(1, 0, 2)  # LND -> NLD
+         x = self.ln_final(x)  # [batch_size, n_ctx, transformer.width]
+         # take features from the eot embedding (eot_token is the highest number in each sequence)
+         # x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+         return F.normalize(x, dim=-1) if normalize else x
+
+     def forward(self, image, text):
+         image_features = self.encode_image(image, normalize=True)
+         text_features = self.encode_text(text, normalize=True)
+         if self.output_dict:
+             return {
+                 "image_features": image_features,
+                 "text_features": text_features,
+                 "logit_scale": self.logit_scale.exp()
+             }
+         return image_features, text_features, self.logit_scale.exp()
+
+
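Note that the EOT pooling/projection line in CLIP.encode_text is commented out in this copy, so encode_text returns the full token sequence of shape [batch_size, n_ctx, width] rather than a pooled [batch_size, embed_dim] embedding, presumably so the StyleDrop pipeline can condition on per-token text features. Upstream open_clip pools exactly as in the commented-out line; a sketch of the difference, not part of the committed file and assuming model, image and text tensors already exist on the same device:

tokens = model.encode_text(text)                   # [batch, n_ctx, width] per-token features (this copy)
# upstream-style pooling: take the EOT token (highest token id) and project into the joint space
pooled = tokens[torch.arange(tokens.shape[0]), text.argmax(dim=-1)] @ model.text_projection
text_emb = F.normalize(pooled, dim=-1)
image_emb = model.encode_image(image, normalize=True)
logits_per_image = model.logit_scale.exp() * image_emb @ text_emb.t()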
+ class CustomTextCLIP(nn.Module):
+     output_dict: torch.jit.Final[bool]
+
+     def __init__(
+             self,
+             embed_dim: int,
+             vision_cfg: CLIPVisionCfg,
+             text_cfg: CLIPTextCfg,
+             quick_gelu: bool = False,
+             cast_dtype: Optional[torch.dtype] = None,
+             output_dict: bool = False,
+     ):
+         super().__init__()
+         self.output_dict = output_dict
+         self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
+         self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
+         self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+     def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
+         # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
+         self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
+
+     def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
+         self.text.lock(unlocked_layers, freeze_layer_norm)
+
+     @torch.jit.ignore
+     def set_grad_checkpointing(self, enable=True):
+         self.visual.set_grad_checkpointing(enable)
+         self.text.set_grad_checkpointing(enable)
+
+     def encode_image(self, image, normalize: bool = False):
+         features = self.visual(image)
+         return F.normalize(features, dim=-1) if normalize else features
+
+     def encode_text(self, text, normalize: bool = False):
+         features = self.text(text)
+         return F.normalize(features, dim=-1) if normalize else features
+
+     def forward(self, image, text):
+         image_features = self.encode_image(image, normalize=True)
+         text_features = self.encode_text(text, normalize=True)
+         if self.output_dict:
+             return {
+                 "image_features": image_features,
+                 "text_features": text_features,
+                 "logit_scale": self.logit_scale.exp()
+             }
+         return image_features, text_features, self.logit_scale.exp()
+
+
+ def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
+     """Convert applicable model parameters to low-precision (bf16 or fp16)"""
+
+     def _convert_weights(l):
+         if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+             l.weight.data = l.weight.data.to(dtype)
+             if l.bias is not None:
+                 l.bias.data = l.bias.data.to(dtype)
+
+         if isinstance(l, (nn.MultiheadAttention, Attention)):
+             for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+                 tensor = getattr(l, attr)
+                 if tensor is not None:
+                     tensor.data = tensor.data.to(dtype)
+
+         for name in ["text_projection", "proj"]:
+             if hasattr(l, name):
+                 attr = getattr(l, name)
+                 if attr is not None:
+                     attr.data = attr.data.to(dtype)
+
+     model.apply(_convert_weights)
+
+
+ convert_weights_to_fp16 = convert_weights_to_lp  # backwards compat
+
+
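convert_weights_to_lp only converts the layer types it recognizes (convs, linears, attention projections, and the text/visual projection tensors); LayerNorm weights and logit_scale stay in float32. A usage sketch, not part of the committed file, building a model from the default configs just to show the effect:

model = CLIP(embed_dim=512, vision_cfg=CLIPVisionCfg(), text_cfg=CLIPTextCfg())
convert_weights_to_lp(model, dtype=torch.bfloat16)   # convert_weights_to_fp16(model) is the fp16 alias
print(model.visual.conv1.weight.dtype)               # torch.bfloat16
print(model.ln_final.weight.dtype)                   # still torch.float32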
+ # used to maintain checkpoint compatibility
+ def convert_to_custom_text_state_dict(state_dict: dict):
+     if 'text_projection' in state_dict:
+         # old format state_dict, move text tower -> .text
+         new_state_dict = {}
+         for k, v in state_dict.items():
+             if any(k.startswith(p) for p in (
+                 'text_projection',
+                 'positional_embedding',
+                 'token_embedding',
+                 'transformer',
+                 'ln_final',
+             )):
+                 k = 'text.' + k
+             new_state_dict[k] = v
+         return new_state_dict
+     return state_dict
+
+
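The remap above just prefixes the five text-tower key families with 'text.' so that an old-format CLIP checkpoint can be loaded into CustomTextCLIP; visual keys pass through untouched. A small illustration with made-up tensor shapes, not part of the committed file:

old_sd = {
    'text_projection': torch.zeros(512, 512),
    'positional_embedding': torch.zeros(77, 512),
    'transformer.resblocks.0.attn.in_proj_weight': torch.zeros(1536, 512),
    'visual.conv1.weight': torch.zeros(768, 3, 16, 16),
}
new_sd = convert_to_custom_text_state_dict(old_sd)
assert 'text.transformer.resblocks.0.attn.in_proj_weight' in new_sd
assert 'visual.conv1.weight' in new_sd               # image-tower keys are unchanged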
+ def build_model_from_openai_state_dict(
+         state_dict: dict,
+         quick_gelu=True,
+         cast_dtype=torch.float16,
+ ):
+     vit = "visual.proj" in state_dict
+
+     if vit:
+         vision_width = state_dict["visual.conv1.weight"].shape[0]
+         vision_layers = len(
+             [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
+         vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+         grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
+         image_size = vision_patch_size * grid_size
+     else:
+         counts: list = [
+             len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
+         vision_layers = tuple(counts)
+         vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+         output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
+         vision_patch_size = None
+         assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
+         image_size = output_width * 32
+
+     embed_dim = state_dict["text_projection"].shape[1]
+     context_length = state_dict["positional_embedding"].shape[0]
+     vocab_size = state_dict["token_embedding.weight"].shape[0]
+     transformer_width = state_dict["ln_final.weight"].shape[0]
+     transformer_heads = transformer_width // 64
+     transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
+
+     vision_cfg = CLIPVisionCfg(
+         layers=vision_layers,
+         width=vision_width,
+         patch_size=vision_patch_size,
+         image_size=image_size,
+     )
+     text_cfg = CLIPTextCfg(
+         context_length=context_length,
+         vocab_size=vocab_size,
+         width=transformer_width,
+         heads=transformer_heads,
+         layers=transformer_layers,
+     )
+     model = CLIP(
+         embed_dim,
+         vision_cfg=vision_cfg,
+         text_cfg=text_cfg,
+         quick_gelu=quick_gelu,  # OpenAI models were trained with QuickGELU
+         cast_dtype=cast_dtype,
+     )
+
+     for key in ["input_resolution", "context_length", "vocab_size"]:
+         state_dict.pop(key, None)
+
+     convert_weights_to_fp16(model)  # OpenAI state dicts are partially converted to float16
+     model.load_state_dict(state_dict)
+     return model.eval()
+
+
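build_model_from_openai_state_dict recovers the whole architecture from tensor shapes alone (for example, the size of the visual positional embedding gives the patch grid and hence the image size) and builds the model with QuickGELU and fp16 weights, matching how the OpenAI checkpoints were trained and shipped. A loading sketch, not part of the committed file; the checkpoint path is illustrative:

import torch

# OpenAI releases are TorchScript archives; fall back to a plain state_dict otherwise
try:
    sd = torch.jit.load('ViT-B-32.pt', map_location='cpu').state_dict()
except RuntimeError:
    sd = torch.load('ViT-B-32.pt', map_location='cpu')
model = build_model_from_openai_state_dict(sd)       # returns the model in eval mode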
+ def trace_model(model, batch_size=256, device=torch.device('cpu')):
+     model.eval()
+     image_size = model.visual.image_size
+     example_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
+     example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
+     model = torch.jit.trace_module(
+         model,
+         inputs=dict(
+             forward=(example_images, example_text),
+             encode_text=(example_text,),
+             encode_image=(example_images,)
+         ))
+     model.visual.image_size = image_size
+     return model
+
+
+ def resize_pos_embed(state_dict, model, interpolation: str = 'bicubic', antialias: bool = True):
+     # Rescale the grid of position embeddings when loading from state_dict
+     old_pos_embed = state_dict.get('visual.positional_embedding', None)
+     if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
+         return
+     grid_size = to_2tuple(model.visual.grid_size)
+     extra_tokens = 1  # FIXME detect different token configs (ie no class token, or more)
+     new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+     if new_seq_len == old_pos_embed.shape[0]:
+         return
+
+     if extra_tokens:
+         pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+     else:
+         pos_emb_tok, pos_emb_img = None, old_pos_embed
+     old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
+
+     logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+     pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+     pos_emb_img = F.interpolate(
+         pos_emb_img,
+         size=grid_size,
+         mode=interpolation,
+         antialias=antialias,
+         align_corners=False,
+     )
+     pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+     if pos_emb_tok is not None:
+         new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+     else:
+         new_pos_embed = pos_emb_img
+     state_dict['visual.positional_embedding'] = new_pos_embed
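resize_pos_embed assumes one class/extra token followed by a square grid of patch positions and interpolates only the grid part. For instance, going from a 224-pixel to a 448-pixel input with 32-pixel patches means resizing a 7x7 grid (50 positions including the class token) to 14x14 (197 positions); a sketch of that bookkeeping, not part of the committed file:

import math
import torch
import torch.nn.functional as F

old_pos_embed = torch.randn(50, 768)                      # 1 class token + 7*7 grid
tok, img = old_pos_embed[:1], old_pos_embed[1:]
old = int(math.sqrt(img.shape[0]))                        # 7
img = img.reshape(1, old, old, -1).permute(0, 3, 1, 2)    # [1, C, 7, 7]
img = F.interpolate(img, size=(14, 14), mode='bicubic', antialias=True, align_corners=False)
img = img.permute(0, 2, 3, 1).reshape(14 * 14, -1)
new_pos_embed = torch.cat([tok, img], dim=0)              # [197, 768]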