Spaces: Build error
Khalil committed
Commit • b41a54a
1 Parent(s): 416f940
First commit, add text2punks scripts, app file, and requirements file
- app.py +82 -0
- requirements.txt +9 -0
- text2punks/attention.py +175 -0
- text2punks/data/byte-level-bpe_4k.tokenizer.json +969 -0
- text2punks/data/codebook.pt +0 -0
- text2punks/loader.py +96 -0
- text2punks/text2punk.py +377 -0
- text2punks/tokenizer.py +233 -0
- text2punks/transformer.py +115 -0
- text2punks/utils.py +82 -0
app.py
ADDED
@@ -0,0 +1,82 @@
# system

import os

os.system("gdown https://drive.google.com/uc?id=1--27E5dk8GzgvpVL0ofr-m631iymBpUH")
os.system("gdown https://drive.google.com/uc?id=191a5lTsUPQ1hXaeo6kVNbo_W3WYuXsmF")

# plot

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# gradio

import gradio as gr

# text2punks utils

from text2punks.utils import to_pil_image, model_loader, generate_image


batch_size = 32
num_images = 32
top_prediction = 8

# knobs to tune

top_k = 0.8
temperature = 1.25

# helper functions

def compose_predictions(images):
    # paste the generated punks side by side into one horizontal strip
    increased_h = 0
    h, w = images[0].shape[0], images[0].shape[1]
    image_grid = Image.new("RGB", (len(images)*w, h))

    for i, img_ in enumerate(images):
        image_grid.paste(to_pil_image(img_), (i*w, increased_h))

    return image_grid


def run_inference(prompt, num_images=32, num_preds=8):

    t2p_path, clip_path = './Text2Punk-final-7.pt', './clip-final.pt'
    text2punk, clip = model_loader(t2p_path, clip_path)

    images = generate_image(prompt_text=prompt, top_k=top_k, temperature=temperature, num_images=num_images, batch_size=batch_size, top_prediction=top_prediction, text2punk_model=text2punk, clip_model=clip)
    predictions = compose_predictions(images)

    output_title = f"""
    <b>{prompt}</b>
    """

    return (output_title, predictions)


outputs = [
    gr.outputs.HTML(label=""),   # to be used as title
    gr.outputs.Image(label=''),
]

description = """
Text2Cryptopunks is an AI model that generates CryptoPunks images from a text prompt.
"""

gr.Interface(run_inference,
             inputs=[gr.inputs.Textbox(label='type something like this : "An Ape CryptoPunk that has 2 Attributes, a Pigtails and a Medical Mask."')],
             outputs=outputs,
             title='Text2Cryptopunks',
             description=description,
             article="<p style='text-align: center'> Created by kTonpa | <a href='https://github.com/kTonpa/Text2CryptoPunks'>GitHub</a>",
             layout='vertical',
             theme='huggingface',
             examples=[['Cute Alien cryptopunk that has 2 Attributes, a Pipe, and a Beanie.'], ['A low resolution photo of punky-looking Ape that has 2 Attributes, a Beanie, and a Medical Mask.']],
             allow_flagging=False,
             live=False,
             # server_port=8999
             ).launch(share=True)
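For a quick check outside the Gradio UI, the same helpers can be called directly. A minimal sketch, assuming the two checkpoints above have already been fetched with gdown and that generate_image returns an iterable of H x W x 3 arrays, which is what compose_predictions expects:

# Quick local smoke test for the helpers app.py uses (run after the gdown downloads above).
# Assumes text2punks.utils exposes the same functions app.py imports.
from PIL import Image
from text2punks.utils import to_pil_image, model_loader, generate_image

text2punk, clip = model_loader('./Text2Punk-final-7.pt', './clip-final.pt')

images = generate_image(
    prompt_text='An Ape CryptoPunk that has 2 Attributes, a Pigtails and a Medical Mask.',
    top_k=0.8, temperature=1.25,
    num_images=32, batch_size=32, top_prediction=8,
    text2punk_model=text2punk, clip_model=clip,
)

# Compose the returned punks into one horizontal strip, exactly as compose_predictions does.
h, w = images[0].shape[0], images[0].shape[1]
grid = Image.new("RGB", (len(images) * w, h))
for i, img_ in enumerate(images):
    grid.paste(to_pil_image(img_), (i * w, 0))
grid.save("predictions.png")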
requirements.txt
ADDED
@@ -0,0 +1,9 @@
torch
torchvision
einops
numpy
ftfy
regex
axial-positional-embedding
youtokentome
tokenizers
text2punks/attention.py
ADDED
@@ -0,0 +1,175 @@
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat

# helpers

def exists(val):
    return val is not None

def max_neg_value(t):
    return -torch.finfo(t.dtype).max


# classes

class Attention(nn.Module):
    def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, attn_dropout = 0., resid_dropout = 0.):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.seq_len = seq_len
        self.scale = dim_head ** -0.5

        self.causal = causal
        self.attn_drop = nn.Dropout(attn_dropout)

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(resid_dropout)
        )

    def forward(self, x):
        h, device = self.heads, x.device

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

        q = q * self.scale

        dots = torch.einsum('b h i d, b h j d -> b h i j', q, k)
        mask_value = max_neg_value(dots)

        if self.causal:
            i, j = dots.shape[-2:]
            mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
            dots.masked_fill_(mask, mask_value)

        attn = torch.softmax(dots, dim=-1)
        attn = self.attn_drop(attn)

        out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return out


# sparse axial causal attention

class SparseAxialCausalAttention(nn.Module):
    def __init__(self, dim, seq_len, image_size = 32, axis = 0, heads = 8, dim_head = 64, attn_dropout = 0., resid_dropout = 0.):
        super().__init__()
        assert axis in {0, 1}, 'axis must be either 0 (along height) or 1 (along width)'
        self.axis = axis

        inner_dim = dim_head * heads
        self.seq_len = seq_len
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.image_size = image_size
        self.attn_drop = nn.Dropout(attn_dropout)

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(resid_dropout)
        )

    def forward(self, x):
        b, n, _, h, img_size, axis, seq_len, device = *x.shape, self.heads, self.image_size, self.axis, self.seq_len, x.device

        img_seq_len = img_size ** 2
        text_len = seq_len + 1 - img_seq_len

        # padding

        padding = seq_len - n + 1
        mask = torch.ones(b, text_len, device = device).bool()

        x = F.pad(x, (0, 0, 0, padding), value = 0)
        mask = mask[:, :text_len]

        # derive queries / keys / values

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)

        # print(self.scale)
        q = q * self.scale

        ((q_text, q_img), (k_text, k_img), (v_text, v_img)) = map(lambda t: (t[:, :-img_seq_len], t[:, -img_seq_len:]), (q, k, v))

        # text attention

        dots_text = einsum('b i d, b j d -> b i j', q_text, k_text)
        mask_value = max_neg_value(dots_text)

        i, j = dots_text.shape[-2:]
        text_causal_mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
        dots_text.masked_fill_(text_causal_mask, mask_value)

        attn_text = torch.softmax(dots_text, dim = -1)

        # attention dropout

        attn_text = self.attn_drop(attn_text)
        out_text = einsum('b i j, b j d -> b i d', attn_text, v_text)

        # image attention

        split_axis_einops = 'b (h w) c -> b h w c' if axis == 0 else 'b (h w) c -> b w h c'
        merge_axis_einops = 'b x n d -> b (x n) d' if axis == 0 else 'b x n d -> b (n x) d'

        # split out axis

        q_img, k_img, v_img = map(lambda t: rearrange(t, split_axis_einops, h = img_size), (q_img, k_img, v_img))

        # similarity

        dots_image_to_image = einsum('b x i d, b x j d -> b x i j', q_img, k_img)
        dots_image_to_text = einsum('b x i d, b j d -> b x i j', q_img, k_text)

        dots = torch.cat((dots_image_to_text, dots_image_to_image), dim = -1)

        # mask so image has full attention to text, but causal along axis

        bh, x, i, j = dots.shape
        causal_mask = torch.ones(i, img_size, device = device).triu_(img_size - i + 1).bool()
        causal_mask = repeat(causal_mask, 'i j -> b x i j', b = bh, x = x)

        mask = repeat(mask, 'b j -> (b h) x i j', h = h, x = x, i = i)
        mask = torch.cat((~mask, causal_mask), dim = -1)

        dots.masked_fill_(mask, mask_value)

        # attention.

        attn = torch.softmax(dots, dim = -1)

        # attention dropout

        attn = self.attn_drop(attn)

        # aggregate

        attn_image_to_text, attn_image_to_image = attn[..., :text_len], attn[..., text_len:]

        out_image_to_image = einsum('b x i j, b x j d -> b x i d', attn_image_to_image, v_img)
        out_image_to_text = einsum('b x i j, b j d -> b x i d', attn_image_to_text, v_text)

        out_image = out_image_to_image + out_image_to_text

        # merge back axis

        out_image = rearrange(out_image, merge_axis_einops, x = img_size)

        # combine attended values for both text and image

        out = torch.cat((out_text, out_image), dim = 1)

        out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
        out = self.to_out(out)
        return out[:, :n]
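A quick shape check for the two modules above, as a hedged sketch. The sizes mirror a 24 x 24 punk grid plus a 40-token caption, which are illustrative values, not a config file in this commit:

# Illustrative shape check: both attention variants map (batch, seq, dim) -> (batch, seq, dim).
import torch
from text2punks.attention import Attention, SparseAxialCausalAttention

dim, heads, dim_head = 512, 8, 64
text_len, image_size = 40, 24                  # assumed caption length and image side
seq_len = text_len + image_size ** 2           # 616 = text tokens + 576 image tokens

x = torch.randn(2, seq_len, dim)               # (batch, sequence, dim)

full_attn = Attention(dim, seq_len, causal=True, heads=heads, dim_head=dim_head)
row_attn = SparseAxialCausalAttention(dim, seq_len, image_size=image_size, axis=0,
                                      heads=heads, dim_head=dim_head)

print(full_attn(x).shape)   # torch.Size([2, 616, 512])
print(row_attn(x).shape)    # torch.Size([2, 616, 512])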
text2punks/data/byte-level-bpe_4k.tokenizer.json
ADDED
@@ -0,0 +1,969 @@
1 |
+
{
|
2 |
+
"version": "1.0",
|
3 |
+
"truncation": null,
|
4 |
+
"padding": null,
|
5 |
+
"added_tokens": [
|
6 |
+
{
|
7 |
+
"id": 0,
|
8 |
+
"special": true,
|
9 |
+
"content": "[PAD]",
|
10 |
+
"single_word": false,
|
11 |
+
"lstrip": false,
|
12 |
+
"rstrip": false,
|
13 |
+
"normalized": false
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"id": 1,
|
17 |
+
"special": true,
|
18 |
+
"content": "[SEP]",
|
19 |
+
"single_word": false,
|
20 |
+
"lstrip": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"normalized": false
|
23 |
+
}
|
24 |
+
],
|
25 |
+
"normalizer": {
|
26 |
+
"type": "Lowercase"
|
27 |
+
},
|
28 |
+
"pre_tokenizer": {
|
29 |
+
"type": "ByteLevel",
|
30 |
+
"add_prefix_space": false,
|
31 |
+
"trim_offsets": true
|
32 |
+
},
|
33 |
+
"post_processor": {
|
34 |
+
"type": "ByteLevel",
|
35 |
+
"add_prefix_space": true,
|
36 |
+
"trim_offsets": true
|
37 |
+
},
|
38 |
+
"decoder": {
|
39 |
+
"type": "ByteLevel",
|
40 |
+
"add_prefix_space": true,
|
41 |
+
"trim_offsets": true
|
42 |
+
},
|
43 |
+
"model": {
|
44 |
+
"type": "BPE",
|
45 |
+
"dropout": null,
|
46 |
+
"unk_token": null,
|
47 |
+
"continuing_subword_prefix": null,
|
48 |
+
"end_of_word_suffix": null,
|
49 |
+
"fuse_unk": false,
|
50 |
+
"vocab": {
|
51 |
+
"[PAD]": 0,
|
52 |
+
"[SEP]": 1,
|
53 |
+
",": 2,
|
54 |
+
"-": 3,
|
55 |
+
".": 4,
|
56 |
+
"0": 5,
|
57 |
+
"1": 6,
|
58 |
+
"2": 7,
|
59 |
+
"3": 8,
|
60 |
+
"4": 9,
|
61 |
+
"5": 10,
|
62 |
+
"6": 11,
|
63 |
+
"7": 12,
|
64 |
+
"?": 13,
|
65 |
+
"a": 14,
|
66 |
+
"b": 15,
|
67 |
+
"c": 16,
|
68 |
+
"d": 17,
|
69 |
+
"e": 18,
|
70 |
+
"f": 19,
|
71 |
+
"g": 20,
|
72 |
+
"h": 21,
|
73 |
+
"i": 22,
|
74 |
+
"k": 23,
|
75 |
+
"l": 24,
|
76 |
+
"m": 25,
|
77 |
+
"n": 26,
|
78 |
+
"o": 27,
|
79 |
+
"p": 28,
|
80 |
+
"r": 29,
|
81 |
+
"s": 30,
|
82 |
+
"t": 31,
|
83 |
+
"u": 32,
|
84 |
+
"v": 33,
|
85 |
+
"w": 34,
|
86 |
+
"x": 35,
|
87 |
+
"y": 36,
|
88 |
+
"z": 37,
|
89 |
+
"Ċ": 38,
|
90 |
+
"Ġ": 39,
|
91 |
+
"Ġa": 40,
|
92 |
+
"nd": 41,
|
93 |
+
"Ġb": 42,
|
94 |
+
"ha": 43,
|
95 |
+
"le": 44,
|
96 |
+
"ma": 45,
|
97 |
+
"Ġc": 46,
|
98 |
+
"ro": 47,
|
99 |
+
"pu": 48,
|
100 |
+
"ck": 49,
|
101 |
+
"to": 50,
|
102 |
+
"Ġand": 51,
|
103 |
+
"ack": 52,
|
104 |
+
"ar": 53,
|
105 |
+
"Ġma": 54,
|
106 |
+
"nk": 55,
|
107 |
+
"gro": 56,
|
108 |
+
"und": 57,
|
109 |
+
"Ġback": 58,
|
110 |
+
"punk": 59,
|
111 |
+
"ground": 60,
|
112 |
+
"Ġbackground": 61,
|
113 |
+
"Ġha": 62,
|
114 |
+
"Ġcr": 63,
|
115 |
+
"in": 64,
|
116 |
+
"Ġs": 65,
|
117 |
+
"Ġo": 66,
|
118 |
+
"pto": 67,
|
119 |
+
"ypto": 68,
|
120 |
+
"Ġcrypto": 69,
|
121 |
+
"Ġcryptopunk": 70,
|
122 |
+
"Ġof": 71,
|
123 |
+
"es": 72,
|
124 |
+
"Ġw": 73,
|
125 |
+
"Ġwi": 74,
|
126 |
+
"th": 75,
|
127 |
+
"ing": 76,
|
128 |
+
"ho": 77,
|
129 |
+
"lo": 78,
|
130 |
+
"Ġwith": 79,
|
131 |
+
"Ġp": 80,
|
132 |
+
"Ġmale": 81,
|
133 |
+
"Ġg": 82,
|
134 |
+
"Ġf": 83,
|
135 |
+
"ear": 84,
|
136 |
+
"Ġhas": 85,
|
137 |
+
"Ġm": 86,
|
138 |
+
"lu": 87,
|
139 |
+
"Ġt": 88,
|
140 |
+
"re": 89,
|
141 |
+
"Ġfe": 90,
|
142 |
+
"de": 91,
|
143 |
+
"wn": 92,
|
144 |
+
"ok": 93,
|
145 |
+
"hoto": 94,
|
146 |
+
"look": 95,
|
147 |
+
"Ġphoto": 96,
|
148 |
+
"hat": 97,
|
149 |
+
"Ġthat": 98,
|
150 |
+
"Ġmade": 99,
|
151 |
+
"male": 100,
|
152 |
+
"Ġfemale": 101,
|
153 |
+
"tr": 102,
|
154 |
+
"ir": 103,
|
155 |
+
"Ġhair": 104,
|
156 |
+
"Ġsha": 105,
|
157 |
+
"ple": 106,
|
158 |
+
"rple": 107,
|
159 |
+
"Ġpu": 108,
|
160 |
+
"Ġpurple": 109,
|
161 |
+
"ut": 110,
|
162 |
+
"en": 111,
|
163 |
+
"Ġgre": 112,
|
164 |
+
"Ġgreen": 113,
|
165 |
+
"Ġshad": 114,
|
166 |
+
"Ġblu": 115,
|
167 |
+
"Ġblue": 116,
|
168 |
+
"Ġmo": 117,
|
169 |
+
"looking": 118,
|
170 |
+
"li": 119,
|
171 |
+
"Ġr": 120,
|
172 |
+
"rown": 121,
|
173 |
+
"ti": 122,
|
174 |
+
"Ġli": 123,
|
175 |
+
"la": 124,
|
176 |
+
"ap": 125,
|
177 |
+
"Ġbear": 126,
|
178 |
+
"Ġbeard": 127,
|
179 |
+
"are": 128,
|
180 |
+
"Ġbrown": 129,
|
181 |
+
"wk": 130,
|
182 |
+
"hawk": 131,
|
183 |
+
"Ġmohawk": 132,
|
184 |
+
"ig": 133,
|
185 |
+
"ring": 134,
|
186 |
+
"Ġear": 135,
|
187 |
+
"Ġearring": 136,
|
188 |
+
"ed": 137,
|
189 |
+
"ey": 138,
|
190 |
+
"Ġey": 139,
|
191 |
+
"but": 140,
|
192 |
+
"ibut": 141,
|
193 |
+
"ttr": 142,
|
194 |
+
"Ġattr": 143,
|
195 |
+
"punky": 144,
|
196 |
+
"Ġattribut": 145,
|
197 |
+
"ps": 146,
|
198 |
+
"Ġattributes": 147,
|
199 |
+
"Ġcap": 148,
|
200 |
+
"ss": 149,
|
201 |
+
"tick": 150,
|
202 |
+
"Ġlips": 151,
|
203 |
+
"Ġlipstick": 152,
|
204 |
+
"Ġshades": 153,
|
205 |
+
"lass": 154,
|
206 |
+
"Ġn": 155,
|
207 |
+
"Ġeye": 156,
|
208 |
+
"Ġpunky": 157,
|
209 |
+
"Ġrare": 158,
|
210 |
+
"Ġho": 159,
|
211 |
+
"ow": 160,
|
212 |
+
"Ġglass": 161,
|
213 |
+
"Ġglasses": 162,
|
214 |
+
"tt": 163,
|
215 |
+
"Ġshadow": 164,
|
216 |
+
"Ġ3": 165,
|
217 |
+
"Ġgo": 166,
|
218 |
+
"or": 167,
|
219 |
+
"Ġd": 168,
|
220 |
+
"mal": 169,
|
221 |
+
"Ġpi": 170,
|
222 |
+
"Ġstr": 171,
|
223 |
+
"he": 172,
|
224 |
+
"Ġclo": 173,
|
225 |
+
"Ġclown": 174,
|
226 |
+
"ke": 175,
|
227 |
+
"od": 176,
|
228 |
+
"ark": 177,
|
229 |
+
"Ġdark": 178,
|
230 |
+
"ld": 179,
|
231 |
+
"ce": 180,
|
232 |
+
"Ġcig": 181,
|
233 |
+
"arett": 182,
|
234 |
+
"Ġcigarett": 183,
|
235 |
+
"Ġcigarette": 184,
|
236 |
+
"lack": 185,
|
237 |
+
"Ġblack": 186,
|
238 |
+
"and": 187,
|
239 |
+
"Ġnor": 188,
|
240 |
+
"Ġnormal": 189,
|
241 |
+
"Ġ2": 190,
|
242 |
+
"nt": 191,
|
243 |
+
"ront": 192,
|
244 |
+
"Ġfront": 193,
|
245 |
+
"Ġlooking": 194,
|
246 |
+
"car": 195,
|
247 |
+
"cut": 196,
|
248 |
+
"ela": 197,
|
249 |
+
"on": 198,
|
250 |
+
"olu": 199,
|
251 |
+
"ted": 200,
|
252 |
+
"up": 201,
|
253 |
+
"xela": 202,
|
254 |
+
"Ġlo": 203,
|
255 |
+
"Ġlook": 204,
|
256 |
+
"Ġup": 205,
|
257 |
+
"Ġsing": 206,
|
258 |
+
"Ġscar": 207,
|
259 |
+
"esolu": 208,
|
260 |
+
"how": 209,
|
261 |
+
"Ġresolu": 210,
|
262 |
+
"tion": 211,
|
263 |
+
"Ġlike": 212,
|
264 |
+
"Ġgood": 213,
|
265 |
+
"Ġpixela": 214,
|
266 |
+
"cute": 215,
|
267 |
+
"Ġlow": 216,
|
268 |
+
"Ġsingle": 217,
|
269 |
+
"Ġscarce": 218,
|
270 |
+
"Ġresolution": 219,
|
271 |
+
"Ġpixelated": 220,
|
272 |
+
"fu": 221,
|
273 |
+
"nn": 222,
|
274 |
+
"funn": 223,
|
275 |
+
"funny": 224,
|
276 |
+
"Ġeyes": 225,
|
277 |
+
"Ġhe": 226,
|
278 |
+
"at": 227,
|
279 |
+
"Ġv": 228,
|
280 |
+
"aig": 229,
|
281 |
+
"ht": 230,
|
282 |
+
"Ġstraig": 231,
|
283 |
+
"Ġstraight": 232,
|
284 |
+
"er": 233,
|
285 |
+
"Ġwild": 234,
|
286 |
+
"ad": 235,
|
287 |
+
"Ġhead": 236,
|
288 |
+
"Ġhot": 237,
|
289 |
+
"Ġbig": 238,
|
290 |
+
"ic": 239,
|
291 |
+
"Ġre": 240,
|
292 |
+
"Ġmole": 241,
|
293 |
+
"an": 242,
|
294 |
+
"mp": 243,
|
295 |
+
"sy": 244,
|
296 |
+
"us": 245,
|
297 |
+
"Ġner": 246,
|
298 |
+
"Ġnerd": 247,
|
299 |
+
"nde": 248,
|
300 |
+
"Ġblo": 249,
|
301 |
+
"Ġblonde": 250,
|
302 |
+
"im": 251,
|
303 |
+
"ned": 252,
|
304 |
+
"rned": 253,
|
305 |
+
"Ġrim": 254,
|
306 |
+
"Ġhorned": 255,
|
307 |
+
"Ġhat": 256,
|
308 |
+
"gu": 257,
|
309 |
+
"lar": 258,
|
310 |
+
"Ġregu": 259,
|
311 |
+
"Ġregular": 260,
|
312 |
+
"Ġclass": 261,
|
313 |
+
"Ġclassic": 262,
|
314 |
+
"Ġband": 263,
|
315 |
+
"ana": 264,
|
316 |
+
"Ġbandana": 265,
|
317 |
+
"sk": 266,
|
318 |
+
"Ġmask": 267,
|
319 |
+
"ingy": 268,
|
320 |
+
"Ġstringy": 269,
|
321 |
+
"ch": 270,
|
322 |
+
"Ġpat": 271,
|
323 |
+
"Ġpatch": 272,
|
324 |
+
"essy": 273,
|
325 |
+
"Ġmessy": 274,
|
326 |
+
"ved": 275,
|
327 |
+
"Ġshaved": 276,
|
328 |
+
"ru": 277,
|
329 |
+
"Ġfru": 278,
|
330 |
+
"mpy": 279,
|
331 |
+
"Ġfrumpy": 280,
|
332 |
+
"Ġth": 281,
|
333 |
+
"Ġthin": 282,
|
334 |
+
"Ġsp": 283,
|
335 |
+
"itt": 284,
|
336 |
+
"kn": 285,
|
337 |
+
"Ġkn": 286,
|
338 |
+
"itted": 287,
|
339 |
+
"Ġknitted": 288,
|
340 |
+
"az": 289,
|
341 |
+
"Ġcraz": 290,
|
342 |
+
"Ġcrazy": 291,
|
343 |
+
"band": 292,
|
344 |
+
"Ġheadband": 293,
|
345 |
+
"ie": 294,
|
346 |
+
"ta": 295,
|
347 |
+
"Ġsmal": 296,
|
348 |
+
"Ġsmall": 297,
|
349 |
+
"pe": 298,
|
350 |
+
"Ġvr": 299,
|
351 |
+
"Ġ4": 300,
|
352 |
+
"hain": 301,
|
353 |
+
"Ġchain": 302,
|
354 |
+
"Ġpipe": 303,
|
355 |
+
"ak": 304,
|
356 |
+
"cho": 305,
|
357 |
+
"eak": 306,
|
358 |
+
"ike": 307,
|
359 |
+
"ncho": 308,
|
360 |
+
"toncho": 309,
|
361 |
+
"Ġpeak": 310,
|
362 |
+
"Ġmut": 311,
|
363 |
+
"Ġspike": 312,
|
364 |
+
"tonchops": 313,
|
365 |
+
"Ġmuttonchops": 314,
|
366 |
+
"ag": 315,
|
367 |
+
"rag": 316,
|
368 |
+
"Ġdo": 317,
|
369 |
+
"Ġgoat": 318,
|
370 |
+
"che": 319,
|
371 |
+
"Ġmus": 320,
|
372 |
+
"tache": 321,
|
373 |
+
"Ġmustache": 322,
|
374 |
+
"ur": 323,
|
375 |
+
"io": 324,
|
376 |
+
"xur": 325,
|
377 |
+
"Ġlu": 326,
|
378 |
+
"ious": 327,
|
379 |
+
"xurious": 328,
|
380 |
+
"Ġluxurious": 329,
|
381 |
+
"hin": 330,
|
382 |
+
"str": 331,
|
383 |
+
"Ġchin": 332,
|
384 |
+
"strap": 333,
|
385 |
+
"Ġchinstrap": 334,
|
386 |
+
"ape": 335,
|
387 |
+
"Ġvape": 336,
|
388 |
+
"bar": 337,
|
389 |
+
"ndle": 338,
|
390 |
+
"Ġhandle": 339,
|
391 |
+
"bars": 340,
|
392 |
+
"Ġhandlebars": 341,
|
393 |
+
"Ġfrown": 342,
|
394 |
+
"Ġhood": 343,
|
395 |
+
"Ġhoodie": 344,
|
396 |
+
"war": 345,
|
397 |
+
"Ġfor": 346,
|
398 |
+
"ward": 347,
|
399 |
+
"Ġforward": 348,
|
400 |
+
"il": 349,
|
401 |
+
"ile": 350,
|
402 |
+
"mile": 351,
|
403 |
+
"Ġsmile": 352,
|
404 |
+
"Ġno": 353,
|
405 |
+
"se": 354,
|
406 |
+
"Ġnose": 355,
|
407 |
+
"oli": 356,
|
408 |
+
"Ġpoli": 357,
|
409 |
+
"Ġpolice": 358,
|
410 |
+
"dor": 359,
|
411 |
+
"Ġfedor": 360,
|
412 |
+
"Ġfedora": 361,
|
413 |
+
"ass": 362,
|
414 |
+
"Ġtass": 363,
|
415 |
+
"Ġtassle": 364,
|
416 |
+
"al": 365,
|
417 |
+
"Ġmed": 366,
|
418 |
+
"ical": 367,
|
419 |
+
"Ġmedical": 368,
|
420 |
+
"Ġgold": 369,
|
421 |
+
"ver": 370,
|
422 |
+
"Ġsil": 371,
|
423 |
+
"Ġsilver": 372,
|
424 |
+
"amp": 373,
|
425 |
+
"ire": 374,
|
426 |
+
"lf": 375,
|
427 |
+
"ob": 376,
|
428 |
+
"Ġbob": 377,
|
429 |
+
"Ġhalf": 378,
|
430 |
+
"Ġvamp": 379,
|
431 |
+
"Ġred": 380,
|
432 |
+
"Ġvampire": 381,
|
433 |
+
"bo": 382,
|
434 |
+
"Ġcow": 383,
|
435 |
+
"boy": 384,
|
436 |
+
"Ġcowboy": 385,
|
437 |
+
"hi": 386,
|
438 |
+
"te": 387,
|
439 |
+
"Ġwhi": 388,
|
440 |
+
"Ġwhite": 389,
|
441 |
+
"rt": 390,
|
442 |
+
"Ġsho": 391,
|
443 |
+
"Ġshort": 392,
|
444 |
+
"ek": 393,
|
445 |
+
"Ġro": 394,
|
446 |
+
"Ġche": 395,
|
447 |
+
"eks": 396,
|
448 |
+
"Ġrosy": 397,
|
449 |
+
"Ġcheeks": 398,
|
450 |
+
"ot": 399,
|
451 |
+
"Ġspot": 400,
|
452 |
+
"Ġspots": 401,
|
453 |
+
"Ġto": 402,
|
454 |
+
"Ġtop": 403,
|
455 |
+
"Ġpink": 404,
|
456 |
+
"Ġpig": 405,
|
457 |
+
"tail": 406,
|
458 |
+
"Ġpigtail": 407,
|
459 |
+
"Ġpigtails": 408,
|
460 |
+
"Ġz": 409,
|
461 |
+
"bie": 410,
|
462 |
+
"mbie": 411,
|
463 |
+
"ombie": 412,
|
464 |
+
"Ġzombie": 413,
|
465 |
+
"eld": 414,
|
466 |
+
"gg": 415,
|
467 |
+
"les": 416,
|
468 |
+
"Ġweld": 417,
|
469 |
+
"Ġgogg": 418,
|
470 |
+
"Ġwelding": 419,
|
471 |
+
"Ġgoggles": 420,
|
472 |
+
"ee": 421,
|
473 |
+
"uck": 422,
|
474 |
+
"Ġbuck": 423,
|
475 |
+
"Ġtee": 424,
|
476 |
+
"Ġteeth": 425,
|
477 |
+
"Ġ1": 426,
|
478 |
+
"ge": 427,
|
479 |
+
"ide": 428,
|
480 |
+
"ran": 429,
|
481 |
+
"Ġside": 430,
|
482 |
+
"Ġoran": 431,
|
483 |
+
"Ġorange": 432,
|
484 |
+
"Ġattribute": 433,
|
485 |
+
"iar": 434,
|
486 |
+
"Ġtiar": 435,
|
487 |
+
"Ġtiara": 436,
|
488 |
+
"et": 437,
|
489 |
+
"lm": 438,
|
490 |
+
"lot": 439,
|
491 |
+
"Ġpilot": 440,
|
492 |
+
"Ġhelm": 441,
|
493 |
+
"Ġhelmet": 442,
|
494 |
+
"Ġcho": 443,
|
495 |
+
"ker": 444,
|
496 |
+
"Ġchoker": 445,
|
497 |
+
"ean": 446,
|
498 |
+
"Ġbean": 447,
|
499 |
+
"Ġbeanie": 448,
|
500 |
+
"Ġ5": 449,
|
501 |
+
"Ġape": 450,
|
502 |
+
"Ġali": 451,
|
503 |
+
"Ġalien": 452,
|
504 |
+
"Ġ6": 453,
|
505 |
+
"imple": 454,
|
506 |
+
"Ġ0": 455,
|
507 |
+
"Ġsimple": 456,
|
508 |
+
"Ġfeat": 457,
|
509 |
+
"ures": 458,
|
510 |
+
"Ġfeatures": 459,
|
511 |
+
"ace": 460,
|
512 |
+
"ase": 461,
|
513 |
+
"ero": 462,
|
514 |
+
"lin": 463,
|
515 |
+
"simple": 464,
|
516 |
+
"Ġaver": 465,
|
517 |
+
"Ġbut": 466,
|
518 |
+
"Ġbare": 467,
|
519 |
+
"Ġbase": 468,
|
520 |
+
"thing": 469,
|
521 |
+
"Ġface": 470,
|
522 |
+
"age": 471,
|
523 |
+
"Ġnothing": 472,
|
524 |
+
"Ġzero": 473,
|
525 |
+
"line": 474,
|
526 |
+
"Ġaverage": 475,
|
527 |
+
"Ġ7": 476
|
528 |
+
},
|
529 |
+
"merges": [
|
530 |
+
"Ġ a",
|
531 |
+
"n d",
|
532 |
+
"Ġ b",
|
533 |
+
"h a",
|
534 |
+
"l e",
|
535 |
+
"m a",
|
536 |
+
"Ġ c",
|
537 |
+
"r o",
|
538 |
+
"p u",
|
539 |
+
"c k",
|
540 |
+
"t o",
|
541 |
+
"Ġa nd",
|
542 |
+
"a ck",
|
543 |
+
"a r",
|
544 |
+
"Ġ ma",
|
545 |
+
"n k",
|
546 |
+
"g ro",
|
547 |
+
"u nd",
|
548 |
+
"Ġb ack",
|
549 |
+
"pu nk",
|
550 |
+
"gro und",
|
551 |
+
"Ġback ground",
|
552 |
+
"Ġ ha",
|
553 |
+
"Ġc r",
|
554 |
+
"i n",
|
555 |
+
"Ġ s",
|
556 |
+
"Ġ o",
|
557 |
+
"p to",
|
558 |
+
"y pto",
|
559 |
+
"Ġcr ypto",
|
560 |
+
"Ġcrypto punk",
|
561 |
+
"Ġo f",
|
562 |
+
"e s",
|
563 |
+
"Ġ w",
|
564 |
+
"Ġw i",
|
565 |
+
"t h",
|
566 |
+
"in g",
|
567 |
+
"h o",
|
568 |
+
"l o",
|
569 |
+
"Ġwi th",
|
570 |
+
"Ġ p",
|
571 |
+
"Ġma le",
|
572 |
+
"Ġ g",
|
573 |
+
"Ġ f",
|
574 |
+
"e ar",
|
575 |
+
"Ġha s",
|
576 |
+
"Ġ m",
|
577 |
+
"l u",
|
578 |
+
"Ġ t",
|
579 |
+
"r e",
|
580 |
+
"Ġf e",
|
581 |
+
"d e",
|
582 |
+
"w n",
|
583 |
+
"o k",
|
584 |
+
"ho to",
|
585 |
+
"lo ok",
|
586 |
+
"Ġp hoto",
|
587 |
+
"ha t",
|
588 |
+
"Ġt hat",
|
589 |
+
"Ġma de",
|
590 |
+
"ma le",
|
591 |
+
"Ġfe male",
|
592 |
+
"t r",
|
593 |
+
"i r",
|
594 |
+
"Ġha ir",
|
595 |
+
"Ġs ha",
|
596 |
+
"p le",
|
597 |
+
"r ple",
|
598 |
+
"Ġ pu",
|
599 |
+
"Ġpu rple",
|
600 |
+
"u t",
|
601 |
+
"e n",
|
602 |
+
"Ġg re",
|
603 |
+
"Ġgre en",
|
604 |
+
"Ġsha d",
|
605 |
+
"Ġb lu",
|
606 |
+
"Ġblu e",
|
607 |
+
"Ġm o",
|
608 |
+
"look ing",
|
609 |
+
"l i",
|
610 |
+
"Ġ r",
|
611 |
+
"ro wn",
|
612 |
+
"t i",
|
613 |
+
"Ġ li",
|
614 |
+
"l a",
|
615 |
+
"a p",
|
616 |
+
"Ġb ear",
|
617 |
+
"Ġbear d",
|
618 |
+
"ar e",
|
619 |
+
"Ġb rown",
|
620 |
+
"w k",
|
621 |
+
"ha wk",
|
622 |
+
"Ġmo hawk",
|
623 |
+
"i g",
|
624 |
+
"r ing",
|
625 |
+
"Ġ ear",
|
626 |
+
"Ġear ring",
|
627 |
+
"e d",
|
628 |
+
"e y",
|
629 |
+
"Ġ ey",
|
630 |
+
"b ut",
|
631 |
+
"i but",
|
632 |
+
"t tr",
|
633 |
+
"Ġa ttr",
|
634 |
+
"punk y",
|
635 |
+
"Ġattr ibut",
|
636 |
+
"p s",
|
637 |
+
"Ġattribut es",
|
638 |
+
"Ġc ap",
|
639 |
+
"s s",
|
640 |
+
"ti ck",
|
641 |
+
"Ġli ps",
|
642 |
+
"Ġlips tick",
|
643 |
+
"Ġshad es",
|
644 |
+
"la ss",
|
645 |
+
"Ġ n",
|
646 |
+
"Ġey e",
|
647 |
+
"Ġ punky",
|
648 |
+
"Ġr are",
|
649 |
+
"Ġ ho",
|
650 |
+
"o w",
|
651 |
+
"Ġg lass",
|
652 |
+
"Ġglass es",
|
653 |
+
"t t",
|
654 |
+
"Ġshad ow",
|
655 |
+
"Ġ 3",
|
656 |
+
"Ġg o",
|
657 |
+
"o r",
|
658 |
+
"Ġ d",
|
659 |
+
"ma l",
|
660 |
+
"Ġp i",
|
661 |
+
"Ġs tr",
|
662 |
+
"h e",
|
663 |
+
"Ġc lo",
|
664 |
+
"Ġclo wn",
|
665 |
+
"k e",
|
666 |
+
"o d",
|
667 |
+
"ar k",
|
668 |
+
"Ġd ark",
|
669 |
+
"l d",
|
670 |
+
"c e",
|
671 |
+
"Ġc ig",
|
672 |
+
"are tt",
|
673 |
+
"Ġcig arett",
|
674 |
+
"Ġcigarett e",
|
675 |
+
"l ack",
|
676 |
+
"Ġb lack",
|
677 |
+
"a nd",
|
678 |
+
"Ġn or",
|
679 |
+
"Ġnor mal",
|
680 |
+
"Ġ 2",
|
681 |
+
"n t",
|
682 |
+
"ro nt",
|
683 |
+
"Ġf ront",
|
684 |
+
"Ġ looking",
|
685 |
+
"c ar",
|
686 |
+
"c ut",
|
687 |
+
"e la",
|
688 |
+
"o n",
|
689 |
+
"o lu",
|
690 |
+
"t ed",
|
691 |
+
"u p",
|
692 |
+
"x ela",
|
693 |
+
"Ġ lo",
|
694 |
+
"Ġ look",
|
695 |
+
"Ġ up",
|
696 |
+
"Ġs ing",
|
697 |
+
"Ġs car",
|
698 |
+
"es olu",
|
699 |
+
"ho w",
|
700 |
+
"Ġr esolu",
|
701 |
+
"ti on",
|
702 |
+
"Ġli ke",
|
703 |
+
"Ġgo od",
|
704 |
+
"Ġpi xela",
|
705 |
+
"cut e",
|
706 |
+
"Ġlo w",
|
707 |
+
"Ġsing le",
|
708 |
+
"Ġscar ce",
|
709 |
+
"Ġresolu tion",
|
710 |
+
"Ġpixela ted",
|
711 |
+
"f u",
|
712 |
+
"n n",
|
713 |
+
"fu nn",
|
714 |
+
"funn y",
|
715 |
+
"Ġey es",
|
716 |
+
"Ġ he",
|
717 |
+
"a t",
|
718 |
+
"Ġ v",
|
719 |
+
"a ig",
|
720 |
+
"h t",
|
721 |
+
"Ġstr aig",
|
722 |
+
"Ġstraig ht",
|
723 |
+
"e r",
|
724 |
+
"Ġwi ld",
|
725 |
+
"a d",
|
726 |
+
"Ġhe ad",
|
727 |
+
"Ġho t",
|
728 |
+
"Ġb ig",
|
729 |
+
"i c",
|
730 |
+
"Ġ re",
|
731 |
+
"Ġmo le",
|
732 |
+
"a n",
|
733 |
+
"m p",
|
734 |
+
"s y",
|
735 |
+
"u s",
|
736 |
+
"Ġn er",
|
737 |
+
"Ġner d",
|
738 |
+
"nd e",
|
739 |
+
"Ġb lo",
|
740 |
+
"Ġblo nde",
|
741 |
+
"i m",
|
742 |
+
"n ed",
|
743 |
+
"r ned",
|
744 |
+
"Ġr im",
|
745 |
+
"Ġho rned",
|
746 |
+
"Ġha t",
|
747 |
+
"g u",
|
748 |
+
"l ar",
|
749 |
+
"Ġre gu",
|
750 |
+
"Ġregu lar",
|
751 |
+
"Ġc lass",
|
752 |
+
"Ġclass ic",
|
753 |
+
"Ġb and",
|
754 |
+
"an a",
|
755 |
+
"Ġband ana",
|
756 |
+
"s k",
|
757 |
+
"Ġma sk",
|
758 |
+
"ing y",
|
759 |
+
"Ġstr ingy",
|
760 |
+
"c h",
|
761 |
+
"Ġp at",
|
762 |
+
"Ġpat ch",
|
763 |
+
"es sy",
|
764 |
+
"Ġm essy",
|
765 |
+
"v ed",
|
766 |
+
"Ġsha ved",
|
767 |
+
"r u",
|
768 |
+
"Ġf ru",
|
769 |
+
"mp y",
|
770 |
+
"Ġfru mpy",
|
771 |
+
"Ġ th",
|
772 |
+
"Ġth in",
|
773 |
+
"Ġs p",
|
774 |
+
"i tt",
|
775 |
+
"k n",
|
776 |
+
"Ġ kn",
|
777 |
+
"itt ed",
|
778 |
+
"Ġkn itted",
|
779 |
+
"a z",
|
780 |
+
"Ġcr az",
|
781 |
+
"Ġcraz y",
|
782 |
+
"b and",
|
783 |
+
"Ġhead band",
|
784 |
+
"i e",
|
785 |
+
"t a",
|
786 |
+
"Ġs mal",
|
787 |
+
"Ġsmal l",
|
788 |
+
"p e",
|
789 |
+
"Ġv r",
|
790 |
+
"Ġ 4",
|
791 |
+
"ha in",
|
792 |
+
"Ġc hain",
|
793 |
+
"Ġpi pe",
|
794 |
+
"a k",
|
795 |
+
"c ho",
|
796 |
+
"e ak",
|
797 |
+
"i ke",
|
798 |
+
"n cho",
|
799 |
+
"to ncho",
|
800 |
+
"Ġp eak",
|
801 |
+
"Ġm ut",
|
802 |
+
"Ġsp ike",
|
803 |
+
"toncho ps",
|
804 |
+
"Ġmut tonchops",
|
805 |
+
"a g",
|
806 |
+
"r ag",
|
807 |
+
"Ġd o",
|
808 |
+
"Ġgo at",
|
809 |
+
"c he",
|
810 |
+
"Ġm us",
|
811 |
+
"ta che",
|
812 |
+
"Ġmus tache",
|
813 |
+
"u r",
|
814 |
+
"i o",
|
815 |
+
"x ur",
|
816 |
+
"Ġ lu",
|
817 |
+
"io us",
|
818 |
+
"xur ious",
|
819 |
+
"Ġlu xurious",
|
820 |
+
"h in",
|
821 |
+
"s tr",
|
822 |
+
"Ġc hin",
|
823 |
+
"str ap",
|
824 |
+
"Ġchin strap",
|
825 |
+
"ap e",
|
826 |
+
"Ġv ape",
|
827 |
+
"b ar",
|
828 |
+
"nd le",
|
829 |
+
"Ġha ndle",
|
830 |
+
"bar s",
|
831 |
+
"Ġhandle bars",
|
832 |
+
"Ġf rown",
|
833 |
+
"Ġho od",
|
834 |
+
"Ġhood ie",
|
835 |
+
"w ar",
|
836 |
+
"Ġf or",
|
837 |
+
"war d",
|
838 |
+
"Ġfor ward",
|
839 |
+
"i l",
|
840 |
+
"i le",
|
841 |
+
"m ile",
|
842 |
+
"Ġs mile",
|
843 |
+
"Ġn o",
|
844 |
+
"s e",
|
845 |
+
"Ġno se",
|
846 |
+
"o li",
|
847 |
+
"Ġp oli",
|
848 |
+
"Ġpoli ce",
|
849 |
+
"d or",
|
850 |
+
"Ġfe dor",
|
851 |
+
"Ġfedor a",
|
852 |
+
"a ss",
|
853 |
+
"Ġt ass",
|
854 |
+
"Ġtass le",
|
855 |
+
"a l",
|
856 |
+
"Ġm ed",
|
857 |
+
"ic al",
|
858 |
+
"Ġmed ical",
|
859 |
+
"Ġgo ld",
|
860 |
+
"v er",
|
861 |
+
"Ġs il",
|
862 |
+
"Ġsil ver",
|
863 |
+
"a mp",
|
864 |
+
"i re",
|
865 |
+
"l f",
|
866 |
+
"o b",
|
867 |
+
"Ġb ob",
|
868 |
+
"Ġha lf",
|
869 |
+
"Ġv amp",
|
870 |
+
"Ġre d",
|
871 |
+
"Ġvamp ire",
|
872 |
+
"b o",
|
873 |
+
"Ġc ow",
|
874 |
+
"bo y",
|
875 |
+
"Ġcow boy",
|
876 |
+
"h i",
|
877 |
+
"t e",
|
878 |
+
"Ġw hi",
|
879 |
+
"Ġwhi te",
|
880 |
+
"r t",
|
881 |
+
"Ġs ho",
|
882 |
+
"Ġsho rt",
|
883 |
+
"e k",
|
884 |
+
"Ġ ro",
|
885 |
+
"Ġc he",
|
886 |
+
"ek s",
|
887 |
+
"Ġro sy",
|
888 |
+
"Ġche eks",
|
889 |
+
"o t",
|
890 |
+
"Ġsp ot",
|
891 |
+
"Ġspot s",
|
892 |
+
"Ġ to",
|
893 |
+
"Ġto p",
|
894 |
+
"Ġpi nk",
|
895 |
+
"Ġp ig",
|
896 |
+
"ta il",
|
897 |
+
"Ġpig tail",
|
898 |
+
"Ġpigtail s",
|
899 |
+
"Ġ z",
|
900 |
+
"b ie",
|
901 |
+
"m bie",
|
902 |
+
"o mbie",
|
903 |
+
"Ġz ombie",
|
904 |
+
"e ld",
|
905 |
+
"g g",
|
906 |
+
"le s",
|
907 |
+
"Ġw eld",
|
908 |
+
"Ġgo gg",
|
909 |
+
"Ġweld ing",
|
910 |
+
"Ġgogg les",
|
911 |
+
"e e",
|
912 |
+
"u ck",
|
913 |
+
"Ġb uck",
|
914 |
+
"Ġt ee",
|
915 |
+
"Ġtee th",
|
916 |
+
"Ġ 1",
|
917 |
+
"g e",
|
918 |
+
"i de",
|
919 |
+
"r an",
|
920 |
+
"Ġs ide",
|
921 |
+
"Ġo ran",
|
922 |
+
"Ġoran ge",
|
923 |
+
"Ġattribut e",
|
924 |
+
"i ar",
|
925 |
+
"Ġt iar",
|
926 |
+
"Ġtiar a",
|
927 |
+
"e t",
|
928 |
+
"l m",
|
929 |
+
"lo t",
|
930 |
+
"Ġpi lot",
|
931 |
+
"Ġhe lm",
|
932 |
+
"Ġhelm et",
|
933 |
+
"Ġc ho",
|
934 |
+
"ke r",
|
935 |
+
"Ġcho ker",
|
936 |
+
"e an",
|
937 |
+
"Ġb ean",
|
938 |
+
"Ġbean ie",
|
939 |
+
"Ġ 5",
|
940 |
+
"Ġa pe",
|
941 |
+
"Ġa li",
|
942 |
+
"Ġali en",
|
943 |
+
"Ġ 6",
|
944 |
+
"im ple",
|
945 |
+
"Ġ 0",
|
946 |
+
"Ġs imple",
|
947 |
+
"Ġfe at",
|
948 |
+
"ur es",
|
949 |
+
"Ġfeat ures",
|
950 |
+
"a ce",
|
951 |
+
"a se",
|
952 |
+
"e ro",
|
953 |
+
"l in",
|
954 |
+
"s imple",
|
955 |
+
"Ġa ver",
|
956 |
+
"Ġb ut",
|
957 |
+
"Ġb are",
|
958 |
+
"Ġb ase",
|
959 |
+
"th ing",
|
960 |
+
"Ġf ace",
|
961 |
+
"ag e",
|
962 |
+
"Ġno thing",
|
963 |
+
"Ġz ero",
|
964 |
+
"lin e",
|
965 |
+
"Ġaver age",
|
966 |
+
"Ġ 7"
|
967 |
+
]
|
968 |
+
}
|
969 |
+
}
|
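This file is a standard Hugging Face tokenizers byte-level BPE definition (lowercase normalizer, ByteLevel pre-tokenizer, a vocab of a few hundred caption words). It can be loaded directly with Tokenizer.from_file from the tokenizers package listed in requirements.txt; a minimal sketch:

# Load the byte-level BPE tokenizer shipped with this commit and encode a caption.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("text2punks/data/byte-level-bpe_4k.tokenizer.json")

encoding = tokenizer.encode("A punky-looking Ape that has 2 Attributes, a Beanie, and a Medical Mask.")
print(encoding.ids)      # token ids drawn from the vocab above
print(encoding.tokens)   # byte-level pieces such as 'Ġape', 'Ġbeanie'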
text2punks/data/codebook.pt
ADDED
Binary file (1.39 kB)
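codebook.pt is not human-readable here; judging from how generate_images in text2punks/text2punk.py consumes its decoder argument (one row gathered per predicted image token, then reshaped to a 24 x 24 x 3 image), it is presumably a per-token RGB lookup table. A sketch under that assumption only:

# Hedged sketch: mapping predicted image tokens back to pixels via a codebook.
# Assumption: codebook.pt stores a (num_image_tokens, 3) tensor of RGB values.
import torch
from einops import rearrange, repeat

codebook = torch.load("text2punks/data/codebook.pt")            # assumed shape (num_image_tokens, 3)

img_seq = torch.randint(0, codebook.shape[0], (1, 24 * 24))     # one sampled 576-token image
img_seq = repeat(img_seq, 'b p -> b p c', c=3)
decoder = repeat(codebook, 'p c -> b p c', b=1)

pixels = torch.gather(decoder, 1, img_seq)                      # look up each token's colour
image = rearrange(pixels, 'b (h w) c -> b h w c', h=24, w=24)   # (1, 24, 24, 3) RGB image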
text2punks/loader.py
ADDED
@@ -0,0 +1,96 @@
import numpy as np
from PIL import Image, UnidentifiedImageError

from pathlib import Path
from random import randint, choice

import torch
from torch.utils.data import Dataset


class TextImageDataset(Dataset):
    def __init__(self,
                 folder,
                 text_len=40,
                 truncate_captions=False,
                 text_tokenizer=None,
                 image_tokenizer=None,
                 shuffle=False
                 ):
        """
        @param folder: Folder containing images and text files matched by their paths' respective "stem"
        @param truncate_captions: Rather than throw an exception, captions which are too long will be truncated.
        """
        super().__init__()
        self.shuffle = shuffle
        path = Path(folder)

        text_files = [*path.glob('**/*.txt')]
        image_files = [
            *path.glob('**/*.png'), *path.glob('**/*.jpg'),
            *path.glob('**/*.jpeg'), *path.glob('**/*.bmp')
        ]

        text_files = {text_file.stem: text_file for text_file in text_files}
        image_files = {image_file.stem: image_file for image_file in image_files}

        keys = (image_files.keys() & text_files.keys())

        self.keys = list(keys)
        self.text_files = {k: v for k, v in text_files.items() if k in keys}
        self.image_files = {k: v for k, v in image_files.items() if k in keys}
        self.text_len = text_len
        self.truncate_captions = truncate_captions
        self.text_tokenizer = text_tokenizer
        self.image_tokenizer = image_tokenizer


    def __len__(self):
        return len(self.keys)

    def random_sample(self):
        return self.__getitem__(randint(0, self.__len__() - 1))

    def sequential_sample(self, ind):
        if ind >= self.__len__() - 1:
            return self.__getitem__(0)
        return self.__getitem__(ind + 1)

    def skip_sample(self, ind):
        if self.shuffle:
            return self.random_sample()
        return self.sequential_sample(ind=ind)

    def __getitem__(self, ind):
        key = self.keys[ind]

        text_file = self.text_files[key]
        image_file = self.image_files[key]

        descriptions = text_file.read_text().split('\n')
        descriptions = list(filter(lambda t: len(t) > 0, descriptions))
        try:
            description = choice(descriptions)
        except IndexError as zero_captions_in_file_ex:
            print(f"An exception occurred trying to load file {text_file}.")
            print(f"Skipping index {ind}")
            return self.skip_sample(ind)

        tokenized_text = self.text_tokenizer.tokenize(
            description,
            self.text_len,
            truncate_text=self.truncate_captions
        ).squeeze(0)
        try:
            image = Image.open(image_file).convert('RGB')
            pixels = np.array(image).reshape(-1, 3)

            tokenized_image = [self.image_tokenizer[str(idx)] for idx in pixels]
            tokenized_image = torch.tensor(tokenized_image)
        except (UnidentifiedImageError, OSError) as corrupt_image_exceptions:
            print(f"An exception occurred trying to load file {image_file}.")
            print(f"Skipping index {ind}")
            return self.skip_sample(ind)

        # Success
        return tokenized_text, tokenized_image
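A sketch of plugging TextImageDataset into a standard DataLoader. The two toy tokenizers below only mirror the interfaces __getitem__ calls (tokenize(text, text_len, truncate_text=...) and str(pixel) -> token id); the real project presumably supplies its own, and ./dataset is a placeholder folder of matching .txt/.png pairs:

# Hedged sketch: TextImageDataset + DataLoader with stand-in tokenizers.
import torch
from torch.utils.data import DataLoader
from text2punks.loader import TextImageDataset

class ToyTextTokenizer:
    # Mirrors the interface the dataset calls: tokenize(text, text_len, truncate_text=...)
    def tokenize(self, text, text_len, truncate_text=False):
        ids = [ord(c) % 256 for c in text][:text_len]
        ids += [0] * (text_len - len(ids))
        return torch.tensor(ids).unsqueeze(0)   # (1, text_len); the dataset squeezes dim 0

class ToyImageTokenizer(dict):
    # Assigns a fresh id per unseen str(pixel), like the codebook lookup the dataset expects.
    def __missing__(self, key):
        self[key] = len(self)
        return self[key]

dataset = TextImageDataset(
    folder='./dataset',              # placeholder: *.txt captions and *.png punks sharing a stem
    text_len=40,
    truncate_captions=True,
    text_tokenizer=ToyTextTokenizer(),
    image_tokenizer=ToyImageTokenizer(),
    shuffle=True,
)

loader = DataLoader(dataset, batch_size=4, shuffle=True)
text_tokens, image_tokens = next(iter(loader))   # (4, 40) captions, (4, H*W) pixel tokens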
text2punks/text2punk.py
ADDED
@@ -0,0 +1,377 @@
1 |
+
import math
|
2 |
+
|
3 |
+
from einops import rearrange, repeat
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from torch import nn, einsum
|
7 |
+
import torch.nn.functional as F
|
8 |
+
|
9 |
+
from axial_positional_embedding import AxialPositionalEmbedding
|
10 |
+
from text2punks.transformer import Transformer
|
11 |
+
|
12 |
+
|
13 |
+
# helpers fns
|
14 |
+
|
15 |
+
def exists(val):
|
16 |
+
return val is not None
|
17 |
+
|
18 |
+
def default(val, d):
|
19 |
+
return val if exists(val) else d
|
20 |
+
|
21 |
+
def set_requires_grad(model, value):
|
22 |
+
for param in model.parameters():
|
23 |
+
param.requires_grad = value
|
24 |
+
|
25 |
+
def eval_decorator(fn):
|
26 |
+
def inner(model, *args, **kwargs):
|
27 |
+
was_training = model.training
|
28 |
+
model.eval()
|
29 |
+
out = fn(model, *args, **kwargs)
|
30 |
+
model.train(was_training)
|
31 |
+
return out
|
32 |
+
return inner
|
33 |
+
|
34 |
+
# sampling helpers fn
|
35 |
+
|
36 |
+
def top_k(logits, thres = 0.5):
|
37 |
+
num_logits = logits.shape[-1]
|
38 |
+
k = max(int((1 - thres) * num_logits), 1)
|
39 |
+
val, ind = torch.topk(logits, k)
|
40 |
+
probs = torch.full_like(logits, float('-inf'))
|
41 |
+
probs.scatter_(1, ind, val)
|
42 |
+
return probs
|
43 |
+
|
44 |
+
# main CLIP class
|
45 |
+
|
46 |
+
class CLIP(nn.Module):
|
47 |
+
def __init__(
|
48 |
+
self,
|
49 |
+
*,
|
50 |
+
dim_text = 512,
|
51 |
+
dim_image = 512,
|
52 |
+
dim_latent = 512,
|
53 |
+
num_text_tokens = 10000,
|
54 |
+
text_enc_depth = 6,
|
55 |
+
text_seq_len = 256,
|
56 |
+
text_heads = 8,
|
57 |
+
num_visual_tokens = 256,
|
58 |
+
visual_enc_depth = 6,
|
59 |
+
visual_image_seq_len = 256,
|
60 |
+
visual_image_size = 24,
|
61 |
+
visual_heads = 8,
|
62 |
+
attn_pdrop = 0.1,
|
63 |
+
resid_pdrop = 0.1,
|
64 |
+
embd_pdrop = 0.1,
|
65 |
+
ff_dropout = 0.1,
|
66 |
+
attn_types = None
|
67 |
+
):
|
68 |
+
super().__init__()
|
69 |
+
|
70 |
+
# Texts
|
71 |
+
|
72 |
+
self.text_emb = nn.Embedding(num_text_tokens, dim_text)
|
73 |
+
self.text_pos_emb = nn.Embedding(text_seq_len, dim_text)
|
74 |
+
|
75 |
+
self.text_transformer = Transformer(
|
76 |
+
dim = dim_text,
|
77 |
+
causal = False,
|
78 |
+
seq_len = text_seq_len,
|
79 |
+
depth = text_enc_depth,
|
80 |
+
heads = text_heads,
|
81 |
+
dim_head = dim_text // text_heads,
|
82 |
+
attn_dropout = attn_pdrop,
|
83 |
+
resid_dropout = resid_pdrop,
|
84 |
+
embd_dropout = embd_pdrop,
|
85 |
+
ff_dropout = ff_dropout,
|
86 |
+
attn_types = attn_types
|
87 |
+
)
|
88 |
+
|
89 |
+
self.text_ln = nn.LayerNorm(dim_text)
|
90 |
+
self.to_text_latent = nn.Linear(dim_text, dim_latent, bias = False)
|
91 |
+
|
92 |
+
# Images
|
93 |
+
|
94 |
+
self.image_emb = nn.Embedding(num_visual_tokens, dim_image)
|
95 |
+
self.image_pos_emb = nn.Embedding(visual_image_seq_len, dim_image)
|
96 |
+
|
97 |
+
self.visual_transformer = Transformer(
|
98 |
+
dim = dim_image,
|
99 |
+
causal = False,
|
100 |
+
seq_len = visual_image_seq_len,
|
101 |
+
depth = visual_enc_depth,
|
102 |
+
heads = visual_heads,
|
103 |
+
dim_head = dim_image // visual_heads,
|
104 |
+
attn_dropout = attn_pdrop,
|
105 |
+
resid_dropout = resid_pdrop,
|
106 |
+
embd_dropout = embd_pdrop,
|
107 |
+
ff_dropout = ff_dropout,
|
108 |
+
attn_types = attn_types,
|
109 |
+
image_size = visual_image_size,
|
110 |
+
)
|
111 |
+
|
112 |
+
self.image_ln = nn.LayerNorm(dim_image)
|
113 |
+
self.to_visual_latent = nn.Linear(dim_image, dim_latent, bias = False)
|
114 |
+
|
115 |
+
self.temperature = nn.Parameter(torch.ones([]) * math.log(1 / 0.07))
|
116 |
+
|
117 |
+
|
118 |
+
self.apply(self._init_weights)
|
119 |
+
|
120 |
+
def _init_weights(self, module):
|
121 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
122 |
+
module.weight.data.normal_(mean=0.0, std=0.02)
|
123 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
124 |
+
module.bias.data.zero_()
|
125 |
+
elif isinstance(module, nn.LayerNorm):
|
126 |
+
module.bias.data.zero_()
|
127 |
+
module.weight.data.fill_(1.0)
|
128 |
+
|
129 |
+
def forward(
|
130 |
+
self,
|
131 |
+
text,
|
132 |
+
image,
|
133 |
+
return_loss = False
|
134 |
+
):
|
135 |
+
b, device= text.shape[0], text.device
|
136 |
+
|
137 |
+
text_emb = self.text_emb(text)
|
138 |
+
text_emb += self.text_pos_emb(torch.arange(text.shape[1], device = device))
|
139 |
+
|
140 |
+
image_emb = self.image_emb(image)
|
141 |
+
image_emb += self.image_pos_emb(torch.arange(image.shape[1], device = device))
|
142 |
+
|
143 |
+
enc_text = self.text_transformer(text_emb)
|
144 |
+
enc_image = self.visual_transformer(image_emb)
|
145 |
+
|
146 |
+
text_latents = enc_text.mean(dim = 1)
|
147 |
+
image_latents = enc_image.mean(dim = 1)
|
148 |
+
|
149 |
+
text_latents = self.text_ln(text_latents)
|
150 |
+
image_latents = self.image_ln(image_latents)
|
151 |
+
|
152 |
+
text_latents = self.to_text_latent(text_latents)
|
153 |
+
image_latents = self.to_visual_latent(image_latents)
|
154 |
+
|
155 |
+
text_latents, image_latents = map(lambda t: F.normalize(t, p = 2, dim = -1), (text_latents, image_latents))
|
156 |
+
|
157 |
+
temp = self.temperature.exp()
|
158 |
+
|
159 |
+
if not return_loss:
|
160 |
+
sim = einsum('n d, n d -> n', text_latents, image_latents) * temp
|
161 |
+
return sim
|
162 |
+
|
163 |
+
sim = einsum('i d, j d -> i j', text_latents, image_latents) * temp
|
164 |
+
labels = torch.arange(b, device = device)
|
165 |
+
loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
|
166 |
+
return loss
|
167 |
+
|
168 |
+
# main Text2Punks class
|
169 |
+
|
170 |
+
class Text2Punks(nn.Module):
|
171 |
+
def __init__(
|
172 |
+
self,
|
173 |
+
*,
|
174 |
+
n_embd,
|
175 |
+
n_layer = 12,
|
176 |
+
n_head = 12,
|
177 |
+
d_head = 64,
|
178 |
+
num_text_tokens = 10000,
|
179 |
+
text_seq_len = 256,
|
180 |
+
num_image_tokens = 222,
|
181 |
+
image_seq_len = 576,
|
182 |
+
image_size = 24,
|
183 |
+
attn_pdrop = 0.1,
|
184 |
+
resid_pdrop = 0.1,
|
185 |
+
embd_pdrop = 0.1,
|
186 |
+
ff_dropout = 0.1,
|
187 |
+
attn_types = None,
|
188 |
+
loss_img_weight = 7,
|
189 |
+
loss_txt_weight = 7,
|
190 |
+
):
|
191 |
+
super().__init__()
|
192 |
+
|
193 |
+
num_text_tokens = num_text_tokens + text_seq_len # reserve unique padding tokens for each position (text seq len)
|
194 |
+
|
195 |
+
self.text_emb = nn.Embedding(num_text_tokens, n_embd)
|
196 |
+
self.image_emb = nn.Embedding(num_image_tokens, n_embd)
|
197 |
+
|
198 |
+
self.text_pos_emb = nn.Embedding(text_seq_len + 1, n_embd) # +1 for <bos> a.k.a <sos>
|
199 |
+
# self.image_pos_emb = nn.Embedding(image_seq_len, n_embd)
|
200 |
+
self.image_pos_emb = nn.Parameter(torch.zeros(1, image_seq_len, n_embd))
|
201 |
+
# self.image_pos_emb = AxialPositionalEmbedding(n_embd, axial_shape=(image_size, image_size))
|
202 |
+
|
203 |
+
self.num_text_tokens = num_text_tokens # for offsetting logits index and calculating cross entropy loss
|
204 |
+
self.num_image_tokens = num_image_tokens
|
205 |
+
self.text_seq_len = text_seq_len
|
206 |
+
self.image_seq_len = image_seq_len
|
207 |
+
|
208 |
+
seq_len = text_seq_len + image_seq_len
|
209 |
+
total_tokens = num_text_tokens + num_image_tokens
|
210 |
+
self.total_seq_len = seq_len
|
211 |
+
self.total_tokens = total_tokens
|
212 |
+
|
213 |
+
self.transformer = Transformer(
|
214 |
+
dim = n_embd,
|
215 |
+
causal = True,
|
216 |
+
seq_len = seq_len,
|
217 |
+
depth = n_layer,
|
218 |
+
heads = n_head,
|
219 |
+
dim_head = d_head,
|
220 |
+
attn_dropout = attn_pdrop,
|
221 |
+
resid_dropout = resid_pdrop,
|
222 |
+
embd_dropout = embd_pdrop,
|
223 |
+
ff_dropout = ff_dropout,
|
224 |
+
attn_types = attn_types,
|
225 |
+
image_size = image_size,
|
226 |
+
)
|
227 |
+
|
228 |
+
self.to_logits = nn.Sequential(
|
229 |
+
nn.LayerNorm(n_embd),
|
230 |
+
nn.Linear(n_embd, self.total_tokens),
|
231 |
+
)
|
232 |
+
|
233 |
+
seq_range = torch.arange(seq_len)
|
234 |
+
logits_range = torch.arange(total_tokens)
|
235 |
+
|
236 |
+
seq_range = rearrange(seq_range, 'n -> () n ()')
|
237 |
+
logits_range = rearrange(logits_range, 'd -> () () d')
|
238 |
+
|
239 |
+
logits_mask = (
|
240 |
+
((seq_range >= text_seq_len) & (logits_range < num_text_tokens)) |
|
241 |
+
((seq_range < text_seq_len) & (logits_range >= num_text_tokens))
|
242 |
+
)
|
243 |
+
|
244 |
+
self.register_buffer('logits_mask', logits_mask, persistent=False)
|
245 |
+
self.loss_img_weight = loss_img_weight
|
246 |
+
self.loss_txt_weight = loss_txt_weight
|
247 |
+
|
248 |
+
self.apply(self._init_weights)
|
249 |
+
|
250 |
+
def _init_weights(self, module):
|
251 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
252 |
+
module.weight.data.normal_(mean=0.0, std=0.02)
|
253 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
254 |
+
module.bias.data.zero_()
|
255 |
+
elif isinstance(module, nn.LayerNorm):
|
256 |
+
module.bias.data.zero_()
|
257 |
+
module.weight.data.fill_(1.0)
|
258 |
+
|
259 |
+
@torch.no_grad()
|
260 |
+
@eval_decorator
|
261 |
+
def generate_images(
|
262 |
+
self,
|
263 |
+
text,
|
264 |
+
decoder,
|
265 |
+
*,
|
266 |
+
clip = None,
|
267 |
+
filter_thres = 0.5,
|
268 |
+
temperature = 1.,
|
269 |
+
img = None,
|
270 |
+
num_init_img_tokens = None
|
271 |
+
):
|
272 |
+
text_seq_len, image_seq_len, num_text_tokens = self.text_seq_len, self.image_seq_len, self.num_text_tokens
|
273 |
+
total_len = text_seq_len + image_seq_len
|
274 |
+
|
275 |
+
batch = text.shape[0]
|
276 |
+
text = text[:, :text_seq_len] # make sure text is within bounds
|
277 |
+
out = text
|
278 |
+
|
279 |
+
if exists(img):
|
280 |
+
assert img.shape[1] == image_seq_len, f'input image must have the correct image size {image_seq_len}'
|
281 |
+
|
282 |
+
num_img_tokens = default(num_init_img_tokens, int(0.4375 * image_seq_len)) # OpenAI used 14 * 32 initial tokens to prime
|
283 |
+
assert num_img_tokens < image_seq_len, 'number of initial image tokens for priming must be less than the total image token sequence length'
|
284 |
+
|
285 |
+
trunc_img = img[:, :num_img_tokens]
|
286 |
+
out = torch.cat((out, trunc_img), dim = -1)
|
287 |
+
|
288 |
+
for cur_len in range(out.shape[1], total_len):
|
289 |
+
is_image = cur_len >= text_seq_len
|
290 |
+
|
291 |
+
text, image = out[:, :text_seq_len], out[:, text_seq_len:]
|
292 |
+
|
293 |
+
logits = self(text, image)[:, -1, :]
|
294 |
+
|
295 |
+
filtered_logits = top_k(logits, thres = filter_thres)
|
296 |
+
probs = F.softmax(filtered_logits / temperature, dim = -1)
|
297 |
+
sample = torch.multinomial(probs, 1)
|
298 |
+
|
299 |
+
sample -= (num_text_tokens if is_image else 0) # offset sampled token if it is an image token, since logit space is composed of text and then image tokens
|
300 |
+
out = torch.cat((out, sample), dim=-1)
|
301 |
+
|
302 |
+
text_seq = out[:, :text_seq_len]
|
303 |
+
img_seq = out[:, -image_seq_len:]
|
304 |
+
|
305 |
+
scores = None
|
306 |
+
if exists(clip):
|
307 |
+
scores = clip(text_seq, img_seq, return_loss = False)
|
308 |
+
|
309 |
+
img_seq = repeat(img_seq, 'b p -> b p c', c=3)
|
310 |
+
decoder = repeat(decoder, 'p c -> b p c', b=batch)
|
311 |
+
images = torch.gather(decoder, 1, img_seq)
|
312 |
+
images = rearrange(images, 'b (h w) c-> b c h w', h=24, w =24)
|
313 |
+
images = images.float()
|
314 |
+
|
315 |
+
return images, scores
|
316 |
+
|
317 |
+
def forward(
|
318 |
+
self,
|
319 |
+
text,
|
320 |
+
image = None,
|
321 |
+
return_loss = False
|
322 |
+
):
|
323 |
+
assert text.shape[-1] == self.text_seq_len, f'the length {text.shape[-1]} of the text tokens you passed in does not have the correct length ({self.text_seq_len})'
|
324 |
+
device, total_seq_len = text.device, self.total_seq_len
|
325 |
+
|
326 |
+
text_range = torch.arange(self.text_seq_len, device = device) + (self.num_text_tokens - self.text_seq_len)
|
327 |
+
text = torch.where(text == 0, text_range, text)
|
328 |
+
|
329 |
+
text = F.pad(text, (1, 0), value = 0) # add <bos>
|
330 |
+
|
331 |
+
tokens = self.text_emb(text)
|
332 |
+
tokens += self.text_pos_emb(torch.arange(text.shape[1], device = device))
|
333 |
+
|
334 |
+
seq_len = tokens.shape[1]
|
335 |
+
|
336 |
+
image_len = image.shape[1]
|
337 |
+
image_emb = self.image_emb(image)
|
338 |
+
# image_emb += self.image_pos_emb(torch.arange(image_len, device = device))
|
339 |
+
image_emb += self.image_pos_emb[:, :image_len, :]
|
340 |
+
|
341 |
+
# image_emb += self.image_pos_emb(image_emb)
|
342 |
+
|
343 |
+
tokens = torch.cat((tokens, image_emb), dim = 1)
|
344 |
+
|
345 |
+
seq_len += image_len
|
346 |
+
|
347 |
+
# when training, if the length exceeds the total text + image length
|
348 |
+
# remove the last token, since it needs not to be trained
|
349 |
+
|
350 |
+
if tokens.shape[1] > total_seq_len:
|
351 |
+
seq_len -= 1
|
352 |
+
tokens = tokens[:, :-1]
|
353 |
+
|
354 |
+
out = self.transformer(tokens)
|
355 |
+
logits = self.to_logits(out)
|
356 |
+
|
357 |
+
# mask logits to make sure text predicts text (except last token), and image predicts image
|
358 |
+
|
359 |
+
logits_mask = self.logits_mask[:, :seq_len]
|
360 |
+
max_neg_value = -torch.finfo(logits.dtype).max
|
361 |
+
logits.masked_fill_(logits_mask, max_neg_value)
|
362 |
+
|
363 |
+
if not return_loss:
|
364 |
+
return logits
|
365 |
+
|
366 |
+
assert exists(image), 'when training, image must be supplied'
|
367 |
+
|
368 |
+
offsetted_image = image + self.num_text_tokens
|
369 |
+
labels = torch.cat((text[:, 1:], offsetted_image), dim = 1)
|
370 |
+
|
371 |
+
logits = rearrange(logits, 'b n c -> b c n')
|
372 |
+
|
373 |
+
loss_text = F.cross_entropy(logits[:, :, :self.text_seq_len], labels[:, :self.text_seq_len])
|
374 |
+
loss_img = F.cross_entropy(logits[:, :, self.text_seq_len:], labels[:, self.text_seq_len:])
|
375 |
+
|
376 |
+
loss = (self.loss_txt_weight * loss_text + self.loss_img_weight * loss_img) / (self.loss_img_weight + self.loss_txt_weight)
|
377 |
+
return loss, loss_text, loss_img
|
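The sampling loop in generate_images filters each step's logits with the top_k helper defined at the top of this file, then draws one token from a temperature-scaled softmax. The same step in isolation (the vocabulary size, threshold, and temperature below are the defaults used elsewhere in this commit):

# Standalone illustration of the filter-then-sample step used in generate_images:
# keep the top (1 - thres) fraction of logits, set the rest to -inf, then sample
# from softmax(filtered_logits / temperature).
import torch
import torch.nn.functional as F

def top_k(logits, thres=0.5):
    num_logits = logits.shape[-1]
    k = max(int((1 - thres) * num_logits), 1)
    val, ind = torch.topk(logits, k)
    probs = torch.full_like(logits, float('-inf'))
    probs.scatter_(1, ind, val)
    return probs

logits = torch.randn(2, 222)                   # (batch, num_image_tokens)
filtered = top_k(logits, thres=0.8)            # only the top 20% of tokens survive
probs = F.softmax(filtered / 1.25, dim=-1)     # temperature = 1.25, as in app.py
sample = torch.multinomial(probs, 1)           # next image token per batch element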
text2punks/tokenizer.py
ADDED
@@ -0,0 +1,233 @@
import os
import html
import ftfy
import regex as re
from pathlib import Path

import torch

from functools import lru_cache

import youtokentome as yttm
from tokenizers import Tokenizer
from tokenizers.processors import ByteLevel


# OpenAI simple tokenizer

@lru_cache()
def default_bpe(bpe_path = "data/bpe_simple_vocab_16e6.txt"):
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), bpe_path)

@lru_cache()
def bytes_to_unicode():
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

def get_pairs(word):
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()

def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):
    def __init__(self, bpe_path = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = Path(bpe_path).read_text(encoding='utf8').split('\n')
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])

        self.vocab_size = 49408

        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens, remove_start_end = True):
        if torch.is_tensor(tokens):
            tokens = tokens.tolist()

        if remove_start_end:
            # 49406 / 49407 are the <|startoftext|> / <|endoftext|> ids, 0 is padding
            tokens = [token for token in tokens if token not in (49406, 49407, 0)]
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text

    def tokenize(self, texts, context_length = 256, truncate_text = False):
        if isinstance(texts, str):
            texts = [texts]

        all_tokens = [self.encode(text) for text in texts]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                if truncate_text:
                    tokens = tokens[:context_length]
                else:
                    raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            result[i, :len(tokens)] = torch.tensor(tokens)

        return result

# txt_tokenizer = SimpleTokenizer()

# huggingface tokenizer

class HugTokenizer:
    def __init__(self, bpe_path):
        bpe_path = Path(default_bpe(bpe_path = bpe_path))
        assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'
        tokenizer = Tokenizer.from_file(str(bpe_path))
        tokenizer.post_processor = ByteLevel(trim_offsets = True)
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.get_vocab_size()

    def decode(self, tokens):
        if torch.is_tensor(tokens):
            tokens = tokens.tolist()

        tokens = [token for token in tokens if token not in (0,)]
        return self.tokenizer.decode(tokens, skip_special_tokens = True)

    def encode(self, text):
        return self.tokenizer.encode(text).ids

    def tokenize(self, texts, context_length = 256, truncate_text = False):
        if isinstance(texts, str):
            texts = [texts]

        all_tokens = [self.encode(text) for text in texts]

        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                if truncate_text:
                    tokens = tokens[:context_length]
                else:
                    raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            result[i, :len(tokens)] = torch.tensor(tokens)

        return result

txt_tokenizer = HugTokenizer(bpe_path = "data/byte-level-bpe_4k.tokenizer.json")

# yttm tokenizer

class YttmTokenizer:
    def __init__(self, bpe_path = None):
        bpe_path = Path(default_bpe(bpe_path = bpe_path))
        assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'

        tokenizer = yttm.BPE(model = str(bpe_path))
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size()

    def decode(self, tokens):
        if torch.is_tensor(tokens):
            tokens = tokens.tolist()

        return self.tokenizer.decode(tokens, ignore_ids = [0])

    def encode(self, texts):
        encoded = self.tokenizer.encode(texts, output_type = yttm.OutputType.ID)
        return list(map(torch.tensor, encoded))

    def tokenize(self, texts, context_length = 256, truncate_text = False):
        if isinstance(texts, str):
            texts = [texts]

        all_tokens = self.encode(texts)

        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                if truncate_text:
                    tokens = tokens[:context_length]
                else:
                    raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            result[i, :len(tokens)] = tokens.detach().clone()

        return result

# txt_tokenizer = YttmTokenizer(bpe_path = "data/byte-level-bpe.tokenizer.json")
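For reference, a small usage sketch of the module-level txt_tokenizer defined above; the prompt string and the context_length value are arbitrary examples, not values taken from the training setup:

from text2punks.tokenizer import txt_tokenizer

prompt = 'A punky-looking Alien that has a Pipe.'

# (1, 32) LongTensor, zero-padded on the right; raises if the prompt exceeds context_length
token_ids = txt_tokenizer.tokenize(prompt, context_length = 32, truncate_text = True)

# padding id 0 is filtered out before decoding back to text
round_trip = txt_tokenizer.decode(token_ids[0])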
text2punks/transformer.py
ADDED
@@ -0,0 +1,115 @@
from functools import partial
from itertools import islice, cycle

from torch import nn

from text2punks.attention import Attention, SparseAxialCausalAttention

# helpers

def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

def cast_tuple(val, depth = 1):
    if isinstance(val, list):
        val = tuple(val)
    return val if isinstance(val, tuple) else (val,) * depth

# classes

class SequentialSequence(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers

    def forward(self, x):
        for (f, g) in list(self.layers):
            x = x + f(x)
            x = x + g(x)
        return x

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim * 4, dim)
        )
        # note: some implementations instead place nn.Dropout(resid_pdrop) after the output nn.Linear(4 * n_embd, n_embd)

    def forward(self, x):
        return self.net(x)


class Transformer(nn.Module):
    def __init__(
        self,
        *,
        dim,
        depth,
        seq_len,
        causal = True,
        heads = 8,
        dim_head = 64,
        attn_dropout = 0.,
        resid_dropout = 0.,
        embd_dropout = 0.,
        ff_dropout = 0.,
        image_size = 24,
        attn_types = None,
    ):
        super().__init__()
        layers = nn.ModuleList([])

        attn_types = default(attn_types, ('full',))
        attn_types = cast_tuple(attn_types)
        attn_type_layer = islice(cycle(attn_types), depth)

        for attn_type in attn_type_layer:
            if attn_type == 'full':
                attn_class = partial(Attention, causal = causal)
            elif attn_type == 'axial_row':
                attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 0, image_size = image_size)
            elif attn_type == 'axial_col':
                attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 1, image_size = image_size)
            else:
                raise ValueError(f'attention type "{attn_type}" is not valid')

            attn = attn_class(dim, seq_len = seq_len, heads = heads, dim_head = dim_head, attn_dropout = attn_dropout, resid_dropout = resid_dropout)

            layers.append(nn.ModuleList([
                PreNorm(dim, attn),
                PreNorm(dim, FeedForward(dim, dropout = ff_dropout))
            ]))

        # full attention in the last layer

        attn_class = partial(Attention, causal = causal)
        attn = attn_class(dim, seq_len = seq_len, heads = heads, dim_head = dim_head, attn_dropout = attn_dropout, resid_dropout = resid_dropout)

        layers.append(nn.ModuleList([
            PreNorm(dim, attn),
            PreNorm(dim, FeedForward(dim, dropout = ff_dropout))
        ]))

        self.layers = SequentialSequence(layers)
        self.embd_drop = nn.Dropout(embd_dropout)

    def forward(self, x):
        x = self.embd_drop(x)
        return self.layers(x)
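A minimal sketch of how this Transformer could be instantiated with alternating axial and full attention; every hyperparameter value below is illustrative, not the trained configuration:

from text2punks.transformer import Transformer

model = Transformer(
    dim = 256,
    depth = 6,                  # the attn_types tuple is cycled over these 6 layers,
                                # and one extra full-attention layer is appended at the end
    seq_len = 32 + 24 * 24,     # text positions + 24x24 image positions
    heads = 8,
    dim_head = 32,
    image_size = 24,
    attn_types = ('axial_row', 'axial_col', 'full'),
)

# the forward pass expects already-embedded tokens of shape (batch, seq_len, dim)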
text2punks/utils.py
ADDED
@@ -0,0 +1,82 @@
# os

from pathlib import Path

# torch

import torch
import torchvision.transforms.functional as F
from einops import repeat

# Text2Punks and Tokenizer

from text2punks.text2punk import Text2Punks, CLIP
from text2punks.tokenizer import txt_tokenizer

# select device

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# load decoder

codebook = torch.load('./text2punks/data/codebook.pt')

# helper fns

def exists(val):
    return val is not None


def to_pil_image(image_tensor):
    return F.to_pil_image(image_tensor)


def model_loader(text2punk_path, clip_path):
    # load pre-trained TEXT2PUNKS model

    text2punk_path = Path(text2punk_path)
    assert text2punk_path.exists(), 'trained Text2Punks must exist'

    load_obj = torch.load(str(text2punk_path), map_location=torch.device(device))
    text2punks_params, weights = load_obj.pop('hparams'), load_obj.pop('weights')

    text2punk = Text2Punks(**text2punks_params).to(device)
    text2punk.load_state_dict(weights)

    # load pre-trained CLIP model

    clip_path = Path(clip_path)
    assert clip_path.exists(), 'trained CLIP must exist'

    load_obj = torch.load(str(clip_path), map_location=torch.device(device))
    clip_params, weights = load_obj.pop('hparams'), load_obj.pop('weights')

    clip = CLIP(**clip_params).to(device)
    clip.load_state_dict(weights)

    return text2punk, clip


def generate_image(prompt_text, top_k, temperature, num_images, batch_size, top_prediction, text2punk_model, clip_model, codebook=codebook):
    text = txt_tokenizer.tokenize(prompt_text, text2punk_model.text_seq_len, truncate_text=True).to(device)

    text = repeat(text, '() n -> b n', b = num_images)

    img_outputs = []
    score_outputs = []

    for text_chunk in text.split(batch_size):
        images, scores = text2punk_model.generate_images(text_chunk, codebook.to(device), clip = clip_model, filter_thres = top_k, temperature = temperature)
        img_outputs.append(images)
        score_outputs.append(scores)

    img_outputs = torch.cat(img_outputs)
    score_outputs = torch.cat(score_outputs)

    similarity = score_outputs.softmax(dim=-1)
    values, indices = similarity.topk(top_prediction)

    img_outputs = img_outputs[indices]
    score_outputs = score_outputs[indices]

    return img_outputs, score_outputs
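The tail of generate_image re-ranks the generated batch with the CLIP scores before returning it; a standalone illustration of that selection step with made-up tensors:

import torch

scores = torch.tensor([0.2, 1.5, -0.3, 0.9])   # one CLIP score per generated image
images = torch.rand(4, 3, 24, 24)              # stand-in for 4 generated 24x24 RGB punks

top_prediction = 2
values, indices = scores.softmax(dim = -1).topk(top_prediction)

best_images = images[indices]                  # ordered best-first by CLIP similarity
best_scores = scores[indices]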