boris committed
Commit: 24d30c9
Parents: 6016fc0 74974be

Merge pull request #108 from borisdayma/feat-inf

app/app.py CHANGED
@@ -2,31 +2,10 @@
 # coding: utf-8
 
 from dalle_mini.backend import ServiceError, get_images_from_backend
-
 import streamlit as st
 
-# streamlit.session_state is not available in Huggingface spaces.
-# Session state hack https://huggingface.slack.com/archives/C025LJDP962/p1626527367443200?thread_ts=1626525999.440500&cid=C025LJDP962
-
-from streamlit.report_thread import get_report_ctx
-def query_cache(q_emb=None):
-    ctx = get_report_ctx()
-    session_id = ctx.session_id
-    session = st.server.server.Server.get_current()._get_session_info(session_id).session
-    if not hasattr(session, "_query_state"):
-        setattr(session, "_query_state", q_emb)
-    if q_emb:
-        session._query_state = q_emb
-    return session._query_state
-
-def set_run_again(state):
-    query_cache(state)
-
-def should_run_again():
-    state = query_cache()
-    return state if state is not None else False
-
-st.sidebar.markdown("""
+st.sidebar.markdown(
+    """
 <style>
 .aligncenter {
     text-align: center;
@@ -35,8 +14,11 @@ st.sidebar.markdown("""
 <p class="aligncenter">
 <img src="https://raw.githubusercontent.com/borisdayma/dalle-mini/main/img/logo.png"/>
 </p>
-""", unsafe_allow_html=True)
-st.sidebar.markdown("""
+""",
+    unsafe_allow_html=True,
+)
+st.sidebar.markdown(
+    """
 ___
 <p style='text-align: center'>
 DALL·E mini is an AI model that generates images from any prompt you give!
@@ -47,21 +29,20 @@ Created by Boris Dayma et al. 2021
 <br/>
 <a href="https://github.com/borisdayma/dalle-mini" target="_blank">GitHub</a> | <a href="https://wandb.ai/dalle-mini/dalle-mini/reports/DALL-E-mini--Vmlldzo4NjIxODA" target="_blank">Project Report</a>
 </p>
-""", unsafe_allow_html=True)
+""",
+    unsafe_allow_html=True,
+)
 
-st.header('DALL·E mini')
-st.subheader('Generate images from text')
+st.header("DALL·E mini")
+st.subheader("Generate images from text")
 
 prompt = st.text_input("What do you want to see?")
 
-test = st.empty()
 DEBUG = False
-if prompt != "" or (should_run_again and prompt != ""):
+if prompt != "":
     container = st.empty()
-    # The following mimics `streamlit.info()`.
-    # I tried to get the secondary background color using `components.streamlit.config.get_options_for_section("theme")["secondaryBackgroundColor"]`
-    # but it returns None.
-    container.markdown(f"""
+    container.markdown(
+        f"""
 <style> p {{ margin:0 }} div {{ margin:0 }} </style>
 <div data-stale="false" class="element-container css-1e5imcs e1tzin5v1">
 <div class="stAlert">
@@ -78,32 +59,39 @@ if prompt != "" or (should_run_again and prompt != ""):
 </div>
 </div>
 <small><i>Predictions may take up to 40s under high load. Please stand by.</i></small>
-""", unsafe_allow_html=True)
+""",
+        unsafe_allow_html=True,
+    )
 
     try:
         backend_url = st.secrets["BACKEND_SERVER"]
         print(f"Getting selections: {prompt}")
         selected = get_images_from_backend(prompt, backend_url)
 
-        cols = st.columns(4)
+        margin = 0.1  # for better position of zoom in arrow
+        n_columns = 3
+        cols = st.columns([1] + [margin, 1] * (n_columns - 1))
         for i, img in enumerate(selected):
-            cols[i%4].image(img)
-
+            cols[(i % n_columns) * 2].image(img)
         container.markdown(f"**{prompt}**")
-
-        set_run_again(st.button('Again!', key='again_button'))
-
+
+        st.button("Again!", key="again_button")
+
     except ServiceError as error:
         container.text(f"Service unavailable, status: {error.status_code}")
     except KeyError:
        if DEBUG:
-            container.markdown("""
+            container.markdown(
+                """
 **Error: BACKEND_SERVER unset**
 
 Please, create a file called `.streamlit/secrets.toml` inside the app's folder and include a line to configure the server URL:
 ```
 BACKEND_SERVER="<server url>"
 ```
-            """)
+                """
+            )
        else:
-            container.markdown('Error -5, please try again or [report it](mailto:pcuenca-dalle@guenever.net).')
+            container.markdown(
+                "Error -5, please try again or [report it](mailto:pcuenca-dalle@guenever.net)."
+            )
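
The new grid interleaves narrow spacer columns between the image columns so that Streamlit's zoom-in arrow on one image does not overlap the neighbouring image. A minimal, self-contained sketch of that layout trick in Python (placeholder images and widths are illustrative assumptions, not part of the commit):

```
import streamlit as st
from PIL import Image

# Sketch of the spacer-column layout used above: widths become
# [1, 0.1, 1, 0.1, 1] -> image, spacer, image, spacer, image.
margin = 0.1        # assumed relative width of a spacer column
n_columns = 3
cols = st.columns([1] + [margin, 1] * (n_columns - 1))

# Placeholder images; in the app these are the generations returned by the backend.
images = [Image.new("RGB", (256, 256), c) for c in ("red", "green", "blue", "gray")]
for i, img in enumerate(images):
    # Even indices are image columns, odd indices are the spacers.
    cols[(i % n_columns) * 2].image(img)
```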
dev/inference/README.md DELETED
@@ -1 +0,0 @@
- Scripts to generate predictions for assessment and reporting.
dev/inference/wandb-examples-from-backend.py DELETED
@@ -1,76 +0,0 @@
- #!/usr/bin/env python
- # coding: utf-8
- 
- from PIL import Image, ImageDraw, ImageFont
- import wandb
- import os
- 
- from dalle_mini.backend import ServiceError, get_images_from_backend
- from dalle_mini.helpers import captioned_strip
- 
- os.environ["WANDB_SILENT"] = "true"
- os.environ["WANDB_CONSOLE"] = "off"
- 
- def log_to_wandb(prompts):
-     try:
-         backend_url = os.environ["BACKEND_SERVER"]
-         for _ in range(1):
-             for prompt in prompts:
-                 print(f"Getting selections for: {prompt}")
-                 # make a separate run per prompt
-                 with wandb.init(
-                     entity='wandb',
-                     project='hf-flax-dalle-mini',
-                     job_type='predictions',# tags=['openai'],
-                     config={'prompt': prompt}
-                 ):
-                     imgs = []
-                     selected = get_images_from_backend(prompt, backend_url)
-                     strip = captioned_strip(selected, prompt)
-                     imgs.append(wandb.Image(strip))
-                     wandb.log({"images": imgs})
-     except ServiceError as error:
-         print(f"Service unavailable, status: {error.status_code}")
-     except KeyError:
-         print("Error: BACKEND_SERVER unset")
- 
- prompts = [
-     # "white snow covered mountain under blue sky during daytime",
-     # "aerial view of beach during daytime",
-     # "aerial view of beach at night",
-     # "a farmhouse surrounded by beautiful flowers",
-     # "an armchair in the shape of an avocado",
-     # "young woman riding her bike trough a forest",
-     # "a unicorn is passing by a rainbow in a field of flowers",
-     # "illustration of a baby shark swimming around corals",
-     # "painting of an oniric forest glade surrounded by tall trees",
-     # "sunset over green mountains",
-     # "a forest glade surrounded by tall trees in a sunny Spring morning",
-     # "fishing village under the moonlight in a serene sunset",
-     # "cartoon of a carrot with big eyes",
-     # "still life in the style of Kandinsky",
-     # "still life in the style of Picasso",
-     # "a graphite sketch of a gothic cathedral",
-     # "a graphite sketch of Elon Musk",
-     # "a watercolor pond with green leaves and yellow flowers",
-     # "a logo of a cute avocado armchair singing karaoke on stage in front of a crowd of strawberry shaped lamps",
-     # "happy celebration in a small village in Africa",
-     # "a logo of an armchair in the shape of an avocado"
-     # "Pele and Maradona in a hypothetical match",
-     # "Mohammed Ali and Mike Tyson in a hypothetical match",
-     # "a storefront that has the word 'openai' written on it",
-     # "a pentagonal green clock",
-     # "a collection of glasses is sitting on a table",
-     # "a small red block sitting on a large green block",
-     # "an extreme close-up view of a capybara sitting in a field",
-     # "a cross-section view of a walnut",
-     # "a professional high-quality emoji of a lovestruck cup of boba",
-     # "a photo of san francisco's golden gate bridge",
-     # "an illustration of a baby daikon radish in a tutu walking a dog",
-     # "a picture of the Eiffel tower on the Moon",
-     # "a colorful stairway to heaven",
-     "this is a detailed high-resolution scan of a human brain"
- ]
- 
- for _ in range(1):
-     log_to_wandb(prompts)
dev/inference/wandb-examples.py DELETED
@@ -1,163 +0,0 @@
- #!/usr/bin/env python
- # coding: utf-8
- 
- import random
- 
- import jax
- from flax.training.common_utils import shard
- from flax.jax_utils import replicate, unreplicate
- 
- from transformers.models.bart.modeling_flax_bart import *
- from transformers import BartTokenizer, FlaxBartForConditionalGeneration
- 
- import os
- 
- from PIL import Image
- import numpy as np
- import matplotlib.pyplot as plt
- 
- import torch
- import torchvision.transforms as T
- import torchvision.transforms.functional as TF
- from torchvision.transforms import InterpolationMode
- 
- from dalle_mini.model import CustomFlaxBartForConditionalGeneration
- from vqgan_jax.modeling_flax_vqgan import VQModel
- 
- # ## CLIP Scoring
- from transformers import CLIPProcessor, FlaxCLIPModel
- 
- import wandb
- import os
- 
- from dalle_mini.helpers import captioned_strip
- 
- 
- os.environ["WANDB_SILENT"] = "true"
- os.environ["WANDB_CONSOLE"] = "off"
- 
- # TODO: used for legacy support
- BASE_MODEL = 'facebook/bart-large-cnn'
- 
- # set id to None so our latest images don't get overwritten
- id = None
- run = wandb.init(id=id,
-                  entity='wandb',
-                  project="hf-flax-dalle-mini",
-                  job_type="predictions",
-                  resume="allow"
- )
- artifact = run.use_artifact('wandb/hf-flax-dalle-mini/model-4oh3u7ca:latest', type='bart_model')
- artifact_dir = artifact.download()
- 
- # create our model
- model = CustomFlaxBartForConditionalGeneration.from_pretrained(artifact_dir)
- 
- # TODO: legacy support (earlier models)
- tokenizer = BartTokenizer.from_pretrained(BASE_MODEL)
- model.config.force_bos_token_to_be_generated = False
- model.config.forced_bos_token_id = None
- model.config.forced_eos_token_id = None
- 
- vqgan = VQModel.from_pretrained("flax-community/vqgan_f16_16384")
- 
- def custom_to_pil(x):
-     x = np.clip(x, 0., 1.)
-     x = (255*x).astype(np.uint8)
-     x = Image.fromarray(x)
-     if not x.mode == "RGB":
-         x = x.convert("RGB")
-     return x
- 
- def generate(input, rng, params):
-     return model.generate(
-         **input,
-         max_length=257,
-         num_beams=1,
-         do_sample=True,
-         prng_key=rng,
-         eos_token_id=50000,
-         pad_token_id=50000,
-         params=params,
-     )
- 
- def get_images(indices, params):
-     return vqgan.decode_code(indices, params=params)
- 
- def plot_images(images):
-     fig = plt.figure(figsize=(40, 20))
-     columns = 4
-     rows = 2
-     plt.subplots_adjust(hspace=0, wspace=0)
- 
-     for i in range(1, columns*rows +1):
-         fig.add_subplot(rows, columns, i)
-         plt.imshow(images[i-1])
-         plt.gca().axes.get_yaxis().set_visible(False)
-     plt.show()
- 
- def stack_reconstructions(images):
-     w, h = images[0].size[0], images[0].size[1]
-     img = Image.new("RGB", (len(images)*w, h))
-     for i, img_ in enumerate(images):
-         img.paste(img_, (i*w,0))
-     return img
- 
- p_generate = jax.pmap(generate, "batch")
- p_get_images = jax.pmap(get_images, "batch")
- 
- bart_params = replicate(model.params)
- vqgan_params = replicate(vqgan.params)
- 
- clip = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- 
- def hallucinate(prompt, num_images=64):
-     prompt = [prompt] * jax.device_count()
-     inputs = tokenizer(prompt, return_tensors='jax', padding="max_length", truncation=True, max_length=128).data
-     inputs = shard(inputs)
- 
-     all_images = []
-     for i in range(num_images // jax.device_count()):
-         key = random.randint(0, 1e7)
-         rng = jax.random.PRNGKey(key)
-         rngs = jax.random.split(rng, jax.local_device_count())
-         indices = p_generate(inputs, rngs, bart_params).sequences
-         indices = indices[:, :, 1:]
- 
-         images = p_get_images(indices, vqgan_params)
-         images = np.squeeze(np.asarray(images), 1)
-         for image in images:
-             all_images.append(custom_to_pil(image))
-     return all_images
- 
- def clip_top_k(prompt, images, k=8):
-     inputs = processor(text=prompt, images=images, return_tensors="np", padding=True)
-     # FIXME: image should be resized and normalized prior to being processed by CLIP
-     outputs = clip(**inputs)
-     logits = outputs.logits_per_text
-     scores = np.array(logits[0]).argsort()[-k:][::-1]
-     return [images[score] for score in scores]
- 
- def log_to_wandb(prompts):
-     strips = []
-     for prompt in prompts:
-         print(f"Generating candidates for: {prompt}")
-         images = hallucinate(prompt, num_images=32)
-         selected = clip_top_k(prompt, images, k=8)
-         strip = captioned_strip(selected, prompt)
-         strips.append(wandb.Image(strip))
-     wandb.log({"images": strips})
- 
- prompts = prompts = [
-     "white snow covered mountain under blue sky during daytime",
-     "aerial view of beach during daytime",
-     "aerial view of beach at night",
-     "an armchair in the shape of an avocado",
-     "young woman riding her bike trough a forest",
-     "rice fields by the mediterranean coast",
-     "white houses on the hill of a greek coastline",
-     "illustration of a shark with a baby shark",
- ]
- 
- log_to_wandb(prompts)
{dev → tools}/inference/inference_pipeline.ipynb RENAMED
File without changes
dev/inference/wandb-backend.ipynb → tools/inference/log_inference_samples.ipynb RENAMED
@@ -24,25 +24,6 @@
     "from dalle_mini.text import TextNormalizer"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "23e00271-941c-4e1b-b6a9-107a1b77324d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "run_ids = ['3kaut6e8']\n",
-    "# Alamy - 3kaut6e8\n",
-    "# YFCC - to do\n",
-    "# HF spaces - 4oh3u7ca\n",
-    "ENTITY, PROJECT = 'wandb', 'hf-flax-dalle-mini'\n",
-    "VQGAN_REPO, VQGAN_COMMIT_ID = 'dalle-mini/vqgan_imagenet_f16_16384', None\n",
-    "normalize_text = False\n",
-    "latest_only = True # log only latest or all versions\n",
-    "suffix = '' # mainly for duplicate inference runs with a deleted version\n",
-    "add_clip_32 = False"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -50,13 +31,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "run_ids = ['2u5lk3uw']\n",
-    "# poorly shuffled 1nj161cl\n",
-    "# well shuffled he9rrc3q\n",
-    "# non normalized 1fwxpyfh ! requires changing normalize_text\n",
+    "run_ids = ['63otg87g']\n",
     "ENTITY, PROJECT = 'dalle-mini', 'dalle-mini' # used only for training run\n",
-    "VQGAN_REPO, VQGAN_COMMIT_ID = 'dalle-mini/vqgan_imagenet_f16_16384', None\n",
-    "normalize_text = True\n",
+    "VQGAN_REPO, VQGAN_COMMIT_ID = 'dalle-mini/vqgan_imagenet_f16_16384', 'e93a26e7707683d349bf5d5c41c5b0ef69b677a9'\n",
     "latest_only = True # log only latest or all versions\n",
     "suffix = '' # mainly for duplicate inference runs with a deleted version\n",
     "add_clip_32 = False"
@@ -85,7 +62,7 @@
     "batch_size = 8\n",
     "num_images = 128\n",
     "top_k = 8\n",
-    "text_normalizer = TextNormalizer() if normalize_text else None\n",
+    "text_normalizer = TextNormalizer()\n",
     "padding_item = 'NONE'\n",
     "seed = random.randint(0, 2**32-1)\n",
     "key = jax.random.PRNGKey(seed)\n",
@@ -100,11 +77,12 @@
    "outputs": [],
    "source": [
     "vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
-    "clip = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch16\")\n",
-    "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")\n",
-    "clip_params = replicate(clip.params)\n",
     "vqgan_params = replicate(vqgan.params)\n",
     "\n",
+    "clip16 = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch16\")\n",
+    "processor16 = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")\n",
+    "clip16_params = replicate(clip16.params)\n",
+    "\n",
     "if add_clip_32:\n",
     "    clip32 = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
     "    processor32 = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
@@ -123,8 +101,8 @@
     "    return vqgan.decode_code(indices, params=params)\n",
     "\n",
     "@partial(jax.pmap, axis_name=\"batch\")\n",
-    "def p_clip(inputs, params):\n",
-    "    logits = clip(params=params, **inputs).logits_per_image\n",
+    "def p_clip16(inputs, params):\n",
+    "    logits = clip16(params=params, **inputs).logits_per_image\n",
     "    return logits\n",
     "\n",
     "if add_clip_32:\n",
@@ -229,7 +207,7 @@
    "outputs": [],
    "source": [
     "run_id = run_ids[0]\n",
-    "# TODO: turn everything into a class"
+    "# TODO: loop over runs"
    ]
   },
   {
@@ -248,10 +226,8 @@
     "for artifact in artifact_versions:\n",
     "    print(f'Processing artifact: {artifact.name}')\n",
     "    version = int(artifact.version[1:])\n",
-    "    results = []\n",
-    "    if add_clip_32:\n",
-    "        results32 = []\n",
-    "    columns = ['Caption'] + [f'Image {i+1}' for i in range(top_k)] + [f'Score {i+1}' for i in range(top_k)]\n",
+    "    results16, results32 = [], []\n",
+    "    columns = ['Caption'] + [f'Image {i+1}' for i in range(top_k)]\n",
     "    \n",
     "    if latest_only:\n",
     "        assert last_inference_version is None or version > last_inference_version\n",
@@ -288,7 +264,7 @@
     "\n",
     "    # process one batch of captions\n",
     "    for batch in tqdm(samples):\n",
-    "        processed_prompts = [text_normalizer(x) for x in batch] if normalize_text else list(batch)\n",
+    "        processed_prompts = [text_normalizer(x) for x in batch] if model.config.normalize_text else list(batch)\n",
     "\n",
     "        # repeat the prompts to distribute over each device and tokenize\n",
     "        processed_prompts = processed_prompts * jax.device_count()\n",
@@ -297,7 +273,7 @@
     "\n",
     "        # generate images\n",
     "        images = []\n",
-    "        pbar = tqdm(range(num_images // jax.device_count()), desc='Generating Images', leave=None)\n",
+    "        pbar = tqdm(range(num_images // jax.device_count()), desc='Generating Images', leave=True)\n",
     "        for i in pbar:\n",
     "            key, subkey = jax.random.split(key)\n",
     "            encoded_images = p_generate(tokenized_prompt, shard_prng_key(subkey), model_params)\n",
@@ -307,34 +283,13 @@
     "            for img in decoded_images:\n",
     "                images.append(Image.fromarray(np.asarray(img * 255, dtype=np.uint8)))\n",
     "\n",
-    "        # get clip scores\n",
-    "        pbar.set_description('Calculating CLIP scores')\n",
-    "        clip_inputs = processor(text=batch, images=images, return_tensors='np', padding='max_length', max_length=77, truncation=True).data\n",
-    "        # each shard will have one prompt, images need to be reorganized to be associated to the correct shard\n",
-    "        images_per_prompt_indices = np.asarray(range(0, len(images), batch_size))\n",
-    "        clip_inputs['pixel_values'] = jnp.concatenate(list(clip_inputs['pixel_values'][images_per_prompt_indices + i] for i in range(batch_size)))\n",
-    "        clip_inputs = shard(clip_inputs)\n",
-    "        logits = p_clip(clip_inputs, clip_params)\n",
-    "        logits = logits.reshape(-1, num_images)\n",
-    "        top_scores = logits.argsort()[:, -top_k:][..., ::-1]\n",
-    "        logits = jax.device_get(logits)\n",
-    "        # add to results table\n",
-    "        for i, (idx, scores, sample) in enumerate(zip(top_scores, logits, batch)):\n",
-    "            if sample == padding_item: continue\n",
-    "            cur_images = [images[x] for x in images_per_prompt_indices + i]\n",
-    "            top_images = [wandb.Image(cur_images[x]) for x in idx]\n",
-    "            top_scores = [scores[x] for x in idx]\n",
-    "            results.append([sample] + top_images + top_scores)\n",
-    "        \n",
-    "        # get clip 32 scores - TODO: this should be refactored as it is same code as above\n",
-    "        if add_clip_32:\n",
-    "            print('Calculating CLIP 32 scores')\n",
-    "            clip_inputs = processor32(text=batch, images=images, return_tensors='np', padding='max_length', max_length=77, truncation=True).data\n",
+    "        def add_clip_results(results, processor, p_clip, clip_params): \n",
+    "            clip_inputs = processor(text=batch, images=images, return_tensors='np', padding='max_length', max_length=77, truncation=True).data\n",
     "            # each shard will have one prompt, images need to be reorganized to be associated to the correct shard\n",
     "            images_per_prompt_indices = np.asarray(range(0, len(images), batch_size))\n",
     "            clip_inputs['pixel_values'] = jnp.concatenate(list(clip_inputs['pixel_values'][images_per_prompt_indices + i] for i in range(batch_size)))\n",
     "            clip_inputs = shard(clip_inputs)\n",
-    "            logits = p_clip32(clip_inputs, clip32_params)\n",
+    "            logits = p_clip(clip_inputs, clip_params)\n",
     "            logits = logits.reshape(-1, num_images)\n",
     "            top_scores = logits.argsort()[:, -top_k:][..., ::-1]\n",
     "            logits = jax.device_get(logits)\n",
@@ -342,13 +297,24 @@
     "            for i, (idx, scores, sample) in enumerate(zip(top_scores, logits, batch)):\n",
     "                if sample == padding_item: continue\n",
     "                cur_images = [images[x] for x in images_per_prompt_indices + i]\n",
-    "                top_images = [wandb.Image(cur_images[x]) for x in idx]\n",
-    "                top_scores = [scores[x] for x in idx]\n",
-    "                results32.append([sample] + top_images + top_scores)\n",
+    "                top_images = [wandb.Image(cur_images[x], caption=f'Score: {scores[x]:.2f}') for x in idx]\n",
+    "                results.append([sample] + top_images)\n",
+    "        \n",
+    "        # get clip scores\n",
+    "        pbar.set_description('Calculating CLIP 16 scores')\n",
+    "        add_clip_results(results16, processor16, p_clip16, clip16_params)\n",
+    "        \n",
+    "        # get clip 32 scores\n",
+    "        if add_clip_32:\n",
+    "            pbar.set_description('Calculating CLIP 32 scores')\n",
+    "            add_clip_results(results32, processor32, p_clip32, clip32_params)\n",
+    "\n",
     "        pbar.close()\n",
     "\n",
+    "    \n",
+    "\n",
     "    # log results\n",
-    "    table = wandb.Table(columns=columns, data=results)\n",
+    "    table = wandb.Table(columns=columns, data=results16)\n",
     "    run.log({'Samples': table, 'version': version})\n",
     "    wandb.finish()\n",
     "    \n",
@@ -363,15 +329,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4e4c7d0c-2848-4f88-b967-82fd571534f1",
+   "id": "415d3f54-7226-43de-9eea-4283a948dc93",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# TODO: not implemented\n",
-    "def log_runs(runs):\n",
-    "    for run in tqdm(runs):\n",
-    "        log_run(run)"
-   ]
+   "source": []
   }
  ],
 "metadata": {
{dev → tools}/inference/samples.txt RENAMED
@@ -32,7 +32,9 @@ illustration of an astronaut in a space suit playing guitar
 a clown wearing a spacesuit floating in space
 a dog playing with a ball
 a cat sits on top of an alligator
+a very cute cat laying by a big bike
 a rat holding a red lightsaber in a white background
+a very cute giraffe making a funny face
 A unicorn is passing by a rainbow in a field of flowers
 an elephant made of carrots
 an elephant on a unicycle during a circus
@@ -40,6 +42,7 @@ photography of a penguin watching television
 a penguin is walking on the Moon, Earth is in the background
 a penguin standing on a tower of books holds onto a rope from a helicopter
 rat wearing a crown
+Cartoon of a carrot with big eyes
 looking into the sky, 10 airplanes are seen overhead
 shelves filled with books and alchemy potion bottles
 this is a detailed high-resolution scan of a human brain
@@ -61,7 +64,6 @@ a cartoon of a superhero bear
 an illustration of a cute skeleton wearing a blue hoodie
 illustration of a baby shark swimming around corals
 an illustration of an avocado in a beanie riding a motorcycle
-Cartoon of a carrot with big eyes
 logo of a robot wearing glasses and reading a book
 illustration of a cactus lifting weigths
 logo of a cactus lifting weights
@@ -70,11 +72,12 @@ a skeleton with the shape of a spider
 a collection of glasses is sitting on a table
 a painting of a capybara sitting on a mountain during fall in surrealist style
 a pentagonal green clock
-a pixel art illustration of an eagle sitting in a field in the afternoon
 a small red block sitting on a large green block
 a storefront that has the word 'openai' written on it
 a tatoo of a black broccoli
 a variety of clocks is sitting on a table
+a table has a train model on it with other cars and things
+a pixel art illustration of an eagle sitting in a field in the afternoon
 an emoji of a baby fox wearing a blue hat, green gloves, red shirt, and yellow pants
 an emoji of a baby penguin wearing a blue hat, blue gloves, red shirt, and green pants
 an extreme close-up view of a capybara sitting in a field
@@ -86,10 +89,11 @@ urinals are lined up in a jungle
 a muscular banana sitting upright on a bench smoking watching a banana on television, high definition photography
 a human face
 a person is holding a phone and a waterbottle, running a marathon
+a child eating a birthday cake near some balloons
 Young woman riding her bike through the forest
 the best soccer team of the world
-the best basketball team of the world
 the best football team of the world
+the best basketball team of the world
 happy, happiness
 sad, sadness
 the representation of infinity
@@ -105,3 +109,12 @@ an avocado armchair flying into space
 a cute avocado armchair singing karaoke on stage in front of a crowd of strawberry shaped lamps
 an illustration of an avocado in a christmas sweater staring at its reflection in a mirror
 illustration of an avocado armchair getting married to a pineapple
+half human half cat
+half human half dog
+half human half pen
+half human half garbage
+half human half avocado
+half human half Eiffel tower
+a propaganda poster for transhumanism
+a propaganda poster for building a space elevator
+a beautiful epic fantasy painting of a space elevator