In [None]:
import csv
import tempfile
from functools import partial
import random
import numpy as np
from PIL import Image
import jax
import jax.numpy as jnp
from flax.training.common_utils import shard, shard_prng_key
from flax.jax_utils import replicate
import wandb
from dalle_mini.model import CustomFlaxBartForConditionalGeneration
from vqgan_jax.modeling_flax_vqgan import VQModel
from transformers import BartTokenizer, CLIPProcessor, FlaxCLIPModel
from dalle_mini.text import TextNormalizer

In [None]:
# ids of the W&B training runs to evaluate (only the first entry is used further down)
wandb_runs = ['rjf3rycy']
# VQGAN checkpoint repository and optional revision handed to `VQModel.from_pretrained`
VQGAN_REPO, VQGAN_COMMIT_ID = 'dalle-mini/vqgan_imagenet_f16_16384', None
# when True, captions are passed through TextNormalizer before tokenization
normalize_text = True

In [None]:
# number of captions per batch; samples.csv rows are padded to a multiple of this
batch_size = 8
# drives the number of generation rounds (num_images // jax.device_count()) and
# the row width used when reshaping the CLIP logits
num_images = 128
# number of highest-scoring images kept per row of CLIP logits
top_k = 8
# caption normalizer, or None when normalization is disabled
text_normalizer = TextNormalizer() if normalize_text else None

In [None]:
# a new random 32-bit seed is drawn on every execution, so generations are not
# reproducible across runs unless `seed` is fixed by hand
seed = random.randint(0, 2**32-1)
# root PRNG key; the generation loop splits a subkey off it on every round
key = jax.random.PRNGKey(seed)

In [None]:
# load the pretrained VQGAN (at the configured revision) plus the CLIP model
# and its matching processor
vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)
clip = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# replicate the model parameters with flax's `replicate` (presumably one copy per
# device, for use by the pmapped functions — confirm)
# NOTE(review): `clip_params` is not referenced by any visible cell; `p_clip`
# calls `clip(**inputs)` without passing params.
clip_params = replicate(clip.params)
vqgan_params = replicate(vqgan.params)

In [None]:
# Read every row of samples.csv and group the rows into batches of batch_size.
with open('samples.csv', newline='', encoding='utf8') as f:
    reader = csv.DictReader(f)
    samples = list(reader)
    # pad with "empty" rows until the row count is a multiple of batch_size
    samples_to_add = [{'Caption':'empty', 'Theme':'empty'}] * (-len(samples) % batch_size)
    samples += samples_to_add
    # cut the flat list into consecutive chunks of batch_size rows
    samples = [samples[start:start+batch_size] for start in range(0, len(samples), batch_size)]

In [None]:
len(samples)

In [None]:
samples[-1]

In [None]:
api = wandb.Api()

In [None]:
# TODO: iterate on runs
# for now only the first configured training run is processed
wandb_run = wandb_runs[0]
# flag checked by the cell that defines the pmapped functions, so they are built only once
functions_pmapped = False

In [None]:
# List the model artifact versions logged by the training run.
# Best-effort: a lookup failure is reported and treated as "no versions" so the
# cell still completes; later cells index `versions` and need a non-empty list.
try:
    versions = api.artifact_versions(type_name='bart_model', name=f'dalle-mini/dalle-mini/model-{wandb_run}', per_page=10000)
except Exception as err:  # not a bare except: KeyboardInterrupt/SystemExit propagate
    print(f'Could not retrieve artifact versions for run {wandb_run}: {err!r}')
    versions = []

In [None]:
versions, len(versions)

In [None]:
versions[0].version

In [None]:
artifact = versions[0]

In [None]:
version = int(artifact.version[1:])

In [None]:
version

In [None]:
# retrieve training run
training_run = api.run(f'dalle-mini/dalle-mini/{wandb_run}')
config = training_run.config

In [None]:
# see summary metrics
training_run.summary

In [None]:
# retrieve inference run details
def get_last_version_inference(run_id):
    """Return the last step logged by the inference run paired with a training run.

    Looks up the W&B run ``dalle-mini/dalle-mini/inference-{run_id}`` and reads
    the ``_step`` entry of its summary.

    Args:
        run_id: id of the training run; the inference run id is derived from it.

    Returns:
        The ``_step`` summary value, or None when it is missing or the run
        could not be retrieved.
    """
    try:
        inference_run = api.run(f'dalle-mini/dalle-mini/inference-{run_id}')
        return inference_run.summary.get('_step', None)
    except Exception:
        # Best-effort lookup: any failure is treated as "nothing logged yet".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return None

In [None]:
last_version_inference = get_last_version_inference(wandb_run)

In [None]:
# sanity-check that versions are logged in order and without gaps
if last_version_inference is None:
 # nothing found for the inference run: only version 0 may be logged now
 assert version == 0
elif last_version_inference >= version:
 print(f'Version {version} has already been logged')
else:
 # the version being processed must directly follow the last one logged
 assert version == last_version_inference + 1

In [None]:
run = wandb.init(job_type='inference', config=config, id=f'inference-{wandb_run}', resume='allow')

In [None]:
# Create the temporary directory that will receive the model files.
# When this cell is re-executed, the directory created by the previous execution
# is removed first; on a fresh kernel `tmp_f` does not exist yet, so it is skipped.
if 'tmp_f' in globals():
    tmp_f.cleanup()
tmp_f = tempfile.TemporaryDirectory()
tmp = tmp_f.name
#TODO: use context manager

In [None]:
# remove tmp
# NOTE(review): when cells are run top to bottom, this deletes the directory
# before the artifact files are downloaded into `tmp` further down — presumably
# meant to be run manually once the model has been loaded; confirm and move/guard.
tmp_f.cleanup()

In [None]:
artifact = run.use_artifact(artifact)

In [None]:
# only download required files
# (model config and weights plus the tokenizer files, all fetched into `tmp`)
for f in ['config.json', 'flax_model.msgpack', 'merges.txt', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.json']:
 artifact.get_path(f).download(tmp)

In [None]:
# we verify all the files are present
from pathlib import Path
list(Path(tmp).glob('*'))

In [None]:
tokenizer = BartTokenizer.from_pretrained(tmp)
model = CustomFlaxBartForConditionalGeneration.from_pretrained(tmp)

In [None]:
model_params = replicate(model.params)

In [None]:
# function to generate encoded images
# we should generate this function only once per run
if not functions_pmapped:
    @partial(jax.pmap, axis_name="batch")
    def p_generate(tokenized_prompt, key, params):
        """Sample encoded images for one shard of tokenized prompts.

        Runs `model.generate` with sampling enabled and a single beam, using
        the given PRNG key and model params.
        """
        return model.generate(
            **tokenized_prompt,
            do_sample=True,
            num_beams=1,
            prng_key=key,
            params=params
        )

    @partial(jax.pmap, axis_name="batch")
    def p_decode(indices, params):
        """Decode one shard of VQGAN code indices with the given VQGAN params."""
        return vqgan.decode_code(indices, params=params)

    @partial(jax.pmap, axis_name="batch")
    def p_clip(inputs):
        """Return CLIP `logits_per_image` for one shard of processor inputs."""
        return clip(**inputs).logits_per_image

    # remember that the pmapped functions exist, so re-running this cell
    # does not rebuild them
    functions_pmapped = True

In [None]:
# TODO: loop over samples
# only the first batch of samples is processed for now
batch = samples[0]
# raw captions (also used later as the text input of the CLIP processor)
prompts = [x['Caption'] for x in batch]
# captions fed to the tokenizer: normalized when normalize_text is set, raw otherwise
processed_prompts = [text_normalizer(x) for x in prompts] if normalize_text else prompts

In [None]:
processed_prompts

In [None]:
# repeat the whole prompt list once per device (presumably so that each device
# receives the full batch once the tokenized prompts are sharded — confirm)
repeated_prompts = processed_prompts * jax.device_count()

In [None]:
tokenized_prompt = tokenizer(repeated_prompts, return_tensors='jax', padding='max_length', truncation=True, max_length=128).data
tokenized_prompt = shard(tokenized_prompt)

In [None]:
tokenized_prompt['input_ids'].shape

In [None]:
# Generate and decode images round by round, collecting them as PIL images.
images = []
for _ in range(num_images // jax.device_count()):
    # split off a fresh subkey for this round; `key` itself is advanced
    key, subkey = jax.random.split(key, 2)

    generated = p_generate(tokenized_prompt, shard_prng_key(subkey), model_params)
    # drop the first token of every generated sequence
    encoded_images = generated.sequences[..., 1:]

    decoded_images = p_decode(encoded_images, vqgan_params)
    # clamp to [0, 1] and flatten the leading axes into a single image axis
    decoded_images = decoded_images.clip(0., 1.).reshape((-1, 256, 256, 3))

    # convert each decoded image to 8-bit and wrap it as a PIL image
    images.extend(
        Image.fromarray(np.asarray(img * 255, dtype=np.uint8))
        for img in decoded_images
    )
 

In [None]:
len(images)

In [None]:
images[0]

In [None]:
images[1]

In [None]:
clip_inputs = processor(text=prompts, images=images, return_tensors='np', padding='max_length', max_length=77, truncation=True).data

In [None]:
# each shard will have one prompt
clip_inputs['input_ids'].shape

In [None]:
# each shard needs to have the images corresponding to a specific prompt
clip_inputs['pixel_values'].shape

In [None]:
# positions 0, batch_size, 2*batch_size, ... in `images`; later cells add an
# offset i to select the images of prompt i (assumes `images` cycles through the
# batch_size prompts in order — TODO confirm against the generation loop)
images_per_prompt_indices = np.asarray(range(0, len(images), batch_size))
images_per_prompt_indices

In [None]:
# reorder so each shard will have correct images
# the pixel values are regrouped by offset: first every image at offset 0
# (indices 0, batch_size, ...), then every image at offset 1, and so on
clip_inputs['pixel_values'] = jnp.concatenate(list(clip_inputs['pixel_values'][images_per_prompt_indices + i] for i in range(batch_size)))

In [None]:
clip_inputs = shard(clip_inputs)

In [None]:
logits = p_clip(clip_inputs)

In [None]:
logits.shape

In [None]:
logits = logits.reshape(-1, num_images)

In [None]:
logits.shape

In [None]:
logits

In [None]:
# per row: sort ascending, keep the last top_k indices, then reverse them so the
# highest logit comes first
top_idx = logits.argsort()[:, -top_k:][..., ::-1]

In [None]:
len(images)

In [None]:
results = []
columns = ['Caption', 'Theme'] + [f'Image {i+1}' for i in range(top_k)] + [f'Score {i+1}' for i in range(top_k)]

In [None]:
# Build one result row per caption: caption, theme, its top images and their scores.
for i, (idx, scores, sample) in enumerate(zip(top_idx, logits, batch)):
    # images generated for caption i
    cur_images = [images[x] for x in images_per_prompt_indices + i]
    top_images = [wandb.Image(cur_images[x]) for x in idx]
    # `idx` indexes into this caption's own row of scores; indexing the full
    # `logits` matrix here would pick whole rows belonging to other captions
    top_scores = [scores[x] for x in idx]
    results.append([sample['Caption'], sample['Theme']] + top_images + top_scores)

In [None]:
wandb.finish()