alvanli committed
Commit 1f43fd8
1 Parent(s): 844bec9

Add cheese model

.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ .DS_Store
2
+ *.pyc
3
+ __pycache__
4
+ .pytest_cache
5
+ venv
6
+ runs/
7
+ data/
8
+
9
+ # Byte-compiled / optimized / DLL files
10
+ __pycache__/
11
+ *.py[cod]
12
+ *$py.class
13
+
14
+ # C extensions
15
+ *.so
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # poetry
106
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110
+ #poetry.lock
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ #pdm.lock
115
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
116
+ # in version control.
117
+ # https://pdm.fming.dev/#use-with-ide
118
+ .pdm.toml
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,16 @@
1
+ FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel as base
2
+ RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC
3
+
4
+ ENV HOME=/exp/fromage
5
+
6
+ RUN apt-get update && apt-get -y install git
7
+
8
+ WORKDIR /exp/fromage
9
+ COPY ./requirements.txt ./requirements.txt
10
+ RUN python -m pip install -r ./requirements.txt
11
+ RUN python -m pip install gradio
12
+
13
+ COPY . .
14
+ RUN chmod -R a+rwX .
15
+
16
+ CMD ["uvicorn", "app:main", "--host", "0.0.0.0", "--port", "7860"]
FROMAGe_example_notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: FROMAGe
- emoji: 🐨
+ emoji: 🧀
  colorFrom: pink
  colorTo: red
  sdk: docker
app.py ADDED
@@ -0,0 +1,125 @@
1
+ import os, time, copy
2
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "False"
3
+
4
+ from PIL import Image
5
+
6
+ import gradio as gr
7
+
8
+ import numpy as np
9
+ import torch
10
+ from transformers import logging
11
+ logging.set_verbosity_error()
12
+
13
+ from fromage import models
14
+ from fromage import utils
15
+
16
+ BASE_WIDTH = 512
17
+ MODEL_DIR = './fromage_model/fromage_vis4'
18
+
19
+ def upload_image(file):
20
+ return Image.open(file)
21
+
22
+ def upload_button_config():
23
+ return gr.update(visible=False)
24
+
25
+ def upload_textbox_config(text_in):
26
+ return gr.update(visible=True)
27
+
28
+
29
+ class ChatBotCheese:
30
+ def __init__(self):
31
+ from huggingface_hub import hf_hub_download
32
+ model_ckpt_path = hf_hub_download("alvanlii/fromage", "pretrained_ckpt.pth.tar")
33
+ self.model = models.load_fromage(MODEL_DIR, model_ckpt_path)
34
+ self.curr_image = None
35
+ self.chat_history = ''
36
+
37
+ def add_image(self, state, image_in):
38
+ state = state + [(f"![](/file={image_in.name})", "Ok, now type your message")]
39
+ self.curr_image = Image.open(image_in.name).convert('RGB')
40
+ return state, state
41
+
42
+ def save_im(self, image_pil):
43
+ file_name = f"{int(time.time())}_{np.random.randint(100)}.png"
44
+ image_pil.save(file_name)
45
+ return file_name
46
+
47
+ def chat(self, input_text, state, ret_scale_factor, num_ims, num_words, temp):
48
+ # model_outputs = ["heyo", []]
49
+ self.chat_history += f'Q: {input_text} \nA:'
50
+ if self.curr_image is not None:
51
+ model_outputs = self.model.generate_for_images_and_texts([self.curr_image, self.chat_history], num_words=num_words, max_num_rets=num_ims, ret_scale_factor=ret_scale_factor, temperature=temp)
52
+ else:
53
+ model_outputs = self.model.generate_for_images_and_texts([self.chat_history], max_num_rets=num_ims, num_words=num_words, ret_scale_factor=ret_scale_factor, temperature=temp)
54
+ self.chat_history += ' '.join([s for s in model_outputs if type(s) == str]) + '\n'
55
+
56
+ im_names = []
57
+ if len(model_outputs) > 1:
58
+ im_names = [self.save_im(im) for im in model_outputs[1]]
59
+
60
+ response = model_outputs[0]
61
+ for im_name in im_names:
62
+ response += f'<img src="/file={im_name}">'
63
+ state.append((input_text, response.replace("[RET]", "")))
64
+ self.curr_image = None
65
+ return state, state
66
+
67
+ def reset(self):
68
+ self.chat_history = ""
69
+ self.curr_image = None
70
+ return [], []
71
+
72
+ def main(self):
73
+ with gr.Blocks(css="#chatbot .overflow-y-auto{height:1500px}") as demo:
74
+ gr.Markdown(
75
+ """
76
+ ## FROMAGe
77
+ ### Grounding Language Models to Images for Multimodal Generation
78
+ Jing Yu Koh, Ruslan Salakhutdinov, Daniel Fried <br/>
79
+ [Paper](https://arxiv.org/abs/2301.13823) [Github](https://github.com/kohjingyu/fromage) <br/>
80
+ - Upload an image (optional)
81
+ - Chat with FROMAGe!
82
+ - Check out the examples at the bottom!
83
+ """
84
+ )
85
+
86
+ chatbot = gr.Chatbot(elem_id="chatbot")
87
+ gr_state = gr.State([])
88
+
89
+ with gr.Row():
90
+ with gr.Column(scale=0.85):
91
+ txt = gr.Textbox(show_label=False, placeholder="Upload an image first [Optional]. Then enter text and press enter.").style(container=False)
92
+ with gr.Column(scale=0.15, min_width=0):
93
+ btn = gr.UploadButton("🖼️", file_types=["image"])
94
+
95
+ with gr.Row():
96
+ with gr.Column(scale=0.20, min_width=0):
97
+ reset_btn = gr.Button("Reset Messages")
98
+ gr_ret_scale_factor = gr.Number(value=1.0, label="Increased prob of returning images", interactive=True)
99
+ gr_num_ims = gr.Number(value=3, precision=1, label="Max # of Images returned", interactive=True)
100
+ gr_num_words = gr.Number(value=32, precision=1, label="Max # of words returned", interactive=True)
101
+ gr_temp = gr.Number(value=0.0, label="Temperature", interactive=True)
102
+
103
+ with gr.Row():
104
+ gr.Image("example_1.png", label="Example 1")
105
+ gr.Image("example_2.png", label="Example 2")
106
+ gr.Image("example_3.png", label="Example 3")
107
+
108
+
109
+ txt.submit(self.chat, [txt, gr_state, gr_ret_scale_factor, gr_num_ims, gr_num_words, gr_temp], [gr_state, chatbot])
110
+ txt.submit(lambda :"", None, txt)
111
+ btn.upload(self.add_image, [gr_state, btn], [gr_state, chatbot])
112
+ reset_btn.click(self.reset, [], [gr_state, chatbot])
113
+
114
+ # chatbot.change(fn = upload_button_config, outputs=btn_upload)
115
+ # text_in.submit(None, [], [], _js = "() => document.getElementById('#chatbot-component').scrollTop = document.getElementById('#chatbot-component').scrollHeight")
116
+
117
+ demo.launch(share=False, server_name="0.0.0.0")
118
+
119
+ def main():
120
+ cheddar = ChatBotCheese()
121
+ cheddar.main()
122
+
123
+ if __name__ == "__main__":
124
+ cheddar = ChatBotCheese()
125
+ cheddar.main()
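
For reference, a minimal sketch of driving the same model outside of Gradio, mirroring what `ChatBotCheese.__init__` and `chat` above do. The checkpoint name and the `./fromage_model/fromage_vis4` assets are the ones added in this commit; the prompt string and the use of `example_1.png` are illustrative only.

```python
# Hypothetical standalone usage of the FROMAGe wrapper added in this commit.
# Assumes ./fromage_model/fromage_vis4 (model_args.json, cc3m_embeddings*.pkl)
# is present and that the fromage/ package is importable.
from PIL import Image
from huggingface_hub import hf_hub_download

from fromage import models

MODEL_DIR = './fromage_model/fromage_vis4'
ckpt_path = hf_hub_download("alvanlii/fromage", "pretrained_ckpt.pth.tar")
model = models.load_fromage(MODEL_DIR, ckpt_path)

# Same "Q: ... \nA:" prompt format that ChatBotCheese.chat accumulates.
image = Image.open("example_1.png").convert("RGB")
prompt = "Q: What is shown in this picture? \nA:"

outputs = model.generate_for_images_and_texts(
    [image, prompt],       # interleaved image/text prompt
    num_words=32,          # max tokens to generate
    max_num_rets=1,        # return at most one retrieved image
    ret_scale_factor=1.0,
    temperature=0.0)       # greedy decoding (top_p left at its default of 1.0)

# The result is an interleaved list of str and List[PIL.Image.Image] items.
for item in outputs:
    print(type(item), item if isinstance(item, str) else f"{len(item)} image(s)")
```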
example_1.png ADDED
example_2.png ADDED
example_3.png ADDED
fromage/__init__.py ADDED
File without changes
fromage/data.py ADDED
@@ -0,0 +1,129 @@
1
+ """Modified from https://github.com/mlfoundations/open_clip"""
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import collections
6
+ import logging
7
+ import os
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+ import torchvision.datasets as datasets
12
+ from torchvision import transforms as T
13
+ from PIL import Image, ImageFont
14
+ from torch.utils.data import Dataset
15
+
16
+ from fromage import utils
17
+
18
+
19
+ def collate_fn(batch):
20
+ batch = list(filter(lambda x: x is not None, batch))
21
+ return torch.utils.data.dataloader.default_collate(batch)
22
+
23
+
24
+ def get_dataset(args, split: str, tokenizer, precision: str = 'fp32') -> Dataset:
25
+ assert split in ['train', 'val'
26
+ ], f'Expected split to be one of "train" or "val", got {split} instead.'
27
+
28
+ dataset_paths = []
29
+ image_data_dirs = []
30
+ train = split == 'train'
31
+
32
+ # Default configs for datasets.
33
+ # Folder structure should look like:
34
+ if split == 'train':
35
+ if 'cc3m' in args.dataset:
36
+ dataset_paths.append(os.path.join(args.dataset_dir, 'cc3m_train.tsv'))
37
+ image_data_dirs.append(os.path.join(args.image_dir, 'cc3m/training/'))
38
+ else:
39
+ raise NotImplementedError
40
+
41
+ elif split == 'val':
42
+ if 'cc3m' in args.val_dataset:
43
+ dataset_paths.append(os.path.join(args.dataset_dir, 'cc3m_val.tsv'))
44
+ image_data_dirs.append(os.path.join(args.image_dir, 'cc3m/validation'))
45
+ else:
46
+ raise NotImplementedError
47
+
48
+ assert len(dataset_paths) == len(image_data_dirs) == 1, (dataset_paths, image_data_dirs)
49
+ else:
50
+ raise NotImplementedError
51
+
52
+ if len(dataset_paths) > 1:
53
+ print(f'{len(dataset_paths)} datasets requested: {dataset_paths}')
54
+ dataset = torch.utils.data.ConcatDataset([
55
+ CsvDataset(path, image_dir, tokenizer, 'image',
56
+ 'caption', args.visual_model, train=train, max_len=args.max_len, precision=args.precision,
57
+ image_size=args.image_size, retrieval_token_idx=args.retrieval_token_idx)
58
+ for (path, image_dir) in zip(dataset_paths, image_data_dirs)])
59
+ elif len(dataset_paths) == 1:
60
+ dataset = CsvDataset(dataset_paths[0], image_data_dirs[0], tokenizer, 'image',
61
+ 'caption', args.visual_model, train=train, max_len=args.max_len, precision=args.precision,
62
+ image_size=args.image_size, retrieval_token_idx=args.retrieval_token_idx)
63
+ else:
64
+ raise ValueError(f'There should be at least one valid dataset, got train={args.dataset}, val={args.val_dataset} instead.')
65
+ return dataset
66
+
67
+
68
+ class CsvDataset(Dataset):
69
+ def __init__(self, input_filename, base_image_dir, tokenizer, img_key,
70
+ caption_key, feature_extractor_model: str,
71
+ train: bool = True, max_len: int = 32, sep="\t", precision: str = 'fp32',
72
+ image_size: int = 224, retrieval_token_idx: int = -1):
73
+ logging.debug(f'Loading tsv data from {input_filename}.')
74
+ df = pd.read_csv(input_filename, sep=sep)
75
+
76
+ self.base_image_dir = base_image_dir
77
+ self.images = df[img_key].tolist()
78
+ self.captions = df[caption_key].tolist()
79
+ assert len(self.images) == len(self.captions)
80
+
81
+ self.feature_extractor_model = feature_extractor_model
82
+ self.feature_extractor = utils.get_feature_extractor_for_model(
83
+ feature_extractor_model, image_size=image_size, train=False)
84
+ self.image_size = image_size
85
+
86
+ self.tokenizer = tokenizer
87
+ self.max_len = max_len
88
+ self.precision = precision
89
+ self.retrieval_token_idx = retrieval_token_idx
90
+
91
+ self.font = None
92
+
93
+ logging.debug('Done loading data.')
94
+
95
+ def __len__(self):
96
+ return len(self.captions)
97
+
98
+ def __getitem__(self, idx):
99
+ while True:
100
+ image_path = os.path.join(self.base_image_dir, str(self.images[idx]))
101
+ caption = str(self.captions[idx])
102
+
103
+ try:
104
+ img = Image.open(image_path)
105
+ images = utils.get_pixel_values_for_model(self.feature_extractor, img)
106
+
107
+ caption += '[RET]'
108
+ tokenized_data = self.tokenizer(
109
+ caption,
110
+ return_tensors="pt",
111
+ padding='max_length',
112
+ truncation=True,
113
+ max_length=self.max_len)
114
+ tokens = tokenized_data.input_ids[0]
115
+
116
+ caption_len = tokenized_data.attention_mask[0].sum()
117
+
118
+ decode_caption = self.tokenizer.decode(tokens, skip_special_tokens=False)
119
+ self.font = self.font or ImageFont.load_default()
120
+ cap_img = utils.create_image_of_text(decode_caption.encode('ascii', 'ignore'), width=self.image_size, nrows=2, font=self.font)
121
+
122
+ if tokens[-1] not in [self.retrieval_token_idx, self.tokenizer.pad_token_id]:
123
+ tokens[-1] = self.retrieval_token_idx
124
+
125
+ return image_path, images, cap_img, tokens, caption_len
126
+ except Exception as e:
127
+ print(f'Error reading {image_path} with caption {caption}: {e}')
128
+ # Pick a new example at random.
129
+ idx = np.random.randint(0, len(self)-1)
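
A hedged sketch of how `get_dataset` above might be wired up for CC3M training. The `args` namespace, the tokenizer choice, and the `[RET]`-token registration below are illustrative assumptions (the actual training entry point is not part of this commit), but the field names match what the code above reads.

```python
# Illustrative wiring for get_dataset(); assumes the CC3M layout the code expects:
#   {dataset_dir}/cc3m_train.tsv and {image_dir}/cc3m/training/
import argparse
import torch
from transformers import AutoTokenizer

from fromage import data

tokenizer = AutoTokenizer.from_pretrained('facebook/opt-6.7b', use_fast=False)
tokenizer.add_tokens('[RET]')  # retrieval token appended to every caption
ret_token_idx = tokenizer('[RET]', add_special_tokens=False).input_ids[0]

args = argparse.Namespace(
    dataset='cc3m', val_dataset='cc3m',
    dataset_dir='data/', image_dir='data/images/',
    visual_model='openai/clip-vit-large-patch14',
    max_len=32, precision='fp32', image_size=224,
    retrieval_token_idx=ret_token_idx)

train_dataset = data.get_dataset(args, 'train', tokenizer)
loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, shuffle=True, collate_fn=data.collate_fn)

# Each item is (image_path, pixel_values, caption_image, tokens, caption_len),
# matching CsvDataset.__getitem__ above.
image_path, pixel_values, caption_img, tokens, caption_len = train_dataset[0]
```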
fromage/evaluate.py ADDED
@@ -0,0 +1,307 @@
1
+ import collections
2
+ import json
3
+ import os
4
+ from PIL import Image
5
+ import numpy as np
6
+ import time
7
+ import tqdm
8
+ import torch
9
+ import torch.distributed as dist
10
+ from torch.utils.tensorboard import SummaryWriter
11
+ from torchmetrics import BLEUScore
12
+ import torchvision
+ from torch.utils.data import Subset  # used below for the auxiliary val split
13
+
14
+ from fromage import data  # provides collate_fn for the auxiliary val loader
+ from fromage import losses as losses_utils
15
+ from fromage import utils
16
+
17
+
18
+ def validate(val_loader, model, tokenizer, criterion, epoch, args):
19
+ ngpus_per_node = torch.cuda.device_count()
20
+ writer = SummaryWriter(args.log_dir)
21
+ bleu_scorers = [BLEUScore(n_gram=i) for i in [1, 2, 3, 4]]
22
+ actual_step = (epoch + 1) * args.steps_per_epoch
23
+ model_modes = ['captioning', 'retrieval']
24
+ num_words = 32 # Number of tokens to generate.
25
+
26
+ feature_extractor = utils.get_feature_extractor_for_model(args.visual_model, image_size=args.image_size, train=False)
27
+
28
+ def get_pixel_values_from_path(path: str):
29
+ img = Image.open(path)
30
+ img = img.resize((args.image_size, args.image_size))
31
+ pixel_values = utils.get_pixel_values_for_model(feature_extractor, img)[None, ...]
32
+
33
+ if args.precision == 'fp16':
34
+ pixel_values = pixel_values.half()
35
+ elif args.precision == 'bf16':
36
+ pixel_values = pixel_values.bfloat16()
37
+ if torch.cuda.is_available():
38
+ pixel_values = pixel_values.cuda()
39
+ return pixel_values
40
+
41
+ def run_validate(loader, base_progress=0):
42
+ with torch.no_grad():
43
+ end = time.time()
44
+ all_generated_captions = []
45
+ all_gt_captions = []
46
+ all_generated_image_paths = []
47
+ all_image_features = []
48
+ all_text_features = []
49
+
50
+ for i, (image_paths, images, caption_images, tgt_tokens, token_len) in tqdm.tqdm(enumerate(loader), position=0, total=len(loader)):
51
+ i = base_progress + i
52
+
53
+ if torch.cuda.is_available():
54
+ tgt_tokens = tgt_tokens.cuda(args.gpu, non_blocking=True)
55
+ token_len = token_len.cuda(args.gpu, non_blocking=True)
56
+ images = images.cuda()
57
+
58
+ if args.precision == 'fp16':
59
+ images = images.half()
60
+ elif args.precision == 'bf16':
61
+ images = images.bfloat16()
62
+
63
+ for model_mode in model_modes:
64
+ (model_output, full_labels, last_embedding, _, visual_embs) = model(
65
+ images, tgt_tokens, token_len, mode=model_mode, input_prefix=args.input_prompt, inference=True) # (N, T, C)
66
+
67
+ if model_mode == 'captioning':
68
+ loss = args.cap_loss_scale * model_output.loss
69
+ elif model_mode == 'retrieval':
70
+ loss = args.ret_loss_scale * model_output.loss
71
+ else:
72
+ raise NotImplementedError
73
+
74
+ output = model_output.logits
75
+ if model_mode == 'captioning':
76
+ acc1, acc5 = utils.accuracy(output[:, :-1, :], full_labels[:, 1:], -100, topk=(1, 5))
77
+ top1.update(acc1[0], images.size(0))
78
+ top5.update(acc5[0], images.size(0))
79
+ ce_losses.update(loss.item(), images.size(0))
80
+
81
+ if model_mode == 'captioning':
82
+ losses.update(loss.item(), images.size(0))
83
+ elif model_mode == 'retrieval':
84
+ if args.distributed:
85
+ original_last_embedding = torch.clone(last_embedding)
86
+ all_visual_embs = [torch.zeros_like(visual_embs) for _ in range(dist.get_world_size())]
87
+ all_last_embedding = [torch.zeros_like(last_embedding) for _ in range(dist.get_world_size())]
88
+ dist.all_gather(all_visual_embs, visual_embs)
89
+ dist.all_gather(all_last_embedding, last_embedding)
90
+
91
+ # Overwrite with embeddings produced on this replica, which track the gradients.
92
+ all_visual_embs[dist.get_rank()] = visual_embs
93
+ all_last_embedding[dist.get_rank()] = last_embedding
94
+ visual_embs = torch.cat(all_visual_embs)
95
+ last_embedding = torch.cat(all_last_embedding)
96
+ start_idx = args.rank * images.shape[0]
97
+ end_idx = start_idx + images.shape[0]
98
+ assert torch.all(last_embedding[start_idx:end_idx] == original_last_embedding), args.rank
99
+
100
+ all_text_features.append(last_embedding.cpu())
101
+ all_image_features.append(visual_embs.cpu())
102
+
103
+ # Run auto-regressive generation sample
104
+ if model_mode == 'captioning':
105
+ input_embs = model.module.model.get_visual_embs(images, mode='captioning') # (2, n_visual_tokens, D)
106
+ if args.input_prompt is not None:
107
+ print(f'Adding prefix "{args.input_prompt}" to captioning generate=True.')
108
+ prompt_ids = tokenizer(args.input_prompt, add_special_tokens=False, return_tensors="pt").input_ids
109
+ prompt_ids = prompt_ids.to(visual_embs.device)
110
+ prompt_embs = model.module.model.input_embeddings(prompt_ids)
111
+ prompt_embs = prompt_embs.repeat(input_embs.shape[0], 1, 1)
112
+ input_embs = torch.cat([input_embs, prompt_embs], dim=1)
113
+
114
+ generated_ids, _, _ = model(input_embs, tgt_tokens, token_len,
115
+ generate=True, num_words=num_words, temperature=0.0, top_p=1.0,
116
+ min_word_tokens=num_words)
117
+
118
+ if args.distributed and ngpus_per_node > 1:
119
+ all_generated_ids = [torch.zeros_like(generated_ids) for _ in range(dist.get_world_size())]
120
+ dist.all_gather(all_generated_ids, generated_ids)
121
+ all_generated_ids[dist.get_rank()] = generated_ids
122
+ generated_ids = torch.cat(all_generated_ids)
123
+
124
+ all_tgt_tokens = [torch.zeros_like(tgt_tokens) for _ in range(dist.get_world_size())]
125
+ dist.all_gather(all_tgt_tokens, tgt_tokens)
126
+ all_tgt_tokens[dist.get_rank()] = tgt_tokens
127
+ all_tgt_tokens = torch.cat(all_tgt_tokens)
128
+
129
+ all_image_paths = [[None for _ in image_paths] for _ in range(dist.get_world_size())]
130
+ dist.all_gather_object(all_image_paths, image_paths)
131
+ all_image_paths[dist.get_rank()] = image_paths
132
+ image_paths = []
133
+ for p in all_image_paths:
134
+ image_paths.extend(p)
135
+ else:
136
+ all_tgt_tokens = tgt_tokens
137
+
138
+ all_tgt_tokens[all_tgt_tokens == -100] = tokenizer.pad_token_id
139
+ generated_captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
140
+ gt_captions = tokenizer.batch_decode(all_tgt_tokens, skip_special_tokens=True)
141
+
142
+ for cap_i in range(len(generated_captions)):
143
+ image_path = image_paths[cap_i]
144
+ all_generated_image_paths.append(image_path)
145
+ stop_idx = generated_captions[cap_i].find('.')
146
+ if stop_idx > 5:
147
+ all_generated_captions.append(generated_captions[cap_i][:stop_idx])
148
+ else:
149
+ all_generated_captions.append(generated_captions[cap_i])
150
+ all_gt_captions.append([gt_captions[cap_i]])
151
+ elif model_mode == 'retrieval':
152
+ if i == 0:
153
+ # Generate without image input to visualize text-generation ability.
154
+ input_ids = tgt_tokens[:, :3] # Use first 3 tokens as initial prompt for generation.
155
+ input_embs = model.module.model.input_embeddings(input_ids) # (N, T, D)
156
+ generated_ids, _, _ = model(input_embs, tgt_tokens, token_len, generate=True, num_words=num_words, temperature=0.0, top_p=1.0)
157
+ generated_ids = torch.cat([input_ids, generated_ids], dim=1)
158
+ generated_captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
159
+ gt_captions = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=False)
160
+ else:
161
+ raise NotImplementedError
162
+
163
+ if i == 0:
164
+ max_to_display = 5
165
+ print('=' * 30)
166
+ print('Generated samples:')
167
+ for cap_i, cap in enumerate(generated_captions[:max_to_display]):
168
+ print(f'{cap_i}) {cap}')
169
+ print('=' * 30)
170
+ print('Real samples:')
171
+ for cap_i, cap in enumerate(gt_captions[:max_to_display]):
172
+ print(f'{cap_i}) {cap}')
173
+ print('=' * 30)
174
+
175
+ # Write images and captions to Tensorboard.
176
+ if not args.distributed or (args.rank % ngpus_per_node == 0):
177
+ max_images_to_show = 16
178
+ normalized_images = images - images.min()
179
+ normalized_images /= normalized_images.max() # (N, 3, H, W)
180
+ # Create generated caption text.
181
+ generated_cap_images = torch.stack([
182
+ utils.create_image_of_text(
183
+ generated_captions[j].encode('ascii', 'ignore'),
184
+ width=normalized_images.shape[3],
185
+ color=(255, 255, 0))
186
+ for j in range(normalized_images.shape[0])], axis=0)
187
+ # Append gt/generated caption images.
188
+ display_images = torch.cat([normalized_images.float().cpu(), caption_images, generated_cap_images], axis=2)[:max_images_to_show]
189
+ grid = torchvision.utils.make_grid(display_images, nrow=int(max_images_to_show ** 0.5), padding=4)
190
+ writer.add_image(f'val/images_{model_mode}', grid, actual_step)
191
+
192
+ # measure elapsed time
193
+ batch_time.update(time.time() - end)
194
+ end = time.time()
195
+
196
+ if i % args.print_freq == 0:
197
+ progress.display(i + 1)
198
+
199
+ if i == args.val_steps_per_epoch - 1:
200
+ break
201
+
202
+ # Measure captioning metrics.
203
+ path2captions = collections.defaultdict(list)
204
+ for image_path, caption in zip(all_generated_image_paths, all_gt_captions):
205
+ assert len(caption) == 1, caption
206
+ path2captions[image_path].append(caption[0].replace('[RET]', ''))
207
+ full_gt_captions = [path2captions[path] for path in all_generated_image_paths]
208
+
209
+ print(f'Computing BLEU with {len(all_generated_captions)} generated captions:'
210
+ f'{all_generated_captions[:5]} and {len(full_gt_captions)} groundtruth captions:',
211
+ f'{full_gt_captions[:5]}.')
212
+ bleu1_score = bleu_scorers[0](all_generated_captions, full_gt_captions)
213
+ bleu1.update(bleu1_score, 1)
214
+ bleu2_score = bleu_scorers[1](all_generated_captions, full_gt_captions)
215
+ bleu2.update(bleu2_score, 1)
216
+ bleu3_score = bleu_scorers[2](all_generated_captions, full_gt_captions)
217
+ bleu3.update(bleu3_score, 2)
218
+ bleu4_score = bleu_scorers[3](all_generated_captions, full_gt_captions)
219
+ bleu4.update(bleu4_score, 3)
220
+
221
+ # Measure retrieval metrics over the entire validation set.
222
+ all_image_features = torch.cat(all_image_features, axis=0) # (coco_val_len, 2048)
223
+ all_text_features = torch.cat(all_text_features, axis=0) # (coco_val_len, 2048)
224
+
225
+ print(f"Computing similarity between {all_image_features.shape} and {all_text_features.shape}.")
226
+ logits_per_image = all_image_features @ all_text_features.t()
227
+ logits_per_text = logits_per_image.t()
228
+ all_image_acc1, all_image_acc5 = losses_utils.contrastive_acc(logits_per_image, topk=(1, 5))
229
+ all_caption_acc1, all_caption_acc5 = losses_utils.contrastive_acc(logits_per_text, topk=(1, 5))
230
+ image_loss = losses_utils.contrastive_loss(logits_per_image)
231
+ caption_loss = losses_utils.contrastive_loss(logits_per_text)
232
+
233
+ loss = args.ret_loss_scale * (image_loss + caption_loss) / 2.0
234
+ losses.update(loss.item(), logits_per_image.size(0))
235
+ top1_caption.update(all_caption_acc1.item(), logits_per_image.size(0))
236
+ top5_caption.update(all_caption_acc5.item(), logits_per_image.size(0))
237
+ top1_image.update(all_image_acc1.item(), logits_per_image.size(0))
238
+ top5_image.update(all_image_acc5.item(), logits_per_image.size(0))
239
+
240
+
241
+ batch_time = utils.AverageMeter('Time', ':6.3f', utils.Summary.AVERAGE)
242
+ losses = utils.AverageMeter('Loss', ':.4e', utils.Summary.AVERAGE)
243
+ ce_losses = utils.AverageMeter('CeLoss', ':.4e', utils.Summary.AVERAGE)
244
+ top1 = utils.AverageMeter('Acc@1', ':6.2f', utils.Summary.AVERAGE)
245
+ top5 = utils.AverageMeter('Acc@5', ':6.2f', utils.Summary.AVERAGE)
246
+ bleu1 = utils.AverageMeter('BLEU@1', ':6.2f', utils.Summary.AVERAGE)
247
+ bleu2 = utils.AverageMeter('BLEU@2', ':6.2f', utils.Summary.AVERAGE)
248
+ bleu3 = utils.AverageMeter('BLEU@3', ':6.2f', utils.Summary.AVERAGE)
249
+ bleu4 = utils.AverageMeter('BLEU@4', ':6.2f', utils.Summary.AVERAGE)
250
+ top1_caption = utils.AverageMeter('CaptionAcc@1', ':6.2f', utils.Summary.AVERAGE)
251
+ top5_caption = utils.AverageMeter('CaptionAcc@5', ':6.2f', utils.Summary.AVERAGE)
252
+ top1_image = utils.AverageMeter('ImageAcc@1', ':6.2f', utils.Summary.AVERAGE)
253
+ top5_image = utils.AverageMeter('ImageAcc@5', ':6.2f', utils.Summary.AVERAGE)
254
+
255
+ progress = utils.ProgressMeter(
256
+ len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))),
257
+ [batch_time, losses, top1, top5, bleu4],
258
+ prefix='Test: ')
259
+
260
+ # switch to evaluate mode
261
+ model.eval()
262
+
263
+ run_validate(val_loader)
264
+ if args.distributed:
265
+ batch_time.all_reduce()
266
+ losses.all_reduce()
267
+ bleu1.all_reduce()
268
+ bleu2.all_reduce()
269
+ bleu3.all_reduce()
270
+ bleu4.all_reduce()
271
+ top1.all_reduce()
272
+ top5.all_reduce()
273
+ top1_caption.all_reduce()
274
+ top5_caption.all_reduce()
275
+ top1_image.all_reduce()
276
+ top5_image.all_reduce()
277
+
278
+ if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)):
279
+ aux_val_dataset = Subset(val_loader.dataset,
280
+ range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset)))
281
+ aux_val_loader = torch.utils.data.DataLoader(
282
+ aux_val_dataset, batch_size=(args.val_batch_size or args.batch_size), shuffle=False,
283
+ num_workers=args.workers, pin_memory=True, collate_fn=data.collate_fn)
284
+ run_validate(aux_val_loader, len(val_loader))
285
+
286
+ progress.display_summary()
287
+
288
+ writer.add_scalar('val/total_secs_per_batch', batch_time.avg, actual_step)
289
+ writer.add_scalar('val/seq_top1_acc', top1.avg, actual_step)
290
+ writer.add_scalar('val/seq_top5_acc', top5.avg, actual_step)
291
+ writer.add_scalar('val/ce_loss', losses.avg, actual_step)
292
+ writer.add_scalar('val/bleu1', bleu1.avg, actual_step)
293
+ writer.add_scalar('val/bleu2', bleu2.avg, actual_step)
294
+ writer.add_scalar('val/bleu3', bleu3.avg, actual_step)
295
+ writer.add_scalar('val/bleu4', bleu4.avg, actual_step)
296
+ writer.add_scalar('val/contrastive_loss', losses.avg, actual_step)
297
+ writer.add_scalar('val/t2i_top1_acc', top1_caption.avg, actual_step)
298
+ writer.add_scalar('val/t2i_top5_acc', top5_caption.avg, actual_step)
299
+ writer.add_scalar('val/i2t_top1_acc', top1_image.avg, actual_step)
300
+ writer.add_scalar('val/i2t_top5_acc', top5_image.avg, actual_step)
301
+ writer.add_scalar('val/top1_acc', (top1_caption.avg + top1_image.avg) / 2.0, actual_step)
302
+ writer.add_scalar('val/top5_acc', (top5_caption.avg + top5_image.avg) / 2.0, actual_step)
303
+
304
+ writer.close()
305
+
306
+ # Use top1 accuracy as the metric for keeping the best checkpoint.
307
+ return top1_caption.avg
fromage/losses.py ADDED
@@ -0,0 +1,44 @@
1
+ from typing import Optional
2
+ import torch
3
+ from fromage import utils
4
+
5
+ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
6
+ return torch.nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
7
+
8
+
9
+ def contrastive_acc(logits: torch.Tensor, target: Optional[torch.Tensor] = None, topk=(1,)) -> torch.Tensor:
10
+ """
11
+ Args:
12
+ logits: (N, N) predictions.
13
+ target: (N, num_correct_answers) labels.
14
+ """
15
+ assert len(logits.shape) == 2, logits.shape
16
+ batch_size = logits.shape[0]
17
+
18
+ if target is None:
19
+ target = torch.arange(len(logits), device=logits.device)
20
+ return utils.accuracy(logits, target, -1, topk)
21
+ else:
22
+ assert len(target.shape) == 2, target.shape
23
+ with torch.no_grad():
24
+ maxk = max(topk)
25
+ if logits.shape[-1] < maxk:
26
+ print(f"[WARNING] Less than {maxk} predictions available. Using {logits.shape[-1]} for topk.")
27
+ maxk = min(maxk, logits.shape[-1])
28
+
29
+ # Take topk along the last dimension.
30
+ _, pred = logits.topk(maxk, -1, True, True) # (N, topk)
31
+ assert pred.shape == (batch_size, maxk)
32
+
33
+ target_expand = target[:, :, None].repeat(1, 1, maxk) # (N, num_correct_answers, topk)
34
+ pred_expand = pred[:, None, :].repeat(1, target.shape[1], 1) # (N, num_correct_answers, topk)
35
+ correct = pred_expand.eq(target_expand) # (N, num_correct_answers, topk)
36
+ correct = torch.any(correct, dim=1) # (N, topk)
37
+
38
+ res = []
39
+ for k in topk:
40
+ any_k_correct = torch.clamp(correct[:, :k].sum(1), max=1) # (N,)
41
+ correct_k = any_k_correct.float().sum(0, keepdim=True)
42
+ res.append(correct_k.mul_(100.0 / batch_size))
43
+ return res
44
+
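
For context on how these helpers are consumed in `fromage/evaluate.py` above: retrieval quality is scored by building an (N, N) image-text similarity matrix and applying the symmetric cross-entropy, with matched pairs on the diagonal. A minimal sketch, with random embeddings standing in for model outputs:

```python
# Minimal sketch of the symmetric contrastive objective, mirroring the
# retrieval block of evaluate.py; random embeddings stand in for the
# visual_embs / last_embedding tensors produced by the model.
import torch
from fromage import losses as losses_utils

N, D = 8, 256
image_embs = torch.nn.functional.normalize(torch.randn(N, D), dim=-1)
text_embs = torch.nn.functional.normalize(torch.randn(N, D), dim=-1)

logits_per_image = image_embs @ text_embs.t()  # (N, N) cosine similarities
logits_per_text = logits_per_image.t()

# Row i should score highest against column i (the matched pair).
loss = (losses_utils.contrastive_loss(logits_per_image)
        + losses_utils.contrastive_loss(logits_per_text)) / 2.0
acc1, acc5 = losses_utils.contrastive_acc(logits_per_image, topk=(1, 5))
print(f'loss={loss.item():.3f} R@1={acc1.item():.1f}% R@5={acc5.item():.1f}%')
```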
fromage/models.py ADDED
@@ -0,0 +1,658 @@
1
+ from typing import Callable, List, Optional, Tuple, Union
2
+ from collections import namedtuple
3
+ import json
4
+ import glob
5
+ import math
6
+ import numpy as np
7
+ import os
8
+ import torch
9
+ from torch import Tensor
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange
13
+ from functools import partial
14
+ import pickle as pkl
15
+ from PIL import Image, UnidentifiedImageError
16
+
17
+ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
18
+ from transformers import OPTForCausalLM, GPT2Tokenizer
19
+ from transformers import CLIPVisionModel, CLIPVisionConfig
20
+
21
+ from fromage import utils
22
+
23
+
24
+ class FrozenArgs:
25
+ freeze_lm: bool = True
26
+ freeze_vm: bool = True
27
+ opt_version: str = 'facebook/opt-6.7b'
28
+ visual_encoder: str = 'openai/clip-vit-large-patch14'
29
+ n_visual_tokens: int = 1
30
+ image_embed_dropout_prob: float = 0.0
31
+ task: str = 'captioning'
32
+ shared_emb_dim: Optional[int] = 256
33
+ text_emb_layers: List[int] = [-1]
34
+ retrieval_token_idx: int = 0
35
+
36
+
37
+ class FromageModel(nn.Module):
38
+ def __init__(self, tokenizer, args: FrozenArgs = FrozenArgs()):
39
+ super().__init__()
40
+ self.tokenizer = tokenizer
41
+ self.feature_extractor = utils.get_feature_extractor_for_model(args.visual_encoder, train=False)
42
+ self.image_token = self.tokenizer.cls_token_id
43
+ assert len(args.text_emb_layers) == len(set(args.text_emb_layers)), 'text_emb_layers not unique'
44
+ self.args = args
45
+
46
+ opt_version = args.opt_version
47
+ visual_encoder = args.visual_encoder
48
+ n_visual_tokens = args.n_visual_tokens
49
+ print(f"Using {opt_version} for the language model.")
50
+ print(f"Using {visual_encoder} for the visual model with {n_visual_tokens} visual tokens.")
51
+
52
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
53
+
54
+ if 'facebook/opt' in opt_version:
55
+ self.lm = OPTForCausalLM.from_pretrained(opt_version)
56
+ else:
57
+ raise NotImplementedError
58
+
59
+ self.opt_version = opt_version
60
+
61
+ if self.args.freeze_lm:
62
+ self.lm.eval()
63
+ print("Freezing the LM.")
64
+ for param in self.lm.parameters():
65
+ param.requires_grad = False
66
+ else:
67
+ self.lm.train()
68
+
69
+ self.retrieval_token_idx = args.retrieval_token_idx
70
+ print(f'Initializing embedding for the retrieval token [RET] (id = {self.retrieval_token_idx}).')
71
+ self.lm.resize_token_embeddings(len(tokenizer))
72
+
73
+ self.input_embeddings = self.lm.get_input_embeddings()
74
+
75
+ print("Restoring pretrained weights for the visual model.")
76
+ if 'clip' in visual_encoder:
77
+ self.visual_model = CLIPVisionModel.from_pretrained(visual_encoder)
78
+ else:
79
+ self.visual_model = AutoModel.from_pretrained(visual_encoder)
80
+
81
+ if 'clip' in visual_encoder:
82
+ hidden_size = self.visual_model.config.hidden_size
83
+ else:
84
+ raise NotImplementedError
85
+
86
+ if self.args.freeze_vm:
87
+ print("Freezing the VM.")
88
+ self.visual_model.eval()
89
+ for param in self.visual_model.parameters():
90
+ param.requires_grad = False
91
+ else:
92
+ self.visual_model.train()
93
+
94
+ self.visual_model_name = visual_encoder
95
+
96
+ embedding_dim = self.input_embeddings.embedding_dim * self.args.n_visual_tokens
97
+ self.text_hidden_fcs = nn.ModuleList([])
98
+ if self.args.shared_emb_dim is None:
99
+ if len(self.args.text_emb_layers) == 1:
100
+ if (self.args.text_emb_layers[0] in [-1, self.lm.config.num_hidden_layers]) and ('bert' not in opt_version):
101
+ out_dim = self.lm.config.word_embed_proj_dim
102
+ else:
103
+ out_dim = self.lm.config.hidden_size
104
+ else:
105
+ if (-1 in self.args.text_emb_layers) or (self.lm.config.num_hidden_layers in self.args.text_emb_layers) \
106
+ and (self.lm.config.word_embed_proj_dim != self.lm.config.hidden_size):
107
+ raise ValueError('No projection dim specified but model uses last output layer and an intermediate one (which have different dims).')
108
+ else:
109
+ out_dim = self.lm.config.hidden_size
110
+ else:
111
+ out_dim = self.args.shared_emb_dim
112
+
113
+ for layer_idx in self.args.text_emb_layers:
114
+ if (layer_idx == -1 or layer_idx == self.lm.config.num_hidden_layers) and ('bert' not in opt_version):
115
+ in_dim = self.lm.config.word_embed_proj_dim
116
+
117
+ text_fc = [nn.Linear(in_dim, out_dim), nn.Dropout(self.args.text_embed_dropout_prob)]
118
+ self.text_hidden_fcs.append(nn.Sequential(*text_fc))
119
+
120
+ elif layer_idx < self.lm.config.num_hidden_layers:
121
+ text_fc = [nn.Linear(self.lm.config.hidden_size, out_dim), nn.Dropout(self.args.text_embed_dropout_prob)]
122
+ self.text_hidden_fcs.append(nn.Sequential(*text_fc))
123
+ else:
124
+ raise ValueError(f'Embedding of layer {layer_idx} was requested but model only has {self.lm.config.num_hidden_layers} layers.')
125
+
126
+ self.visual_embeddings = nn.Linear(hidden_size, embedding_dim)
127
+ self.visual_fc = nn.Linear(hidden_size, out_dim)
128
+
129
+ self.image_dropout = nn.Dropout(self.args.image_embed_dropout_prob)
130
+
131
+
132
+ def get_visual_embs(self, pixel_values: torch.FloatTensor, mode: str = 'captioning'):
133
+ if mode not in ['captioning', 'retrieval']:
134
+ raise ValueError(f'mode should be one of ["captioning", "retrieval"], got {mode} instead.')
135
+
136
+ # Extract visual embeddings from the vision encoder.
137
+ if 'clip' in self.visual_model_name:
138
+ outputs = self.visual_model(pixel_values)
139
+ encoder_outputs = outputs.pooler_output
140
+ else:
141
+ raise NotImplementedError
142
+
143
+ # Use the correct fc based on function argument.
144
+ if mode == 'captioning':
145
+ visual_embs = self.visual_embeddings(encoder_outputs) # (2, D * n_visual_tokens)
146
+ visual_embs = torch.reshape(visual_embs, (visual_embs.shape[0], self.args.n_visual_tokens, -1))
147
+ elif mode == 'retrieval':
148
+ visual_embs = self.visual_fc(encoder_outputs) # (2, D * n_visual_tokens)
149
+ visual_embs = torch.reshape(visual_embs, (visual_embs.shape[0], 1, -1))
150
+ else:
151
+ raise NotImplementedError
152
+
153
+ visual_embs = self.image_dropout(visual_embs)
154
+ return visual_embs
155
+
156
+
157
+ def train(self, mode=True):
158
+ super(FromageModel, self).train(mode=mode)
159
+ # Overwrite train() to ensure Frozen models remain frozen.
160
+ if self.args.freeze_lm:
161
+ self.lm.eval()
162
+ if self.args.freeze_vm:
163
+ self.visual_model.eval()
164
+
165
+
166
+ def forward(
167
+ self,
168
+ pixel_values: torch.FloatTensor,
169
+ labels: torch.LongTensor,
170
+ caption_len: torch.LongTensor,
171
+ mode: str = 'captioning',
172
+ concat_captions: bool = False,
173
+ input_prefix: Optional[str] = None,
174
+ inference: bool = False,
175
+ ):
176
+ visual_embs = self.get_visual_embs(pixel_values, mode)
177
+
178
+ batch_size, vis_seq_len, _ = visual_embs.shape # vis_seq_len = n_visual_tokens
179
+ if labels is not None:
180
+ assert labels.shape[0] == batch_size, (visual_embs.shape, labels.shape)
181
+
182
+ input_embs = self.input_embeddings(labels) # (N, T, D)
183
+
184
+ last_embedding_idx = caption_len - 1 # -1 to retrieve the token before the eos token
185
+
186
+ if input_prefix is not None:
187
+ prompt_ids = self.tokenizer(input_prefix, add_special_tokens=False, return_tensors="pt").input_ids
188
+ prompt_ids = prompt_ids.to(visual_embs.device)
189
+ prompt_embs = self.input_embeddings(prompt_ids)
190
+ prompt_embs = prompt_embs.repeat(batch_size, 1, 1)
191
+ assert prompt_embs.shape[0] == batch_size, prompt_embs.shape
192
+ assert prompt_embs.shape[2] == input_embs.shape[2], prompt_embs.shape
193
+ assert len(prompt_embs.shape) == 3, prompt_embs.shape
194
+
195
+ if mode == 'captioning':
196
+ # Concat to text embeddings.
197
+ condition_seq_len = 0
198
+ if input_prefix is None:
199
+ # Just add visual embeddings.
200
+ input_embs = torch.cat([visual_embs, input_embs], axis=1)
201
+ last_embedding_idx += vis_seq_len
202
+ condition_seq_len += vis_seq_len
203
+ full_labels = torch.zeros(visual_embs.shape[:2], dtype=torch.int64).to(visual_embs.device) - 100
204
+ else:
205
+ # Add visual and prompt embeddings.
206
+ prefix_embs = torch.cat([visual_embs, prompt_embs], axis=1)
207
+ input_embs = torch.cat([prefix_embs, input_embs], axis=1)
208
+
209
+ last_embedding_idx += prefix_embs.shape[1]
210
+ condition_seq_len += prefix_embs.shape[1]
211
+ full_labels = torch.zeros(prefix_embs.shape[:2], dtype=torch.int64).to(visual_embs.device) - 100
212
+
213
+ # Mask out embedding tokens in the labels.
214
+ full_labels = torch.cat([full_labels, labels], axis=1)
215
+
216
+ pad_idx = []
217
+
218
+ for label in full_labels:
219
+ for k, token in enumerate(label):
220
+ # Mask out retrieval token if it exists.
221
+ if token in [self.tokenizer.pad_token_id, self.retrieval_token_idx]:
222
+ label[k:] = -100
223
+ pad_idx.append(k)
224
+ break
225
+ if k == len(label) - 1: # No padding found.
226
+ pad_idx.append(k + 1)
227
+ assert len(pad_idx) == batch_size, (len(pad_idx), batch_size)
228
+
229
+ bs, seq_len, embs_dim = input_embs.shape
230
+ if concat_captions:
231
+ assert len(input_embs.shape) == 3, input_embs
232
+ assert len(full_labels.shape) == 2, full_labels
233
+ assert batch_size % 2 == 0
234
+ all_concat_input_embs = []
235
+ all_concat_labels = []
236
+
237
+ # Rearrange embeddings and labels (and their padding) to concatenate captions.
238
+ for i in range(batch_size // 2):
239
+ first_idx = i * 2
240
+ second_idx = first_idx + 1
241
+ first_emb = input_embs[first_idx, :pad_idx[first_idx], :]
242
+ first_labels = full_labels[first_idx, :pad_idx[first_idx]]
243
+ first_padding = input_embs[first_idx, pad_idx[first_idx]:, :]
244
+ first_labels_padding = full_labels[first_idx, pad_idx[first_idx]:]
245
+
246
+ second_emb = input_embs[second_idx, :pad_idx[second_idx], :]
247
+ second_labels = full_labels[second_idx, :pad_idx[second_idx]]
248
+ second_padding = input_embs[second_idx, pad_idx[second_idx]:, :]
249
+ second_labels_padding = full_labels[second_idx, pad_idx[second_idx]:]
250
+
251
+ assert torch.all(first_labels_padding == -100), first_labels_padding
252
+ assert torch.all(second_labels_padding == -100), second_labels_padding
253
+ concat_input_embs = torch.cat([first_emb, second_emb, first_padding, second_padding], axis=0) # (T*2, 768)
254
+ concat_labels = torch.cat([first_labels, second_labels, first_labels_padding, second_labels_padding], axis=0) # (T*2, 768)
255
+ all_concat_input_embs.append(concat_input_embs)
256
+ all_concat_labels.append(concat_labels)
257
+
258
+ # Pad to max length.
259
+ input_embs = torch.stack(all_concat_input_embs, axis=0) # (N/2, T*2, 768)
260
+ full_labels = torch.stack(all_concat_labels, axis=0) # (N/2, T*2, 768)
261
+ assert input_embs.shape == (bs // 2, seq_len * 2, embs_dim), input_embs.shape
262
+ assert full_labels.shape == (bs // 2, seq_len * 2), full_labels.shape
263
+
264
+ output = self.lm(inputs_embeds=input_embs,
265
+ labels=full_labels,
266
+ output_hidden_states=True)
267
+ elif mode == 'retrieval':
268
+ full_labels = torch.clone(labels)
269
+ if input_prefix is not None:
270
+ print(f'Adding prefix "{input_prefix}" to retrieval.')
271
+ # Add prompt embeddings.
272
+ prefix_embs = prompt_embs
273
+ input_embs = torch.cat([prefix_embs, input_embs], axis=1)
274
+ last_embedding_idx += prefix_embs.shape[1]
275
+ full_labels = torch.cat([
276
+ torch.zeros(prefix_embs.shape[:2], dtype=torch.int64).to(labels.device) - 100,
277
+ full_labels
278
+ ], axis=1)
279
+
280
+ pad_idx = []
281
+ for label in full_labels:
282
+ for k, token in enumerate(label):
283
+ if token == self.tokenizer.pad_token_id:
284
+ label[k:] = -100
285
+ pad_idx.append(k)
286
+ break
287
+ if k == len(label) - 1: # No padding found.
288
+ pad_idx.append(k + 1)
289
+ assert len(pad_idx) == batch_size, (len(pad_idx), batch_size)
290
+
291
+ output = self.lm(inputs_embeds=input_embs,
292
+ labels=full_labels,
293
+ output_hidden_states=True)
294
+ else:
295
+ raise NotImplementedError
296
+
297
+ last_embedding = None
298
+ last_output_logit = None
299
+ hidden_states = []
300
+
301
+ if mode == 'retrieval':
302
+ if self.args.shared_emb_dim is not None:
303
+ for idx, fc_layer in zip(self.args.text_emb_layers, self.text_hidden_fcs):
304
+ hidden_states.append(fc_layer(output.hidden_states[idx])) # (N, seq_len, 2048)
305
+ else:
306
+ for idx in self.args.text_emb_layers:
307
+ hidden_states.append(output.hidden_states[idx])
308
+
309
+ # Add hidden states together.
310
+ last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
311
+
312
+ if not concat_captions:
313
+ last_embedding = torch.stack([last_hidden_state[i, last_embedding_idx[i], :] for i in range(batch_size)], axis=0) # (N, D)
314
+ last_output_logit = torch.stack([output.logits[i, last_embedding_idx[i] - 1, :] for i in range(batch_size)], axis=0) # (N, D)
315
+ else:
316
+ # Concatenate two captioning examples together.
317
+ all_last_embedding = []
318
+ all_last_output_logit = []
319
+ for i in range(batch_size // 2):
320
+ first_last_embedding_idx, second_last_embedding_idx = all_last_embedding_idx[i]
321
+ first_last_embedding = last_hidden_state[i, first_last_embedding_idx, :] # (N, D)
322
+ first_last_output_logit = output.logits[i, first_last_embedding_idx - 1, :] # (N, D)
323
+ second_last_embedding = last_hidden_state[i, second_last_embedding_idx, :] # (N, D)
324
+ second_last_output_logit = output.logits[i, second_last_embedding_idx - 1, :] # (N, D)
325
+ all_last_embedding.append(first_last_embedding)
326
+ all_last_embedding.append(second_last_embedding)
327
+ all_last_output_logit.append(first_last_output_logit)
328
+ all_last_output_logit.append(second_last_output_logit)
329
+
330
+ last_embedding = torch.stack(all_last_embedding)
331
+ last_output_logit = torch.stack(all_last_output_logit)
332
+
333
+ # Compute retrieval loss.
334
+ assert visual_embs.shape[1] == 1, visual_embs.shape
335
+ visual_embs = visual_embs[:, 0, :]
336
+ visual_embs = visual_embs / visual_embs.norm(dim=1, keepdim=True)
337
+ last_embedding = last_embedding / last_embedding.norm(dim=1, keepdim=True)
338
+
339
+ # cosine similarity as logits
340
+ logit_scale = self.logit_scale.exp()
341
+ visual_embs = logit_scale * visual_embs
342
+ elif mode == 'captioning':
343
+ pass
344
+ else:
345
+ raise NotImplementedError
346
+
347
+ return output, full_labels, last_embedding, last_output_logit, visual_embs
348
+
349
+ def generate(self, embeddings: torch.FloatTensor, max_len: int = 32,
350
+ temperature: float = 0.0, top_p: float = 1.0, min_word_tokens: int = 0,
351
+ ret_scale_factor: float = 1.0, filter_value: float = -float('Inf')):
352
+ """Runs greedy decoding and returns generated captions.
353
+
354
+ Args:
355
+ embeddings: Input condition that the model uses for autoregressive generation.
356
+ max_len: Maximum number of tokens to generate.
357
+ temperature: Used to modulate logit distribution.
358
+ top_p: If set to < 1, the smallest set of tokens with highest probabilities that add up to top_p or higher are kept for generation.
359
+ min_word_tokens: Minimum number of words to generate before allowing a [RET] output.
360
+ ret_scale_factor: Proportion to scale [RET] token logits by. A higher value may increase the probability of the model generating [RET] outputs.
361
+ filter_value: Value to assign to tokens that should never be generated.
362
+ Outputs:
363
+ out: (N, T) int32 sequence of output tokens.
364
+ output_embeddings: (N, T, 256) sequence of text output embeddings.
365
+ """
366
+ self.lm.eval()
367
+
368
+ with torch.no_grad(): # no tracking history
369
+ batch_size, s, _ = embeddings.shape
370
+ # init output with image tokens
371
+ out = None
372
+ past_key_values = None
373
+ output_embeddings = []
374
+ output_logits = []
375
+
376
+ for i in range(max_len):
377
+ if 'opt' in self.opt_version:
378
+ output = self.lm(inputs_embeds=embeddings, use_cache=False, output_hidden_states=True)
379
+ else:
380
+ if i == 0:
381
+ output = self.lm(inputs_embeds=embeddings, use_cache=True, past_key_values=None, output_hidden_states=True)
382
+ else:
383
+ output = self.lm(input_ids=out[:, -1:], use_cache=True, past_key_values=past_key_values, output_hidden_states=True)
384
+
385
+ # Collect and sum the hidden states.
386
+ hidden_states = []
387
+ if self.args.shared_emb_dim is not None:
388
+ for idx, fc_layer in zip(self.args.text_emb_layers, self.text_hidden_fcs):
389
+ hidden_states.append(fc_layer(output.hidden_states[idx])) # (N, seq_len, 2048)
390
+ else:
391
+ for idx in self.args.text_emb_layers:
392
+ hidden_states.append(output.hidden_states[idx])
393
+ # Add hidden states together.
394
+ last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1) # (N, T, 256)
395
+ last_embedding = last_hidden_state / last_hidden_state.norm(dim=-1, keepdim=True)
396
+ output_embeddings.append(last_embedding)
397
+
398
+ logits = output.logits[:, -1, :] # (N, vocab_size)
399
+ if top_p == 1.0:
400
+ logits = logits.cpu()
401
+ output_logits.append(logits)
402
+
403
+ if self.retrieval_token_idx != -1 and self.retrieval_token_idx is not None:
404
+ if i < min_word_tokens:
405
+ # Eliminate probability of generating [RET] if this is earlier than min_word_tokens.
406
+ logits[:, self.retrieval_token_idx] = filter_value
407
+ else:
408
+ # Multiply by scaling factor.
409
+ logits[:, self.retrieval_token_idx] = logits[:, self.retrieval_token_idx] * ret_scale_factor
410
+
411
+ past_key_values = output.past_key_values
412
+
413
+ if temperature == 0.0:
414
+ if top_p != 1.0:
415
+ raise ValueError('top_p cannot be set if temperature is 0 (greedy decoding).')
416
+ next_token = torch.argmax(logits, keepdim=True, dim=-1) # (N, 1)
417
+ else:
418
+ logits = logits / temperature
419
+
420
+ # Apply top-p filtering.
421
+ if top_p < 1.0:
422
+ assert top_p > 0, f'top_p should be above 0, got {top_p} instead.'
423
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True) # (N, D) and (N, D)
424
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # (N, D)
425
+
426
+ # Remove tokens with cumulative probability above the threshold
427
+ sorted_indices_to_remove = cumulative_probs > top_p
428
+ # Shift the indices to the right to keep also the first token above the threshold
429
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
430
+ sorted_indices_to_remove[..., 0] = 0
431
+
432
+ for j in range(sorted_indices.shape[0]):
433
+ indices_to_remove = sorted_indices[j, sorted_indices_to_remove[j, :]]
434
+ logits[j, indices_to_remove] = filter_value
435
+
436
+ token_weights = logits.exp() # (N, vocab_size)
437
+ next_token = torch.multinomial(token_weights, 1) # (N, 1)
438
+
439
+ next_token = next_token.long().to(embeddings.device)
440
+ if out is not None:
441
+ out = torch.cat([out, next_token], dim=-1)
442
+ else:
443
+ out = next_token
444
+
445
+ if 'opt' in self.opt_version:
446
+ next_embedding = self.input_embeddings(next_token)
447
+ embeddings = torch.cat([embeddings, next_embedding], dim=1)
448
+ elif (self.tokenizer.eos_token_id and (next_token == self.tokenizer.eos_token_id).all()):
449
+ # End of generation.
450
+ break
451
+
452
+ return out, output_embeddings, output_logits
453
+
454
+
455
+ class Fromage(nn.Module):
456
+ def __init__(self, tokenizer, model_args: Optional[FrozenArgs] = None,
457
+ path_array: Optional[List[str]] = None, emb_matrix: Optional[torch.tensor] = None):
458
+ super().__init__()
459
+ self.model = FromageModel(tokenizer, model_args)
460
+ self.path_array = path_array
461
+ self.emb_matrix = emb_matrix
462
+
463
+ def __call__(self, images: Tensor, tgt_tokens: Optional[Tensor] = None, caption_len: Optional[Tensor] = None,
464
+ generate: bool = False, num_words: int = 32, temperature: float = 1.0, top_p: float = 1.0,
465
+ ret_scale_factor: float = 1.0, min_word_tokens: int = 0,
466
+ mode: str = 'captioning', concat_captions: bool = False,
467
+ input_prefix: Optional[str] = None, inference: bool = False) -> Tensor:
468
+ if generate:
469
+ return self.model.generate(images, num_words, temperature=temperature, top_p=top_p,
470
+ min_word_tokens=min_word_tokens, ret_scale_factor=ret_scale_factor)
471
+ else:
472
+ output = self.model(
473
+ pixel_values = images,
474
+ labels = tgt_tokens,
475
+ caption_len = caption_len,
476
+ mode = mode,
477
+ concat_captions = concat_captions,
478
+ input_prefix = input_prefix,
479
+ inference = inference)
480
+ return output
481
+
482
+ def generate_for_images_and_texts(
483
+ self, prompts: List, num_words: int = 0, ret_scale_factor: float = 1.0, top_p: float = 1.0, temperature: float = 0.0,
484
+ max_num_rets: int = 1):
485
+ """
486
+ Encode prompts into embeddings.
487
+
488
+ Args:
489
+ prompts: List of interleaved PIL.Image.Image and strings representing input to the model.
490
+ num_words: Maximum number of words to generate for. If num_words = 0, the model will run its forward pass and return the outputs.
491
+ ret_scale_factor: Proportion to scale [RET] token logits by. A higher value may increase the probability of the model generating [RET] outputs.
492
+ top_p: If set to < 1, the smallest set of tokens with highest probabilities that add up to top_p or higher are kept for generation.
493
+ temperature: Used to modulate logit distribution.
494
+ max_num_rets: Maximum number of images to return in one generation pass.
495
+ Returns:
496
+ return_outputs: List consisting of either str or List[PIL.Image.Image] objects, representing image-text interleaved model outputs.
497
+ """
498
+ input_embs = []
499
+ input_ids = []
500
+ add_bos = True
501
+
502
+ for i, p in enumerate(prompts):
503
+ if type(p) == Image.Image:
504
+ # Encode as image.
505
+ pixel_values = utils.get_pixel_values_for_model(self.model.feature_extractor, p)
506
+ pixel_values = pixel_values.to(device=self.model.logit_scale.device, dtype=self.model.logit_scale.dtype)
507
+ pixel_values = pixel_values[None, ...]
508
+
509
+ visual_embs = self.model.get_visual_embs(pixel_values, mode='captioning') # (1, n_visual_tokens, D)
510
+ input_embs.append(visual_embs)
511
+ elif type(p) == str:
512
+ text_ids = self.model.tokenizer(p, add_special_tokens=True, return_tensors="pt").input_ids.to(self.model.logit_scale.device)
513
+ if not add_bos:
514
+ # Remove <bos> tag.
515
+ text_ids = text_ids[:, 1:]
516
+ else:
517
+ # Only add <bos> once.
518
+ add_bos = False
519
+
520
+ text_embs = self.model.input_embeddings(text_ids) # (1, T, D)
521
+ input_embs.append(text_embs)
522
+ input_ids.append(text_ids)
523
+ else:
524
+ raise ValueError(f'Input prompts should be either PIL.Image.Image or str types, got {type(p)} instead.')
525
+ input_embs = torch.cat(input_embs, dim=1)
526
+ input_ids = torch.cat(input_ids, dim=1)
527
+
528
+ if num_words == 0:
529
+ generated_ids = input_ids
530
+ outputs = self.model.lm(inputs_embeds=input_embs, use_cache=False, output_hidden_states=True)
531
+ # Map outputs to embeddings, so we can retrieve embeddings from the [RET] tokens.
532
+ out = []
533
+ for x, fc in zip(self.model.args.text_emb_layers, self.model.text_hidden_fcs):
534
+ out.append(fc(outputs.hidden_states[x]))
535
+ embeddings = torch.stack(out, dim=-1).sum(dim=-1)
536
+ embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True) # (N, T, 256)
537
+ elif num_words > 0:
538
+ generated_ids, generated_embeddings, _ = self.model.generate(input_embs, num_words,
539
+ temperature=temperature, top_p=top_p, ret_scale_factor=ret_scale_factor)
540
+ embeddings = generated_embeddings[-1][:, input_embs.shape[1]:]
541
+
542
+ # Truncate to newline.
543
+ newline_token_id = self.model.tokenizer('\n', add_special_tokens=False).input_ids[0]
544
+ trunc_idx = 0
545
+ for j in range(generated_ids.shape[1]):
546
+ if generated_ids[0, j] == newline_token_id:
547
+ trunc_idx = j
548
+ break
549
+ if trunc_idx > 0:
550
+ generated_ids = generated_ids[:, :trunc_idx]
551
+ embeddings = embeddings[:, :trunc_idx]
552
+ else:
553
+ raise ValueError
554
+
555
+ # Save outputs as an interleaved list.
556
+ return_outputs = []
557
+ # Find up to max_num_rets [RET] tokens, and their corresponding scores.
558
+ all_ret_idx = [i for i, x in enumerate(generated_ids[0, :] == self.model.retrieval_token_idx) if x][:max_num_rets]
559
+ seen_image_idx = [] # Avoid showing the same image multiple times.
560
+
561
+ last_ret_idx = 0
562
+ if len(all_ret_idx) == 0:
563
+ # No [RET] tokens.
564
+ caption = self.model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
565
+ return_outputs.append(utils.truncate_caption(caption))
566
+ else:
567
+ for ret_idx in all_ret_idx:
568
+ ret_emb = embeddings[:, ret_idx, :]
569
+ scores = self.emb_matrix @ ret_emb.T
570
+
571
+ # Downweight seen images.
572
+ for seen_idx in seen_image_idx:
573
+ scores[seen_idx, :] -= 1000
574
+
575
+ # Get the top 3 candidate images for this [RET] token.
576
+ _, top_image_idx = scores.squeeze().topk(3)
577
+ image_outputs = []
578
+ for img_idx in top_image_idx:
579
+ # Find the first image that does not error out.
580
+ try:
581
+ seen_image_idx.append(img_idx)
582
+ img = utils.get_image_from_url(self.path_array[img_idx])
583
+ image_outputs.append(img)
584
+ if len(image_outputs) == max_num_rets:
585
+ break
586
+ except UnidentifiedImageError:
587
+ pass
588
+
589
+ caption = self.model.tokenizer.batch_decode(generated_ids[:, last_ret_idx:ret_idx], skip_special_tokens=True)[0]
590
+ last_ret_idx = ret_idx + 1
591
+ return_outputs.append(utils.truncate_caption(caption) + ' [RET]')
592
+ return_outputs.append(image_outputs)
593
+
594
+ return return_outputs
595
+
596
+
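The interleaved-prompt API documented above lends itself to a short usage sketch. The snippet below is illustrative only: it assumes a `model` instance produced by `load_fromage` (defined next) and a local image file `cat.png`, neither of which is provided by this commit.

# Hypothetical usage of Fromage.generate_for_images_and_texts (illustrative only).
from PIL import Image

img = Image.open('cat.png')                      # assumed local image, not part of this commit
prompts = [img, 'Q: What animal is this? A:']    # interleaved image + text prompt
outputs = model.generate_for_images_and_texts(
    prompts, num_words=32, ret_scale_factor=1.3, max_num_rets=1)

for out in outputs:
    if isinstance(out, str):
        print(out)                               # generated text, may end with ' [RET]'
    else:
        print(f'retrieved {len(out)} image(s)')  # list of PIL.Image.Image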
597
+ def load_fromage(model_dir: str, ckpt_path: str) -> Fromage:
598
+ model_args_path = os.path.join(model_dir, 'model_args.json')
599
+ model_ckpt_path = ckpt_path
600
+ embs_paths = [s for s in glob.glob(os.path.join(model_dir, 'cc3m_embeddings*.pkl'))]
601
+
602
+ if not os.path.exists(model_args_path):
603
+ raise ValueError(f'model_args.json does not exist in {model_dir}.')
604
+ if not os.path.exists(model_ckpt_path):
605
+ raise ValueError(f'Model checkpoint {model_ckpt_path} does not exist.')
606
+ if len(embs_paths) == 0:
607
+ raise ValueError(f'cc3m_embeddings*.pkl files do not exist in {model_dir}.')
608
+
609
+ # Load embeddings.
610
+ # Construct embedding matrix for nearest neighbor lookup.
611
+ path_array = []
612
+ emb_matrix = []
613
+
614
+ # These were precomputed for all CC3M images with `model.get_visual_embs(image, mode='retrieval')`.
615
+ for p in embs_paths:
616
+ with open(p, 'rb') as wf:
617
+ train_embs_data = pkl.load(wf)
618
+ path_array.extend(train_embs_data['paths'])
619
+ emb_matrix.append(train_embs_data['embeddings'])
620
+ emb_matrix = np.concatenate(emb_matrix, axis=0)
621
+
622
+ # Number of paths should be equal to number of embeddings.
623
+ assert len(path_array) == emb_matrix.shape[0], (len(path_array), emb_matrix.shape[0])
624
+
625
+ with open(model_args_path, 'r') as f:
626
+ model_kwargs = json.load(f)
627
+
628
+ # Initialize tokenizer.
629
+ tokenizer = GPT2Tokenizer.from_pretrained(model_kwargs['opt_version'])
630
+ tokenizer.pad_token = tokenizer.eos_token
631
+ # Add special tokens to the model to enable [RET].
632
+ tokenizer.add_special_tokens({"cls_token": "<|image|>"})
633
+ tokenizer.add_tokens('[RET]')
634
+ ret_token_idx = tokenizer('[RET]', add_special_tokens=False).input_ids
635
+ assert len(ret_token_idx) == 1, ret_token_idx
636
+ model_kwargs['retrieval_token_idx'] = ret_token_idx[0]
637
+ args = namedtuple('args', model_kwargs)(**model_kwargs)
638
+
639
+ # Initialize model for inference.
640
+ model = Fromage(tokenizer, args, path_array=path_array, emb_matrix=emb_matrix)
641
+ model = model.eval()
642
+ model = model.bfloat16()
643
+ model = model.cuda()
644
+
645
+ # Load pretrained linear mappings and [RET] embeddings.
646
+ checkpoint = torch.load(model_ckpt_path)
647
+ model.load_state_dict(checkpoint['state_dict'], strict=False)
648
+ with torch.no_grad():
649
+ model.model.input_embeddings.weight[model.model.retrieval_token_idx, :].copy_(checkpoint['state_dict']['ret_input_embeddings.weight'].cpu().detach())
650
+
651
+ logit_scale = model.model.logit_scale.exp()
652
+ emb_matrix = torch.tensor(emb_matrix, dtype=logit_scale.dtype).to(logit_scale.device)
653
+ emb_matrix = emb_matrix / emb_matrix.norm(dim=1, keepdim=True)
654
+ emb_matrix = logit_scale * emb_matrix
655
+ model.emb_matrix = emb_matrix
656
+
657
+ return model
658
+
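A minimal loading sketch follows; the directory and checkpoint filenames are assumptions based on the repository layout and the error messages above, not paths guaranteed to exist.

# Hypothetical loading sketch for load_fromage (paths are assumptions).
from fromage import models

model_dir = 'fromage_model/'                          # must contain model_args.json and cc3m_embeddings*.pkl
ckpt_path = 'fromage_model/pretrained_ckpt.pth.tar'   # assumed checkpoint filename
model = models.load_fromage(model_dir, ckpt_path)     # returns a Fromage wrapper in bf16 on CUDA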
fromage/utils.py ADDED
@@ -0,0 +1,250 @@
1
+ from enum import Enum
2
+ import subprocess
3
+ import sys
4
+ import shutil
5
+ import torch
6
+ import torch.distributed as dist
7
+ from torchvision.transforms import functional as F
8
+ from torchvision import transforms as T
9
+ from transformers import AutoFeatureExtractor
10
+ from PIL import Image, ImageDraw, ImageFont, ImageOps
11
+ import requests
12
+ from io import BytesIO
13
+
14
+ import random
15
+
16
+
17
+ def dump_git_status(out_file=sys.stdout, exclude_file_patterns=['*.ipynb', '*.th', '*.sh', '*.txt', '*.json']):
18
+ """Logs the current git revision and diff to out_file (stdout by default)."""
19
+ subprocess.call('git rev-parse HEAD', shell=True, stdout=out_file)
20
+ subprocess.call('echo', shell=True, stdout=out_file)
21
+ exclude_string = ''
22
+ subprocess.call('git --no-pager diff -- . {}'.format(exclude_string), shell=True, stdout=out_file)
23
+
24
+
25
+ def get_image_from_url(url: str):
26
+ response = requests.get(url)
27
+ img = Image.open(BytesIO(response.content))
28
+ img = img.resize((224, 224))
29
+ img = img.convert('RGB')
30
+ return img
31
+
32
+
33
+ def truncate_caption(caption: str) -> str:
34
+ """Truncate captions at periods and newlines."""
35
+ trunc_index = caption.find('\n') + 1
36
+ if trunc_index <= 0:
37
+ trunc_index = caption.find('.') + 1
38
+ caption = caption[:trunc_index]
39
+ return caption
40
+
41
+
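A tiny illustration of the truncation behaviour, including the edge case where neither delimiter is present; the inputs are made-up strings.

# Illustrative only: truncate_caption keeps text up to the first newline or period.
from fromage.utils import truncate_caption

print(truncate_caption('a cat on a mat. extra text'))  # 'a cat on a mat.'
print(truncate_caption('first line\nsecond line'))     # 'first line\n'
print(truncate_caption('no delimiter here'))           # '' (empty string)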
42
+ def pad_to_size(x, size=256):
43
+ delta_w = size - x.size[0]
44
+ delta_h = size - x.size[1]
45
+ padding = (
46
+ delta_w // 2,
47
+ delta_h // 2,
48
+ delta_w - (delta_w // 2),
49
+ delta_h - (delta_h // 2),
50
+ )
51
+ new_im = ImageOps.expand(x, padding)
52
+ return new_im
53
+
54
+
55
+ class RandCropResize(object):
56
+
57
+ """
58
+ Randomly crops an image, then randomly resizes it, then randomly crops again, mirroring the augmentations from https://arxiv.org/abs/2102.12092.
59
+ """
60
+
61
+ def __init__(self, target_size):
62
+ self.target_size = target_size
63
+
64
+ def __call__(self, img):
65
+ img = pad_to_size(img, self.target_size)
66
+ d_min = min(img.size)
67
+ img = T.RandomCrop(size=d_min)(img)
68
+ t_min = min(d_min, round(9 / 8 * self.target_size))
69
+ t_max = min(d_min, round(12 / 8 * self.target_size))
70
+ t = random.randint(t_min, t_max + 1)
71
+ img = T.Resize(t)(img)
72
+ if min(img.size) < 256:
73
+ img = T.Resize(256)(img)
74
+ return T.RandomCrop(size=self.target_size)(img)
75
+
76
+
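A short sketch of how this augmentation might be composed into a torchvision pipeline; the 224-pixel target and the extra flip are assumptions for illustration, not settings fixed by this commit.

# Hypothetical training transform built around RandCropResize (values are illustrative).
from PIL import Image
from torchvision import transforms as T
from fromage.utils import RandCropResize

train_transform = T.Compose([
    RandCropResize(224),         # pad -> random crop -> random resize -> random crop to 224
    T.RandomHorizontalFlip(),
    T.ToTensor(),
])
img = Image.open('example.jpg')  # assumed local image
x = train_transform(img)         # torch.Tensor of shape (3, 224, 224) for an RGB input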
77
+ class SquarePad(object):
78
+ """Pads image to square.
79
+ From https://discuss.pytorch.org/t/how-to-resize-and-pad-in-a-torchvision-transforms-compose/71850/9
80
+ """
81
+ def __call__(self, image):
82
+ max_wh = max(image.size)
83
+ p_left, p_top = [(max_wh - s) // 2 for s in image.size]
84
+ p_right, p_bottom = [max_wh - (s+pad) for s, pad in zip(image.size, [p_left, p_top])]
85
+ padding = (p_left, p_top, p_right, p_bottom)
86
+ return F.pad(image, padding, 0, 'constant')
87
+
88
+
89
+ def create_image_of_text(text: str, width: int = 224, nrows: int = 2, color=(255, 255, 255), font=None) -> torch.Tensor:
90
+ """Creates a (3, nrows * 14, width) image of text.
91
+
92
+ Returns:
93
+ cap_img: (3, 14 * nrows, width) image of wrapped text.
94
+ """
95
+ height = 12
96
+ padding = 5
97
+ effective_width = width - 2 * padding
98
+ # Create a black image to draw text on.
99
+ cap_img = Image.new('RGB', (effective_width * nrows, height), color = (0, 0, 0))
100
+ draw = ImageDraw.Draw(cap_img)
101
+ draw.text((0, 0), text, color, font=font or ImageFont.load_default())
102
+ cap_img = F.convert_image_dtype(F.pil_to_tensor(cap_img), torch.float32) # (3, height, W * nrows)
103
+ cap_img = torch.split(cap_img, effective_width, dim=-1) # List of nrow elements of shape (3, height, W)
104
+ cap_img = torch.cat(cap_img, dim=1) # (3, height * nrows, W)
105
+ # Add zero padding.
106
+ cap_img = torch.nn.functional.pad(cap_img, [padding, padding, 0, padding])
107
+ return cap_img
108
+
109
+
110
+ def get_feature_extractor_for_model(model_name: str, image_size: int = 224, train: bool = True):
111
+ print(f'Using HuggingFace AutoFeatureExtractor for {model_name}.')
112
+ feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
113
+ return feature_extractor
114
+
115
+
116
+ def get_pixel_values_for_model(feature_extractor, img):
117
+ pixel_values = feature_extractor(
118
+ img.convert('RGB'),
119
+ return_tensors="pt").pixel_values[0, ...] # (3, H, W)
120
+ return pixel_values
121
+
122
+
123
+ def save_checkpoint(state, is_best, filename='checkpoint'):
124
+ torch.save(state, filename + '.pth.tar')
125
+ if is_best:
126
+ shutil.copyfile(filename + '.pth.tar', filename + '_best.pth.tar')
127
+
128
+
129
+ def accuracy(output, target, padding, topk=(1,)):
130
+ """Computes the accuracy over the k top predictions for the specified values of k"""
131
+ with torch.no_grad():
132
+ maxk = max(topk)
133
+ if output.shape[-1] < maxk:
134
+ print(f"[WARNING] Fewer than {maxk} predictions available. Using {output.shape[-1]} for topk.")
135
+
136
+ maxk = min(maxk, output.shape[-1])
137
+ batch_size = target.size(0)
138
+
139
+ # Take topk along the last dimension.
140
+ _, pred = output.topk(maxk, -1, True, True) # (N, T, topk)
141
+
142
+ mask = (target != padding).type(target.dtype)
143
+ target_expand = target[..., None].expand_as(pred)
144
+ correct = pred.eq(target_expand)
145
+ correct = correct * mask[..., None].expand_as(correct)
146
+
147
+ res = []
148
+ for k in topk:
149
+ correct_k = correct[..., :k].reshape(-1).float().sum(0, keepdim=True)
150
+ res.append(correct_k.mul_(100.0 / mask.sum()))
151
+ return res
152
+
153
+
154
+ def get_params_count(model, max_name_len: int = 60):
155
+ params = [(name[:max_name_len], p.numel(), str(tuple(p.shape)), p.requires_grad) for name, p in model.named_parameters()]
156
+ total_trainable_params = sum([x[1] for x in params if x[-1]])
157
+ total_nontrainable_params = sum([x[1] for x in params if not x[-1]])
158
+ return params, total_trainable_params, total_nontrainable_params
159
+
160
+
161
+ def get_params_count_str(model, max_name_len: int = 60):
162
+ padding = 70 # Hardcoded depending on desired amount of padding and separators.
163
+ params, total_trainable_params, total_nontrainable_params = get_params_count(model, max_name_len)
164
+ param_counts_text = ''
165
+ param_counts_text += '=' * (max_name_len + padding) + '\n'
166
+ param_counts_text += f'| {"Module":<{max_name_len}} | {"Trainable":<10} | {"Shape":>15} | {"Param Count":>12} |\n'
167
+ param_counts_text += '-' * (max_name_len + padding) + '\n'
168
+ for name, param_count, shape, trainable in params:
169
+ param_counts_text += f'| {name:<{max_name_len}} | {"True" if trainable else "False":<10} | {shape:>15} | {param_count:>12,} |\n'
170
+ param_counts_text += '-' * (max_name_len + padding) + '\n'
171
+ param_counts_text += f'| {"Total trainable params":<{max_name_len}} | {"":<10} | {"":<15} | {total_trainable_params:>12,} |\n'
172
+ param_counts_text += f'| {"Total non-trainable params":<{max_name_len}} | {"":<10} | {"":<15} | {total_nontrainable_params:>12,} |\n'
173
+ param_counts_text += '=' * (max_name_len + padding) + '\n'
174
+ return param_counts_text
175
+
176
+
177
+ class Summary(Enum):
178
+ NONE = 0
179
+ AVERAGE = 1
180
+ SUM = 2
181
+ COUNT = 3
182
+
183
+
184
+ class ProgressMeter(object):
185
+ def __init__(self, num_batches, meters, prefix=""):
186
+ self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
187
+ self.meters = meters
188
+ self.prefix = prefix
189
+
190
+ def display(self, batch):
191
+ entries = [self.prefix + self.batch_fmtstr.format(batch)]
192
+ entries += [str(meter) for meter in self.meters]
193
+ print('\t'.join(entries))
194
+
195
+ def display_summary(self):
196
+ entries = [" *"]
197
+ entries += [meter.summary() for meter in self.meters]
198
+ print(' '.join(entries))
199
+
200
+ def _get_batch_fmtstr(self, num_batches):
201
+ num_digits = len(str(num_batches // 1))
202
+ fmt = '{:' + str(num_digits) + 'd}'
203
+ return '[' + fmt + '/' + fmt.format(num_batches) + ']'
204
+
205
+
206
+ class AverageMeter(object):
207
+ """Computes and stores the average and current value"""
208
+ def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
209
+ self.name = name
210
+ self.fmt = fmt
211
+ self.summary_type = summary_type
212
+ self.reset()
213
+
214
+ def reset(self):
215
+ self.val = 0
216
+ self.avg = 0
217
+ self.sum = 0
218
+ self.count = 0
219
+
220
+ def update(self, val, n=1):
221
+ self.val = val
222
+ self.sum += val * n
223
+ self.count += n
224
+ self.avg = self.sum / self.count
225
+
226
+ def all_reduce(self):
227
+ device = "cuda" if torch.cuda.is_available() else "cpu"
228
+ total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
229
+ dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
230
+ self.sum, self.count = total.tolist()
231
+ self.avg = self.sum / self.count
232
+
233
+ def __str__(self):
234
+ fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
235
+ return fmtstr.format(**self.__dict__)
236
+
237
+ def summary(self):
238
+ fmtstr = ''
239
+ if self.summary_type is Summary.NONE:
240
+ fmtstr = ''
241
+ elif self.summary_type is Summary.AVERAGE:
242
+ fmtstr = '{name} {avg:.3f}'
243
+ elif self.summary_type is Summary.SUM:
244
+ fmtstr = '{name} {sum:.3f}'
245
+ elif self.summary_type is Summary.COUNT:
246
+ fmtstr = '{name} {count:.3f}'
247
+ else:
248
+ raise ValueError('invalid summary type %r' % self.summary_type)
249
+
250
+ return fmtstr.format(**self.__dict__)
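To make the padding-masked accuracy and the meters above concrete, here is a toy sanity-check sketch; the tensors are random values chosen only to exercise the interface.

# Toy illustration of utils.accuracy and AverageMeter (values are random).
import torch
from fromage.utils import AverageMeter, accuracy

logits = torch.randn(2, 5, 10)          # (batch, seq_len, vocab)
labels = torch.randint(0, 10, (2, 5))   # (batch, seq_len)
labels[:, -1] = -100                    # padded positions are ignored via the mask
acc1, acc5 = accuracy(logits, labels, padding=-100, topk=(1, 5))

meter = AverageMeter('Acc@1', ':6.2f')
meter.update(acc1[0].item(), n=labels.size(0))
print(meter)                            # e.g. 'Acc@1  10.00 ( 10.00)'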
fromage_model/fromage_vis4/cc3m_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a20fa8168bd72e848ff088820b767383dded455a57ac5dd2d97d43e600402195
3
+ size 2979901225
fromage_model/fromage_vis4/model_args.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "opt_version": "facebook/opt-6.7b",
3
+ "freeze_lm": true,
4
+ "visual_encoder": "openai/clip-vit-large-patch14",
5
+ "freeze_vm": true,
6
+ "n_visual_tokens": 4,
7
+ "use_image_embed_norm": false,
8
+ "image_embed_dropout_prob": 0.0,
9
+ "use_text_embed_layernorm": false,
10
+ "text_embed_dropout_prob": 0.0,
11
+ "shared_emb_dim": 256,
12
+ "text_emb_layers": [
13
+ -1
14
+ ],
15
+ "retrieval_token_idx": 50266
16
+ }
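These keys are consumed verbatim by `load_fromage`, which wraps them in a namedtuple; the sketch below reproduces that step for this file, mirroring the logic in fromage/models.py.

# Sketch of how model_args.json is turned into an args object (mirrors load_fromage).
import json
from collections import namedtuple

with open('fromage_model/fromage_vis4/model_args.json', 'r') as f:
    model_kwargs = json.load(f)

args = namedtuple('args', model_kwargs)(**model_kwargs)
print(args.opt_version, args.n_visual_tokens)   # facebook/opt-6.7b 4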
fromage_model/model_args.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "opt_version": "facebook/opt-6.7b",
3
+ "task": "multitask",
4
+ "freeze_lm": true,
5
+ "visual_encoder": "openai/clip-vit-large-patch14",
6
+ "freeze_vm": true,
7
+ "pretrained_visual": true,
8
+ "use_pooler": true,
9
+ "n_visual_tokens": 1,
10
+ "image_embed_dropout_prob": 0.0,
11
+ "text_embed_dropout_prob": 0.0,
12
+ "shared_emb_dim": 256,
13
+ "text_emb_layers": [
14
+ -1
15
+ ],
16
+ "append_retrieval_token": true,
17
+ "num_appended_retrieval_tokens": 1,
18
+ "input_prompt": "A picture of",
19
+ "add_input_to_ret": true,
20
+ "tunable_prompt_length": 0,
21
+ "retrieval_token_idx": 50266
22
+ }
main.py ADDED
@@ -0,0 +1,642 @@
1
+ """Training example.
2
+
3
+ Modified from https://github.com/pytorch/examples/blob/main/imagenet/main.py.
4
+ """
5
+ import argparse
6
+ import json
7
+ import os
8
+ import sys
9
+ import time
10
+ import warnings
11
+
12
+ import numpy as np
13
+ from PIL import Image
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.parallel
17
+ import torch.backends.cudnn as cudnn
18
+ import torch.distributed as dist
19
+ import torch.optim
20
+ from torch.optim.lr_scheduler import StepLR
21
+ from warmup_scheduler import GradualWarmupScheduler
22
+ import torch.multiprocessing as mp
23
+ import torch.utils.data
24
+ import torch.utils.data.distributed
25
+ import torchvision.transforms as transforms
26
+ import torchvision.datasets as datasets
27
+ from torch.utils.tensorboard import SummaryWriter
28
+ import torchvision
29
+
30
+ from fromage import data
31
+ from fromage import losses as losses_utils
32
+ from fromage import models
33
+ from fromage import utils
34
+ from fromage import evaluate
35
+ from transformers import AutoTokenizer
36
+
37
+ # Disable HuggingFace tokenizer parallelism.
38
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
39
+
40
+ # Available LLM models.
41
+ llm_models = ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b',
42
+ 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-13b', 'facebook/opt-30b',
43
+ 'facebook/opt-66b']
44
+ datasets = ['cc3m']
45
+ best_score = 0 # Variable to keep track of best model so far.
46
+
47
+
48
+ def parse_args(args):
49
+ parser = argparse.ArgumentParser(description='FROMAGe training')
50
+ parser.add_argument('--opt-version', default='facebook/opt-6.7b',
51
+ choices=llm_models,
52
+ help='OPT versions: ' +
53
+ ' | '.join(llm_models) +
54
+ ' (default: "facebook/opt-6.7b")')
55
+ parser.add_argument('--visual-model', default='openai/clip-vit-large-patch14', type=str,
56
+ help="Visual encoder to use.")
57
+ parser.add_argument('-d', '--dataset', metavar='DATASET', help='Delimited list of datasets:' +
58
+ ' | '.join(datasets), default='cc3m',
59
+ type=lambda s: [x for x in s.split(',')])
60
+
61
+ parser.add_argument('--val-dataset', metavar='DATASET', default='cc3m',
62
+ type=lambda s: [x for x in s.split(',')],
63
+ help='Validation dataset: ' +
64
+ ' | '.join(datasets) +
65
+ ' (default: cc3m)')
66
+ parser.add_argument('--dataset_dir', default='datasets', type=str,
67
+ help='Dataset directory containing .tsv files.')
68
+ parser.add_argument('--image-dir', default='./data/', type=str,
69
+ help='Dataset directory containing image folders.')
70
+ parser.add_argument('--log-base-dir', default='./runs/', type=str,
71
+ help='Base directory to write logs and ckpts to.')
72
+ parser.add_argument('--exp_name', default='frozen', type=str,
73
+ help='Name of experiment, used for saving checkpoints.')
74
+
75
+ parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
76
+ help='number of data loading workers (default: 4)')
77
+ parser.add_argument('--epochs', default=10, type=int, metavar='N',
78
+ help='number of total epochs to run')
79
+ parser.add_argument('--steps-per-epoch', default=2000, type=int, metavar='N',
80
+ help='number of training steps per epoch')
81
+ parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
82
+ help='manual epoch number (useful on restarts)')
83
+ parser.add_argument('--val-steps-per-epoch', default=-1, type=int, metavar='N',
84
+ help='number of validation steps per epoch.')
85
+ parser.add_argument('-b', '--batch-size', default=180, type=int,
86
+ metavar='N',
87
+ help='mini-batch size (default: 180), this is the total '
88
+ 'batch size of all GPUs on the current node when '
89
+ 'using Data Parallel or Distributed Data Parallel')
90
+ parser.add_argument('--val-batch-size', default=None, type=int)
91
+ parser.add_argument('--lr', '--learning-rate', default=0.0003, type=float,
92
+ metavar='LR', help='initial learning rate', dest='lr')
93
+ parser.add_argument('--lr-warmup-steps', default=100, type=int,
94
+ metavar='N', help='Number of steps to warm up lr.')
95
+ parser.add_argument('--lr-schedule-step-size', default=10, type=int,
96
+ metavar='N', help='Number of steps before decaying lr.')
97
+ parser.add_argument('--lr-schedule-gamma', default=0.1, type=float,
98
+ metavar='N', help='Decay parameter for learning rate scheduler.')
99
+ parser.add_argument('--grad-accumulation-steps', default=1, type=int, metavar='N',
100
+ help='number of gradient accumulation steps')
101
+ parser.add_argument('--grad-clip', default=1.0, type=float, help='gradient clipping amount')
102
+
103
+ parser.add_argument('--precision', default='fp32', type=str, choices=['fp32', 'fp16', 'bf16'], help="Precision to train in.")
104
+ parser.add_argument('--cap-loss-scale', type=float, default=1.0, help="Scale on captioning loss.")
105
+ parser.add_argument('--ret-loss-scale', type=float, default=1.0, help="Scale on retrieval loss.")
106
+
107
+ parser.add_argument('--concat-captions-prob', type=float, default=0.5, help="Probability of concatenating two examples sequentially for captioning.")
108
+ parser.add_argument('--concat-for-ret', action='store_true', default=False, help="Whether to concatenate examples for retrieval mode.")
109
+ parser.add_argument('--input-prompt', default=None, type=str, help="Input prompt for the language model, if any.")
110
+
111
+ parser.add_argument('--image-size', default=224, type=int, metavar='N', help='Size of images.')
112
+ parser.add_argument('--use_image_embed_norm', action='store_true', default=False, help="Whether to use norm on the image embeddings to make them equal to language.")
113
+ parser.add_argument('--image_embed_dropout_prob', type=float, default=0.0, help="Dropout probability on the image embeddings.")
114
+ parser.add_argument('--use_text_embed_layernorm', action='store_true', default=False, help="Whether to use layer norm on the text embeddings for retrieval.")
115
+ parser.add_argument('--text_embed_dropout_prob', type=float, default=0.0, help="Dropout probability on the text embeddings.")
116
+ parser.add_argument('--shared-emb-dim', default=256, type=int, metavar='N', help='Embedding dimension for retrieval.')
117
+ parser.add_argument('--text-emb-layers', help='Layer to use for text embeddings. OPT-2.7b has 33 layers.', default='-1',
118
+ type=lambda s: [int(x) for x in s.split(',')])
119
+
120
+ parser.add_argument('--max-len', default=24, type=int,
121
+ metavar='N', help='Maximum length to truncate captions / generations to.')
122
+ parser.add_argument('--n-visual-tokens', default=1, type=int,
123
+ metavar='N', help='Number of visual tokens to use for the Frozen model.')
124
+
125
+ parser.add_argument('--beta1', default=0.9, type=float, metavar='M', help='beta1 for Adam')
126
+ parser.add_argument('--beta2', default=0.95, type=float, metavar='M', help='beta2 for Adam')
127
+ parser.add_argument('--wd', '--weight-decay', default=0.0, type=float,
128
+ metavar='W', help='weight decay (default: 0.0)', dest='weight_decay')
129
+ parser.add_argument('-p', '--print-freq', default=10, type=int,
130
+ metavar='N', help='print frequency (default: 10)')
131
+ parser.add_argument('--resume', default='', type=str, metavar='PATH',
132
+ help='path to latest checkpoint (default: none)')
133
+ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
134
+ help='evaluate model on validation set')
135
+ parser.add_argument('--world-size', default=-1, type=int,
136
+ help='number of nodes for distributed training')
137
+ parser.add_argument('--rank', default=-1, type=int,
138
+ help='node rank for distributed training')
139
+ parser.add_argument('--dist-url', default='tcp://127.0.0.1:1337', type=str,
140
+ help='url used to set up distributed training')
141
+ parser.add_argument('--dist-backend', default='nccl', type=str,
142
+ help='distributed backend')
143
+ parser.add_argument('--seed', default=None, type=int,
144
+ help='seed for initializing training. ')
145
+ parser.add_argument('--gpu', default=None, type=int,
146
+ help='GPU id to use.')
147
+ parser.add_argument('--multiprocessing-distributed', action='store_true',
148
+ help='Use multi-processing distributed training to launch '
149
+ 'N processes per node, which has N GPUs. This is the '
150
+ 'fastest way to use PyTorch for either single node or '
151
+ 'multi node data parallel training')
152
+ return parser.parse_args(args)
153
+
154
+
155
+ def main(args):
156
+ args = parse_args(args)
157
+ i = 1
158
+ args.log_dir = os.path.join(args.log_base_dir, args.exp_name)
159
+ while os.path.exists(args.log_dir):
160
+ args.log_dir = os.path.join(args.log_base_dir, f'{args.exp_name}_{i}')
161
+ i += 1
162
+ os.makedirs(args.log_dir)
163
+
164
+ with open(os.path.join(args.log_dir, f'args.json'), 'w') as wf:
165
+ json.dump(vars(args), wf, indent=4)
166
+
167
+ with open(os.path.join(args.log_dir, f'git_info.txt'), 'w') as wf:
168
+ utils.dump_git_status(out_file=wf)
169
+
170
+ print(f'Logging to {args.log_dir}.')
171
+
172
+ if args.seed is not None:
173
+ torch.manual_seed(args.seed)
174
+ cudnn.deterministic = True
175
+ warnings.warn('You have chosen to seed training. '
176
+ 'This will turn on the CUDNN deterministic setting, '
177
+ 'which can slow down your training considerably! '
178
+ 'You may see unexpected behavior when restarting '
179
+ 'from checkpoints.')
180
+
181
+ if args.gpu is not None:
182
+ warnings.warn('You have chosen a specific GPU. This will completely '
183
+ 'disable data parallelism.')
184
+
185
+ if args.dist_url == "env://" and args.world_size == -1:
186
+ args.world_size = int(os.environ["WORLD_SIZE"])
187
+
188
+ args.distributed = args.world_size > 1 or args.multiprocessing_distributed
189
+
190
+ ngpus_per_node = torch.cuda.device_count()
191
+ if args.multiprocessing_distributed:
192
+ # Since we have ngpus_per_node processes per node, the total world_size
193
+ # needs to be adjusted accordingly
194
+ args.world_size = ngpus_per_node * args.world_size
195
+ # Use torch.multiprocessing.spawn to launch distributed processes: the
196
+ # main_worker process function
197
+ mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
198
+ else:
199
+ # Simply call main_worker function
200
+ main_worker(args.gpu, ngpus_per_node, args)
201
+
202
+
203
+ def main_worker(gpu, ngpus_per_node, args):
204
+ """Setup code."""
205
+ global best_score
206
+ args.gpu = gpu
207
+
208
+ if args.gpu is not None:
209
+ print("Use GPU: {} for training".format(args.gpu))
210
+
211
+ if args.distributed:
212
+ if args.dist_url == "env://" and args.rank == -1:
213
+ args.rank = int(os.environ["RANK"])
214
+ if args.multiprocessing_distributed:
215
+ # For multiprocessing distributed training, rank needs to be the
216
+ # global rank among all the processes
217
+ args.rank = args.rank * ngpus_per_node + gpu
218
+ dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
219
+ world_size=args.world_size, rank=args.rank)
220
+
221
+ # Create model
222
+ model_args = models.FrozenArgs()
223
+ model_args.opt_version = args.opt_version
224
+ model_args.freeze_lm = True
225
+ model_args.visual_encoder = args.visual_model
226
+ model_args.freeze_vm = True
227
+ model_args.n_visual_tokens = args.n_visual_tokens
228
+ model_args.use_image_embed_norm = args.use_image_embed_norm
229
+ model_args.image_embed_dropout_prob = args.image_embed_dropout_prob
230
+ model_args.use_text_embed_layernorm = args.use_text_embed_layernorm
231
+ model_args.text_embed_dropout_prob = args.text_embed_dropout_prob
232
+ model_args.shared_emb_dim = args.shared_emb_dim
233
+ model_args.text_emb_layers = args.text_emb_layers
234
+
235
+ tokenizer = AutoTokenizer.from_pretrained(args.opt_version, use_fast=False)
236
+ # Add an image token for loss masking (and visualization) purposes.
237
+ tokenizer.add_special_tokens({"cls_token": "<|image|>"}) # add special image token to tokenizer
238
+ print('Adding [RET] token to vocabulary.')
239
+ print('Before adding new token, tokenizer("[RET]") =', tokenizer('[RET]', add_special_tokens=False))
240
+ num_added_tokens = tokenizer.add_tokens('[RET]')
241
+ print(f'After adding {num_added_tokens} new tokens, tokenizer("[RET]") =', tokenizer('[RET]', add_special_tokens=False))
242
+ ret_token_idx = tokenizer('[RET]', add_special_tokens=False).input_ids
243
+ assert len(ret_token_idx) == 1, ret_token_idx
244
+ model_args.retrieval_token_idx = ret_token_idx[0]
245
+ args.retrieval_token_idx = ret_token_idx[0]
246
+
247
+ # Save model args to disk.
248
+ with open(os.path.join(args.log_dir, 'model_args.json'), 'w') as f:
249
+ json.dump(vars(model_args), f, indent=4)
250
+
251
+ model = models.Fromage(tokenizer, model_args)
252
+ if args.precision == 'fp16':
253
+ model = model.float()
254
+ elif args.precision == 'bf16':
255
+ model = model.bfloat16()
256
+
257
+ # Print parameters and count of model.
258
+ param_counts_text = utils.get_params_count_str(model)
259
+ with open(os.path.join(args.log_dir, 'param_count.txt'), 'w') as f:
260
+ f.write(param_counts_text)
261
+
262
+ # Log trainable parameters to Tensorboard.
263
+ _, total_trainable_params, total_nontrainable_params = utils.get_params_count(model)
264
+ writer = SummaryWriter(args.log_dir)
265
+ writer.add_scalar('params/total', total_trainable_params + total_nontrainable_params, 0)
266
+ writer.add_scalar('params/total_trainable', total_trainable_params, 0)
267
+ writer.add_scalar('params/total_non_trainable', total_nontrainable_params, 0)
268
+ writer.close()
269
+
270
+ if not torch.cuda.is_available():
271
+ print('WARNING: using CPU, this will be slow!')
272
+ model = torch.nn.DataParallel(model)
273
+ elif args.distributed:
274
+ # For multiprocessing distributed, DistributedDataParallel constructor
275
+ # should always set the single device scope, otherwise,
276
+ # DistributedDataParallel will use all available devices.
277
+ if args.gpu is not None:
278
+ torch.cuda.set_device(args.gpu)
279
+ model.cuda(args.gpu)
280
+ # When using a single GPU per process and per
281
+ # DistributedDataParallel, we need to divide the batch size
282
+ # ourselves based on the total number of GPUs of the current node.
283
+ args.batch_size = int(args.batch_size / ngpus_per_node)
284
+ args.val_batch_size = int((args.val_batch_size or args.batch_size) / ngpus_per_node)
285
+ args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
286
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=False)
287
+ else:
288
+ model.cuda()
289
+ # DistributedDataParallel will divide and allocate batch_size to all
290
+ # available GPUs if device_ids are not set
291
+ model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=False)
292
+ elif args.gpu is not None:
293
+ torch.cuda.set_device(args.gpu)
294
+ model = model.cuda(args.gpu)
295
+ else:
296
+ model = torch.nn.DataParallel(model).cuda()
297
+
298
+ # define loss function (criterion), optimizer, and learning rate scheduler
299
+ criterion = nn.CrossEntropyLoss().cuda(args.gpu)
300
+ optimizer_cls = torch.optim.AdamW
301
+ print('Using torch.optim.AdamW as the optimizer.')
302
+ optimizer = optimizer_cls(model.parameters(), args.lr,
303
+ betas=(args.beta1, args.beta2),
304
+ weight_decay=args.weight_decay,
305
+ eps=1e-8)
306
+
307
+ # Decay the learning rate by lr_schedule_gamma every lr_schedule_step_size epochs (in steps), after a linear warmup over lr_warmup_steps steps.
308
+ scheduler_steplr = StepLR(optimizer, step_size=args.lr_schedule_step_size * args.steps_per_epoch, gamma=args.lr_schedule_gamma)
309
+ scheduler = GradualWarmupScheduler(optimizer, multiplier=1.0, total_epoch=args.lr_warmup_steps, after_scheduler=scheduler_steplr)
310
+
311
+ # optionally resume from a checkpoint
312
+ if args.resume:
313
+ if os.path.isfile(args.resume):
314
+ print("=> loading checkpoint '{}'".format(args.resume))
315
+ if args.gpu is None:
316
+ checkpoint = torch.load(args.resume)
317
+ else:
318
+ # Map model to be loaded to specified single gpu.
319
+ loc = 'cuda:{}'.format(args.gpu)
320
+ checkpoint = torch.load(args.resume, map_location=loc)
321
+ args.start_epoch = checkpoint['epoch']
322
+ best_score = checkpoint['best_score']
323
+ if args.gpu is not None:
324
+ # best_score may be from a checkpoint from a different GPU
325
+ best_score = best_score.to(args.gpu)
326
+ model.load_state_dict(checkpoint['state_dict'])
327
+ optimizer.load_state_dict(checkpoint['optimizer'])
328
+ scheduler.load_state_dict(checkpoint['scheduler'])
329
+ print("=> loaded checkpoint '{}' (epoch {})"
330
+ .format(args.resume, checkpoint['epoch']))
331
+ else:
332
+ print("=> no checkpoint found at '{}'".format(args.resume))
333
+
334
+ cudnn.benchmark = True
335
+
336
+ # Data loading code
337
+ train_dataset = data.get_dataset(args, 'train', tokenizer)
338
+ val_dataset = data.get_dataset(args, 'val', tokenizer)
339
+ print(f'Training with {len(train_dataset)} examples and validating with {len(val_dataset)} examples.')
340
+
341
+ if args.distributed:
342
+ train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, drop_last=True)
343
+ val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True)
344
+ else:
345
+ train_sampler = None
346
+ val_sampler = None
347
+
348
+ train_loader = torch.utils.data.DataLoader(
349
+ train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
350
+ num_workers=args.workers, pin_memory=True, sampler=train_sampler)
351
+ val_loader = torch.utils.data.DataLoader(
352
+ val_dataset, batch_size=(args.val_batch_size or args.batch_size), shuffle=False,
353
+ num_workers=args.workers, pin_memory=True, sampler=val_sampler)
354
+
355
+ if args.evaluate:
356
+ evaluate.validate(val_loader, model, tokenizer, criterion, args.start_epoch, args)
357
+ return
358
+
359
+ for epoch in range(args.start_epoch, args.epochs):
360
+ if epoch == 0:
361
+ evaluate.validate(val_loader, model, tokenizer, criterion, epoch-1, args)
362
+ if args.distributed:
363
+ train_sampler.set_epoch(epoch)
364
+
365
+ # train for one epoch
366
+ train(train_loader, model, tokenizer, criterion, optimizer, epoch, scheduler, args)
367
+
368
+ # evaluate on validation set
369
+ eval_score = evaluate.validate(val_loader, model, tokenizer, criterion, epoch, args)
370
+
371
+ # remember best score and save checkpoint
372
+ is_best = eval_score > best_score
373
+ best_score = max(eval_score, best_score)
374
+
375
+ if not args.multiprocessing_distributed or (args.multiprocessing_distributed
376
+ and args.rank % ngpus_per_node == 0):
377
+ utils.save_checkpoint({
378
+ 'epoch': epoch + 1,
379
+ 'state_dict': model.state_dict(),
380
+ 'best_score': best_score,
381
+ 'optimizer' : optimizer.state_dict(),
382
+ 'scheduler' : scheduler.state_dict()
383
+ }, is_best, os.path.join(args.log_dir, 'ckpt'))
384
+
385
+
386
+ def train(train_loader, model, tokenizer, criterion, optimizer, epoch, scheduler, args):
387
+ """Main training loop."""
388
+ ngpus_per_node = torch.cuda.device_count()
389
+ batch_time = utils.AverageMeter('Time', ':6.3f')
390
+ cap_time = utils.AverageMeter('CaptioningTime', ':6.3f')
391
+ ret_time = utils.AverageMeter('RetrievalTime', ':6.3f')
392
+ data_time = utils.AverageMeter('Data', ':6.3f')
393
+ losses = utils.AverageMeter('Loss', ':.4e')
394
+ ce_losses = utils.AverageMeter('CeLoss', ':.4e')
395
+ top1 = utils.AverageMeter('Acc@1', ':6.2f')
396
+ top5 = utils.AverageMeter('Acc@5', ':6.2f')
397
+ cont_losses = utils.AverageMeter('ContLoss', ':.4e')
398
+ top1_caption = utils.AverageMeter('AccCaption@1', ':6.2f')
399
+ top5_caption = utils.AverageMeter('AccCaption@5', ':6.2f')
400
+ top1_image = utils.AverageMeter('AccImage@1', ':6.2f')
401
+ top5_image = utils.AverageMeter('AccImage@5', ':6.2f')
402
+
403
+ writer = SummaryWriter(args.log_dir)
404
+
405
+ progress = utils.ProgressMeter(
406
+ args.steps_per_epoch,
407
+ [batch_time, losses, ce_losses, cont_losses, top1, top5],
408
+ prefix="Epoch: [{}]".format(epoch))
409
+
410
+ # switch to train mode
411
+ model.train()
412
+
413
+ end = time.time()
414
+
415
+ for i, (image_paths, images, caption_images, tgt_tokens, token_len) in enumerate(train_loader):
416
+ actual_step = epoch * args.steps_per_epoch + i + 1
417
+ # measure data loading time
418
+ data_time.update(time.time() - end)
419
+
420
+ if torch.cuda.is_available():
421
+ images = images.cuda(args.gpu, non_blocking=True)
422
+ tgt_tokens = tgt_tokens.cuda(args.gpu, non_blocking=True)
423
+ token_len = token_len.cuda(args.gpu, non_blocking=True)
424
+
425
+ if args.precision == 'fp16':
426
+ images = images.half()
427
+ elif args.precision == 'bf16':
428
+ images = images.bfloat16()
429
+
430
+ model_modes = ['captioning', 'retrieval']
431
+ loss = 0
432
+
433
+ for model_mode in model_modes:
434
+ mode_start = time.time()
435
+ # compute output
436
+ concat_captions = np.random.uniform(0, 1) < args.concat_captions_prob
437
+ if not args.concat_for_ret:
438
+ concat_captions = concat_captions and model_mode == 'captioning'
439
+
440
+ (model_output, full_labels, last_embedding, _, visual_embs) = model(
441
+ images, tgt_tokens, token_len, mode=model_mode, concat_captions=concat_captions, inference=False)
442
+ output = model_output.logits
443
+
444
+ # Measure captioning accuracy for multi-task models and next-token prediction for retrieval models.
445
+ if model_mode == 'captioning':
446
+ acc1, acc5 = utils.accuracy(output[:, :-1, :], full_labels[:, 1:], -100, topk=(1, 5))
447
+ top1.update(acc1[0], images.size(0))
448
+ top5.update(acc5[0], images.size(0))
449
+
450
+ ce_loss = model_output.loss
451
+ if model_mode == 'captioning':
452
+ ce_loss = ce_loss * args.cap_loss_scale
453
+ elif model_mode == 'retrieval':
454
+ ce_loss = ce_loss * args.ret_loss_scale
455
+ else:
456
+ raise NotImplementedError
457
+
458
+ loss += ce_loss
459
+ ce_losses.update(ce_loss.item(), images.size(0))
460
+
461
+ if model_mode == 'retrieval':
462
+ # Cross replica concat for embeddings.
463
+ if args.distributed:
464
+ all_visual_embs = [torch.zeros_like(visual_embs) for _ in range(dist.get_world_size())]
465
+ all_last_embedding = [torch.zeros_like(last_embedding) for _ in range(dist.get_world_size())]
466
+ dist.all_gather(all_visual_embs, visual_embs)
467
+ dist.all_gather(all_last_embedding, last_embedding)
468
+ # Overwrite with embeddings produced on this replica, which have the gradient.
469
+ all_visual_embs[dist.get_rank()] = visual_embs
470
+ all_last_embedding[dist.get_rank()] = last_embedding
471
+ visual_embs = torch.cat(all_visual_embs)
472
+ last_embedding = torch.cat(all_last_embedding)
473
+
474
+ start_idx = args.rank * images.shape[0]
475
+ end_idx = start_idx + images.shape[0]
476
+
477
+ logits_per_image = visual_embs @ last_embedding.t()
478
+ logits_per_text = logits_per_image.t()
479
+ if i == 0:
480
+ print(f'Running contrastive loss over logits_per_text.shape = {logits_per_text.shape} and logits_per_image.shape = {logits_per_image.shape}')
481
+
482
+ # Compute contrastive losses for retrieval.
483
+ caption_loss = losses_utils.contrastive_loss(logits_per_text)
484
+ image_loss = losses_utils.contrastive_loss(logits_per_image)
485
+ caption_acc1, caption_acc5 = losses_utils.contrastive_acc(logits_per_text, topk=(1, 5))
486
+ image_acc1, image_acc5 = losses_utils.contrastive_acc(logits_per_image, topk=(1, 5))
487
+ loss += args.ret_loss_scale * (caption_loss + image_loss) / 2.0
488
+ cont_losses.update(loss.item(), images.size(0))
489
+
490
+ # measure accuracy and record loss
491
+ top1_caption.update(caption_acc1[0], images.size(0))
492
+ top5_caption.update(caption_acc5[0], images.size(0))
493
+ top1_image.update(image_acc1[0], images.size(0))
494
+ top5_image.update(image_acc5[0], images.size(0))
495
+
496
+ if model_mode == 'retrieval':
497
+ ret_time.update(time.time() - mode_start)
498
+ elif model_mode == 'captioning':
499
+ cap_time.update(time.time() - mode_start)
500
+
501
+ loss = loss / args.grad_accumulation_steps
502
+ losses.update(loss.item(), images.size(0))
503
+ loss.backward()
504
+
505
+ # Update weights
506
+ if ((i + 1) % args.grad_accumulation_steps == 0) or (i == args.steps_per_epoch - 1):
507
+ # Zero out gradients of the embedding matrix outside of [RET].
508
+ for param in model.module.model.input_embeddings.parameters():
509
+ assert param.grad.shape[0] == len(tokenizer)
510
+ # Keep other embeddings frozen.
511
+ mask = torch.arange(param.grad.shape[0]) != args.retrieval_token_idx
512
+ param.grad[mask, :] = 0
513
+
514
+ # compute gradient and do SGD step
515
+ if args.grad_clip > 0:
516
+ nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
517
+ optimizer.step()
518
+ optimizer.zero_grad()
519
+
520
+ with torch.no_grad():
521
+ # Normalize trainable embeddings.
522
+ frozen_norm = torch.norm(model.module.model.input_embeddings.weight[:-1, :], dim=1).mean(0)
523
+ trainable_weight = model.module.model.input_embeddings.weight[-1, :]
524
+ model.module.model.input_embeddings.weight[-1, :].div_(torch.norm(trainable_weight) / frozen_norm)
525
+
526
+ # measure elapsed time
527
+ batch_time.update(time.time() - end)
528
+ end = time.time()
529
+
530
+ if actual_step == 1 or (i + 1) % args.print_freq == 0:
531
+ ex_per_sec = args.batch_size / batch_time.avg
532
+ if args.distributed:
533
+ batch_time.all_reduce()
534
+ data_time.all_reduce()
535
+ ex_per_sec = (args.batch_size / batch_time.avg) * ngpus_per_node
536
+
537
+ losses.all_reduce()
538
+ ce_losses.all_reduce()
539
+ top1.all_reduce()
540
+ top5.all_reduce()
541
+ ret_time.all_reduce()
542
+ cont_losses.all_reduce()
543
+ top1_caption.all_reduce()
544
+ top5_caption.all_reduce()
545
+ top1_image.all_reduce()
546
+ top5_image.all_reduce()
547
+ cap_time.all_reduce()
548
+
549
+ progress.display(i + 1)
550
+
551
+ writer.add_scalar('train/loss', losses.avg, actual_step)
552
+ writer.add_scalar('train/ce_loss', ce_losses.avg, actual_step)
553
+ writer.add_scalar('train/seq_top1_acc', top1.avg, actual_step)
554
+ writer.add_scalar('train/seq_top5_acc', top5.avg, actual_step)
555
+ writer.add_scalar('train/contrastive_loss', cont_losses.avg, actual_step)
556
+ writer.add_scalar('train/t2i_top1_acc', top1_caption.avg, actual_step)
557
+ writer.add_scalar('train/t2i_top5_acc', top5_caption.avg, actual_step)
558
+ writer.add_scalar('train/i2t_top1_acc', top1_image.avg, actual_step)
559
+ writer.add_scalar('train/i2t_top5_acc', top5_image.avg, actual_step)
560
+ writer.add_scalar('metrics/total_secs_per_batch', batch_time.avg, actual_step)
561
+ writer.add_scalar('metrics/total_secs_captioning', cap_time.avg, actual_step)
562
+ writer.add_scalar('metrics/total_secs_retrieval', ret_time.avg, actual_step)
563
+ writer.add_scalar('metrics/data_secs_per_batch', data_time.avg, actual_step)
564
+ writer.add_scalar('metrics/examples_per_sec', ex_per_sec, actual_step)
565
+
566
+ if not args.multiprocessing_distributed or (args.multiprocessing_distributed
567
+ and args.rank % ngpus_per_node == 0):
568
+ image_bs = images.shape[0]
569
+ normalized_images = images - images.min()
570
+ normalized_images /= normalized_images.max() # (N, 3, H, W)
571
+ max_images_to_show = 16
572
+
573
+ # Append caption text.
574
+ pred_tokens = output[:, args.n_visual_tokens-1:-1, :].argmax(dim=-1)
575
+ generated_captions = tokenizer.batch_decode(pred_tokens, skip_special_tokens=False)
576
+
577
+ # Log image (and generated caption) outputs to Tensorboard.
578
+ if model_mode == 'captioning':
579
+ # Create generated caption text.
580
+ generated_cap_images = torch.stack([
581
+ utils.create_image_of_text(
582
+ generated_captions[i].encode('ascii', 'ignore'),
583
+ width=normalized_images.shape[3],
584
+ color=(255, 255, 0))
585
+ for i in range(len(generated_captions))], axis=0)
586
+
587
+ # Duplicate captions if we concatenated them.
588
+ if (args.concat_captions_prob > 0 and model_mode == 'captioning' and generated_cap_images.shape[0] != caption_images.shape[0]):
589
+ generated_cap_images = torch.cat([generated_cap_images, generated_cap_images], axis=0)
590
+
591
+ display_images = torch.cat([normalized_images.float().cpu(), caption_images, generated_cap_images], axis=2)[:max_images_to_show]
592
+ grid = torchvision.utils.make_grid(display_images, nrow=int(max_images_to_show ** 0.5), padding=4)
593
+ writer.add_image('train/images_gen_cap', grid, actual_step)
594
+
595
+ # Retrieved images (from text).
596
+ retrieved_image_idx = logits_per_text[:image_bs, :image_bs].argmax(-1)
597
+ t2i_images = torch.stack(
598
+ [normalized_images[retrieved_image_idx[i], ...] for i in range(len(retrieved_image_idx))],
599
+ axis=0)
600
+ t2i_images = torch.cat([t2i_images.float().cpu(), caption_images], axis=2)[:max_images_to_show]
601
+ t2i_grid = torchvision.utils.make_grid(t2i_images, nrow=int(max_images_to_show ** 0.5), padding=4)
602
+ writer.add_image('train/t2i_ret', t2i_grid, actual_step)
603
+
604
+ # Retrieved text (from image).
605
+ retrieved_text_idx = logits_per_image[:image_bs, :image_bs].argmax(-1)
606
+ retrieved_text = torch.stack(
607
+ [caption_images[retrieved_text_idx[i], ...] for i in range(len(retrieved_text_idx))],
608
+ axis=0)
609
+ i2t_images = torch.cat([normalized_images.float().cpu(), retrieved_text], axis=2)[:max_images_to_show]
610
+ i2t_grid = torchvision.utils.make_grid(i2t_images, nrow=int(max_images_to_show ** 0.5), padding=4)
611
+ writer.add_image('train/i2t_ret', i2t_grid, actual_step)
612
+
613
+ batch_time.reset()
614
+ cap_time.reset()
615
+ ret_time.reset()
616
+ data_time.reset()
617
+ losses.reset()
618
+ ce_losses.reset()
619
+ top1.reset()
620
+ top5.reset()
621
+ cont_losses.reset()
622
+ top1_caption.reset()
623
+ top5_caption.reset()
624
+ top1_image.reset()
625
+ top5_image.reset()
626
+
627
+ if i == args.steps_per_epoch - 1:
628
+ break
629
+
630
+ scheduler.step()
631
+ curr_lr = scheduler.get_last_lr()
632
+ if (actual_step == 1) or (i + 1) % args.print_freq == 0:
633
+ # Write current learning rate to Tensorboard.
634
+ writer = SummaryWriter(args.log_dir)
635
+ writer.add_scalar('train/lr', curr_lr[0], actual_step)
636
+ writer.close()
637
+
638
+ writer.close()
639
+
640
+
641
+ if __name__ == '__main__':
642
+ main(sys.argv[1:])
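Because `main()` simply receives `sys.argv[1:]`, a training run can be launched programmatically as well as from the shell. The sketch below is a hypothetical smoke-test configuration; the flag values are illustrative rather than recommended settings, and they assume the CC3M .tsv files and images are already in place.

# Hypothetical single-node smoke test (flag values are illustrative only).
import main

main.main([
    '--opt-version', 'facebook/opt-125m',               # small OPT variant for a quick test
    '--visual-model', 'openai/clip-vit-large-patch14',
    '--dataset', 'cc3m', '--val-dataset', 'cc3m',
    '--dataset_dir', 'datasets', '--image-dir', './data/',
    '--exp_name', 'fromage_debug',
    '--batch-size', '8', '--val-batch-size', '8',
    '--epochs', '1', '--steps-per-epoch', '10',
    '--precision', 'bf16',
])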
requirements.txt ADDED
@@ -0,0 +1,35 @@
1
+ attrs==22.2.0
2
+ certifi==2022.12.7
3
+ charset-normalizer
4
+ contourpy==1.0.7
5
+ cycler==0.11.0
6
+ einops==0.4.1
7
+ exceptiongroup==1.1.0
8
+ filelock==3.9.0
9
+ fonttools==4.38.0
10
+ huggingface-hub==0.12.0
11
+ idna==3.4
12
+ iniconfig==2.0.0
13
+ kiwisolver==1.4.4
14
+ matplotlib
15
+ numpy
16
+ packaging==23.0
17
+ Pillow==9.4.0
18
+ pluggy==1.0.0
19
+ pyparsing==3.0.9
20
+ pytest==7.2.1
21
+ python-dateutil==2.8.2
22
+ PyYAML
23
+ regex
24
+ requests
25
+ six==1.16.0
26
+ tokenizers==0.12.1
27
+ tomli==2.0.1
28
+ torchaudio==0.11.0
29
+ torchmetrics==0.9.3
30
+ torchvision==0.12.0
31
+ tqdm
32
+ transformers==4.21.3
33
+ typing_extensions==4.4.0
34
+ urllib3==1.26.14
35
+ warmup-scheduler