File size: 3,436 Bytes
fa1a600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Copyright 2022 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model definition to train a single ViT model with the contrastive trainer."""

import importlib
from typing import Optional, Any

from big_vision import utils
import flax.linen as nn
import jax.numpy as jnp

ConfigDict = Any


class Model(nn.Module):
  """Single ViT to encode regular images and text images."""
  image: Optional[ConfigDict] = None
  image_model: str = "vit"
  out_dim: int = 768
  temperature_init: float = 10.0

  @nn.compact
  def __call__(self, image, text=None, **kw):
    """Returns (B, C) image and (B, C) text representations, and some extras."""
    ztxt, zimg = None, None
    kw = kw or {}

    image_model = importlib.import_module(
        f"big_vision.models.{self.image_model}"
    ).Model(**{"num_classes": self.out_dim, **(self.image or {})}, name="img")  # pylint: disable=not-a-mapping

    def _compute_embedding(input_image, prefix):
      zemb, out_emb = image_model(input_image, **kw)
      out = {f"{prefix}/{k}": v for k, v in out_emb.items()}

      # Normalize the embeddings.
      out[f"{prefix}/norm"] = jnp.linalg.norm(zemb, axis=1, keepdims=True)
      out[f"{prefix}/normalized"] = zemb = zemb / (out[f"{prefix}/norm"] + 1e-8)
      return zemb, out

    out = {}
    if image is not None:
      zimg, out_img = _compute_embedding(image, "img")
      out.update(out_img)

    if text is not None:
      ztxt, out_txt = _compute_embedding(text, "txt")
      out.update(out_txt)

    temp_init = jnp.log(self.temperature_init)
    t = self.param("t",
                   lambda key, shape, dtype: temp_init*jnp.ones(shape, dtype),
                   (1,), jnp.float32)
    out["t"] = jnp.exp(t)
    out["t/parameter"] = t

    return zimg, ztxt, out


def load(init_params, init_files, model_cfg, img_load_kw={}):  # pylint: disable=dangerous-default-value
  """Loads the ViT parameters - adapted from proj/image_text/two_towers.py."""
  if isinstance(init_files, str):
    # A shortcut for a single file checkpoint of a two_towers model.
    init_files = {k: f"{init_files}:{k}" for k in ("img", "t")}
  else:
    init_files = {**init_files}  # Shallow copy because we'll pop stuff off.

  restored_params = {**init_params}

  img_init = init_files.pop("image", init_files.pop("img", None))
  if img_init:
    restored_params["img"] = importlib.import_module(
        f"big_vision.models.{model_cfg.image_model}"
    ).load(init_params["img"], img_init, model_cfg.image, **img_load_kw)

  t_init = init_files.pop("temperature", init_files.pop("t", None))
  if t_init:
    restored_params["t"] = utils.load_params(None, t_init)

  assert not init_files, (
      f"There's something unused left in `config.model_init`. You probably got "
      f"a typo. Here it is: {init_files}")

  return restored_params