import flax import jax.numpy as jnp class Image(flax.linen.Module): out_units: int = 1024 @flax.linen.compact def __call__(self, x, training=False): x = flax.linen.Dropout(0.1)(x, deterministic=not training) return x class Text(flax.linen.Module): out_units: int = 1024 @flax.linen.compact def __call__(self, x, training=False): x = flax.linen.Dense(features=self.out_units)(x) res = flax.linen.silu(x) res = flax.linen.Dense(features=self.out_units)(res) res = flax.linen.Dropout(0.1)(res, deterministic=not training) x = x + res return x class CLIP(flax.linen.Module): out_units: int = 1024 def setup(self): self.image_enc = Image(self.out_units) self.text_enc = Text(self.out_units) self.logit_scale = self.variable( "params", "logit_scale", lambda x: jnp.log(10) * jnp.ones((1,)), None, ).value @flax.linen.compact def __call__(self, image, text, training=False): image_emb = self.image_enc(image, training=training) text_emb = self.text_enc(text, training=training) # Normalize image_emb = image_emb / jnp.linalg.norm(image_emb, axis=-1, keepdims=True) text_emb = text_emb / jnp.linalg.norm(text_emb, axis=-1, keepdims=True) image_sim = jnp.exp(self.logit_scale) * image_emb @ text_emb.T text_sim = jnp.exp(self.logit_scale) * text_emb @ image_emb.T return image_sim, text_sim def encode_text(self, text): text_emb = self.text_enc(text, training=False) return text_emb