|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Tests for vision-text-transformer."""
|
|
from absl.testing import absltest
|
|
|
|
from big_vision.models.proj.uvim import vtt
|
|
import jax
|
|
import jax.numpy as jnp
|
|
import ml_collections
|
|
|
|
|
|
class VTTTest(absltest.TestCase):
|
|
|
|
def test_vtt_with_1_step(self):
|
|
model_config = ml_collections.ConfigDict(dict(
|
|
input_size=(224, 224),
|
|
patches={"size": (16, 16)},
|
|
num_heads=2,
|
|
num_layers=2,
|
|
mlp_dim=128,
|
|
emb_dim=64,
|
|
vocab_size=500))
|
|
batch_size, max_len = 8, 50
|
|
image = jnp.ones((batch_size, 224, 224, 3))
|
|
text = jnp.ones((batch_size, max_len), dtype=jnp.int32)
|
|
|
|
m = vtt.Model(**model_config)
|
|
variables = m.init(jax.random.PRNGKey(42), image, text)
|
|
self.assertCountEqual(variables.keys(), ["params"])
|
|
|
|
params = variables["params"]
|
|
out = m.apply({"params": params}, image, text)
|
|
expected_shape = (batch_size, max_len, model_config.vocab_size)
|
|
self.assertEqual(out.shape, expected_shape)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
absltest.main()
|
|
|