MykolaL committed
Commit fb91e14
1 Parent(s): 9661d01

Upload EVPRefer

Files changed (5)
  1. config.json +12 -0
  2. evpconfig.py +10 -0
  3. model.py +320 -0
  4. model.safetensors +3 -0
  5. models.py +349 -0
config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "architectures": [
+     "EVPRefer"
+   ],
+   "auto_map": {
+     "AutoConfig": "evpconfig.EVPConfig",
+     "AutoModel": "model.EVPRefer"
+   },
+   "model_type": "EVP",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2"
+ }
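
The auto_map block above lets transformers resolve the custom classes shipped in this commit when the repository is loaded with trust_remote_code=True. A minimal loading sketch (not part of the commit), assuming a placeholder repo id and that the extra dependencies imported by model.py (ldm, lib.mask_predictor, ./v1-inference.yaml) are importable on the local machine:

from transformers import AutoConfig, AutoModel

# "MykolaL/EVP" is a hypothetical repo id; substitute the actual repository name.
config = AutoConfig.from_pretrained("MykolaL/EVP", trust_remote_code=True)  # resolves evpconfig.EVPConfig
model = AutoModel.from_pretrained("MykolaL/EVP", trust_remote_code=True)    # resolves model.EVPRefer
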
evpconfig.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import PretrainedConfig
+
+
+ class EVPConfig(PretrainedConfig):
+     model_type = "EVP"
+
+     def __init__(
+         self,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
model.py ADDED
@@ -0,0 +1,320 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import os
+ import sys
+ from ldm.util import instantiate_from_config
+ from transformers.models.clip.modeling_clip import CLIPTextModel
+ from omegaconf import OmegaConf
+ from lib.mask_predictor import SimpleDecoding
+ from transformers import PreTrainedModel
+ from .models import UNetWrapper, TextAdapterRefer
+ from evpconfig import EVPConfig
+ from transformers import CLIPTokenizer
+ import torchvision.transforms as transforms
+
+
+ def icnr(x, scale=2, init=nn.init.kaiming_normal_):
+     """
+     Checkerboard artifact free sub-pixel convolution
+     https://arxiv.org/abs/1707.02937
+     """
+     ni, nf, h, w = x.shape
+     ni2 = int(ni / (scale**2))
+     k = init(torch.zeros([ni2, nf, h, w])).transpose(0, 1)
+     k = k.contiguous().view(ni2, nf, -1)
+     k = k.repeat(1, 1, scale**2)
+     k = k.contiguous().view([nf, ni, h, w]).transpose(0, 1)
+     x.data.copy_(k)
+
+
+ class PixelShuffle(nn.Module):
+     """
+     Real-Time Single Image and Video Super-Resolution
+     https://arxiv.org/abs/1609.05158
+     """
+     def __init__(self, n_channels, scale):
+         super(PixelShuffle, self).__init__()
+         self.conv = nn.Conv2d(n_channels, n_channels * (scale**2), kernel_size=1)
+         icnr(self.conv.weight)
+         self.shuf = nn.PixelShuffle(scale)
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         x = self.shuf(self.relu(self.conv(x)))
+         return x
+
+
+ class AttentionModule(nn.Module):
+     def __init__(self, in_channels, out_channels):
+         super(AttentionModule, self).__init__()
+
+         # Convolutional Layers
+         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+         # Group Normalization
+         self.group_norm = nn.GroupNorm(20, out_channels)
+
+         # ReLU Activation
+         self.relu = nn.ReLU()
+
+         # Spatial Attention
+         self.spatial_attention = nn.Sequential(
+             nn.Conv2d(in_channels, 1, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x):
+         # Apply spatial attention
+         spatial_attention = self.spatial_attention(x)
+         x = x * spatial_attention
+
+         # Apply convolutional layer
+         x = self.conv1(x)
+         x = self.group_norm(x)
+         x = self.relu(x)
+
+         return x
+
+
+ class AttentionDownsamplingModule(nn.Module):
+     def __init__(self, in_channels, out_channels, scale_factor=2):
+         super(AttentionDownsamplingModule, self).__init__()
+
+         # Spatial Attention
+         self.spatial_attention = nn.Sequential(
+             nn.Conv2d(in_channels, 1, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         # Channel Attention
+         self.channel_attention = nn.Sequential(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(in_channels, in_channels // 8, kernel_size=1),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(in_channels // 8, in_channels, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         # Convolutional Layers
+         if scale_factor == 2:
+             self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+         elif scale_factor == 4:
+             self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
+
+         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
+
+         # Group Normalization
+         self.group_norm = nn.GroupNorm(20, out_channels)
+
+         # ReLU Activation
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         # Apply spatial attention
+         spatial_attention = self.spatial_attention(x)
+         x = x * spatial_attention
+
+         # Apply channel attention
+         channel_attention = self.channel_attention(x)
+         x = x * channel_attention
+
+         # Apply convolutional layers
+         x = self.conv1(x)
+         x = self.group_norm(x)
+         x = self.relu(x)
+         x = self.conv2(x)
+         x = self.group_norm(x)
+         x = self.relu(x)
+
+         return x
+
+
+ class AttentionUpsamplingModule(nn.Module):
+     def __init__(self, in_channels, out_channels):
+         super(AttentionUpsamplingModule, self).__init__()
+
+         # Spatial Attention for outs[2]
+         self.spatial_attention = nn.Sequential(
+             nn.Conv2d(in_channels, 1, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         # Channel Attention for outs[2]
+         self.channel_attention = nn.Sequential(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(in_channels, in_channels // 8, kernel_size=1),
+             nn.ReLU(),
+             nn.Conv2d(in_channels // 8, in_channels, kernel_size=1),
+             nn.Sigmoid()
+         )
+
+         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+         # Group Normalization
+         self.group_norm = nn.GroupNorm(20, out_channels)
+
+         # ReLU Activation
+         self.relu = nn.ReLU()
+         self.upscale = PixelShuffle(in_channels, 2)
+
+     def forward(self, x):
+         # Apply spatial attention
+         spatial_attention = self.spatial_attention(x)
+         x = x * spatial_attention
+
+         # Apply channel attention
+         channel_attention = self.channel_attention(x)
+         x = x * channel_attention
+
+         # Apply convolutional layers
+         x = self.conv1(x)
+         x = self.group_norm(x)
+         x = self.relu(x)
+         x = self.conv2(x)
+         x = self.group_norm(x)
+         x = self.relu(x)
+
+         # Upsample
+         x = self.upscale(x)
+
+         return x
+
+
+ class ConvLayer(nn.Module):
+     def __init__(self, in_channels, out_channels):
+         super(ConvLayer, self).__init__()
+
+         self.conv1 = nn.Sequential(
+             nn.Conv2d(in_channels, out_channels, 1),
+             nn.GroupNorm(20, out_channels),
+             nn.ReLU(),
+         )
+
+     def forward(self, x):
+         x = self.conv1(x)
+
+         return x
+
+
+ class InverseMultiAttentiveFeatureRefinement(nn.Module):
+     def __init__(self, in_channels_list):
+         super(InverseMultiAttentiveFeatureRefinement, self).__init__()
+
+         self.layer1 = AttentionModule(in_channels_list[0], in_channels_list[0])
+         self.layer2 = AttentionDownsamplingModule(in_channels_list[0], in_channels_list[0] // 2, scale_factor=2)
+         self.layer3 = ConvLayer(in_channels_list[0] // 2 + in_channels_list[1], in_channels_list[1])
+         self.layer4 = AttentionDownsamplingModule(in_channels_list[1], in_channels_list[1] // 2, scale_factor=2)
+         self.layer5 = ConvLayer(in_channels_list[1] // 2 + in_channels_list[2], in_channels_list[2])
+         self.layer6 = AttentionDownsamplingModule(in_channels_list[2], in_channels_list[2] // 2, scale_factor=2)
+         self.layer7 = ConvLayer(in_channels_list[2] // 2 + in_channels_list[3], in_channels_list[3])
+
+         '''
+         self.layer8 = AttentionUpsamplingModule(in_channels_list[3], in_channels_list[3])
+         self.layer9 = ConvLayer(in_channels_list[2] + in_channels_list[3], in_channels_list[2])
+         self.layer10 = AttentionUpsamplingModule(in_channels_list[2], in_channels_list[2])
+         self.layer11 = ConvLayer(in_channels_list[1] + in_channels_list[2], in_channels_list[1])
+         self.layer12 = AttentionUpsamplingModule(in_channels_list[1], in_channels_list[1])
+         self.layer13 = ConvLayer(in_channels_list[0] + in_channels_list[1], in_channels_list[0])
+         '''
+
+     def forward(self, inputs):
+         x_c4, x_c3, x_c2, x_c1 = inputs
+         x_c4 = self.layer1(x_c4)
+         x_c4_3 = self.layer2(x_c4)
+         x_c3 = torch.cat([x_c4_3, x_c3], dim=1)
+         x_c3 = self.layer3(x_c3)
+         x_c3_2 = self.layer4(x_c3)
+         x_c2 = torch.cat([x_c3_2, x_c2], dim=1)
+         x_c2 = self.layer5(x_c2)
+         x_c2_1 = self.layer6(x_c2)
+         x_c1 = torch.cat([x_c2_1, x_c1], dim=1)
+         x_c1 = self.layer7(x_c1)
+         '''
+         x_c1_2 = self.layer8(x_c1)
+         x_c2 = torch.cat([x_c1_2, x_c2], dim=1)
+         x_c2 = self.layer9(x_c2)
+         x_c2_3 = self.layer10(x_c2)
+         x_c3 = torch.cat([x_c2_3, x_c3], dim=1)
+         x_c3 = self.layer11(x_c3)
+         x_c3_4 = self.layer12(x_c3)
+         x_c4 = torch.cat([x_c3_4, x_c4], dim=1)
+         x_c4 = self.layer13(x_c4)
+         '''
+         return [x_c4, x_c3, x_c2, x_c1]
+
+
+ class EVPRefer(PreTrainedModel):
+     """Encoder Decoder segmentors.
+
+     EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+     Note that auxiliary_head is only used for deep supervision during training,
+     which could be dumped during inference.
+     """
+     config_class = EVPConfig
+
+     def __init__(self, config,
+                  sd_path=None,
+                  base_size=512,
+                  token_embed_dim=768,
+                  neck_dim=[320, 680, 1320, 1280],
+                  **args):
+         super().__init__(config)
+         config = OmegaConf.load('./v1-inference.yaml')
+         if os.path.exists(f'{sd_path}'):
+             config.model.params.ckpt_path = f'{sd_path}'
+         else:
+             config.model.params.ckpt_path = None
+
+         sd_model = instantiate_from_config(config.model)
+         self.encoder_vq = sd_model.first_stage_model
+         self.unet = UNetWrapper(sd_model.model, base_size=base_size)
+         del sd_model.cond_stage_model
+         del self.encoder_vq.decoder
+         for param in self.encoder_vq.parameters():
+             param.requires_grad = True
+
+         self.text_adapter = TextAdapterRefer(text_dim=token_embed_dim)
+
+         self.classifier = SimpleDecoding(dims=neck_dim)
+
+         self.gamma = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
+         self.aggregation = InverseMultiAttentiveFeatureRefinement([320, 680, 1320, 1280])
+         self.clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+
+         for param in self.clip_model.parameters():
+             param.requires_grad = True
+
+     def forward(self, img, sentences):
+         image_t = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(img)
+         shape = image_t.shape
+         img = torch.nn.functional.interpolate(image_t, (512, 512), mode='bilinear', align_corners=True)
+
+         input_ids = self.tokenizer(text=sentences, truncation=True, max_length=40, return_length=True,
+                                    return_overflowing_tokens=False, padding="max_length",
+                                    return_tensors="pt")['input_ids'].to(image_t.device)
+
+         input_shape = img.shape[-2:]
+
+         latents = self.encoder_vq.encode(img).mode()
+         latents = latents / 4.7164
+
+         l_feats = self.clip_model(input_ids=input_ids).last_hidden_state
+         c_crossattn = self.text_adapter(latents, l_feats, self.gamma)  # NOTE: here the c_crossattn should be expand_dim as latents
+         t = torch.ones((img.shape[0],), device=img.device).long()
+         outs = self.unet(latents, t, c_crossattn=[c_crossattn])
+
+         outs = self.aggregation(outs)
+
+         x_c1, x_c2, x_c3, x_c4 = outs
+         x = self.classifier(x_c4, x_c3, x_c2, x_c1)
+         x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=True)
+         pred = torch.nn.functional.interpolate(x, shape[2:], mode='bilinear', align_corners=True)
+         output_mask = pred.detach().cpu().argmax(1).data.numpy().squeeze()
+         return output_mask
+
+     def get_latent(self, x):
+         return self.encoder_vq.encode(x).mode()
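
For reference, a minimal inference sketch for the EVPRefer module above (not part of this commit). It assumes the ldm and lib.mask_predictor packages plus ./v1-inference.yaml are available locally, that no Stable Diffusion checkpoint is passed (sd_path=None), and that model.py/models.py are importable (model.py uses a relative import of models.py, so the packaging may need adjusting):

import torch
from evpconfig import EVPConfig
from model import EVPRefer  # assumes the files are importable; adjust to your package layout

model = EVPRefer(EVPConfig()).eval()      # builds the VQ encoder, wrapped UNet and CLIP text encoder
img = torch.rand(1, 3, 480, 640)          # RGB image in [0, 1]; resized to 512x512 internally
sentences = ["the person holding the umbrella"]

with torch.no_grad():
    mask = model(img, sentences)          # numpy array of per-pixel class indices at the input resolution
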
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07d8a7895e79a4defbf5445cada1a9973bcda51f7c73e4a91c878074a5f758f5
+ size 4317946624
models.py ADDED
@@ -0,0 +1,349 @@
+ from omegaconf import OmegaConf
+
+ import torch as th
+ import torch
+ import math
+ import abc
+
+ from torch import nn, einsum
+
+ from einops import rearrange, repeat
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+ from transformers import CLIPTokenizer
+ from transformers.models.clip.modeling_clip import CLIPTextConfig, CLIPTextModel, CLIPTextTransformer  # , _expand_mask
+ from inspect import isfunction
+
+
+ def exists(val):
+     return val is not None
+
+
+ def uniq(arr):
+     return {el: True for el in arr}.keys()
+
+
+ def default(val, d):
+     if exists(val):
+         return val
+     return d() if isfunction(d) else d
+
+
+ def register_attention_control(model, controller):
+     def ca_forward(self, place_in_unet):
+         def forward(x, context=None, mask=None):
+             h = self.heads
+
+             q = self.to_q(x)
+             is_cross = context is not None
+             context = default(context, x)
+             k = self.to_k(context)
+             v = self.to_v(context)
+
+             q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+
+             sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
+
+             if exists(mask):
+                 mask = rearrange(mask, 'b ... -> b (...)')
+                 max_neg_value = -torch.finfo(sim.dtype).max
+                 mask = repeat(mask, 'b j -> (b h) () j', h=h)
+                 sim.masked_fill_(~mask, max_neg_value)
+
+             # attention, what we cannot get enough of
+             attn = sim.softmax(dim=-1)
+
+             attn2 = rearrange(attn, '(b h) k c -> h b k c', h=h).mean(0)
+             controller(attn2, is_cross, place_in_unet)
+
+             out = einsum('b i j, b j d -> b i d', attn, v)
+             out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+             return self.to_out(out)
+
+         return forward
+
+     class DummyController:
+         def __call__(self, *args):
+             return args[0]
+
+         def __init__(self):
+             self.num_att_layers = 0
+
+     if controller is None:
+         controller = DummyController()
+
+     def register_recr(net_, count, place_in_unet):
+         if net_.__class__.__name__ == 'CrossAttention':
+             net_.forward = ca_forward(net_, place_in_unet)
+             return count + 1
+         elif hasattr(net_, 'children'):
+             for net__ in net_.children():
+                 count = register_recr(net__, count, place_in_unet)
+         return count
+
+     cross_att_count = 0
+     sub_nets = model.diffusion_model.named_children()
+
+     for net in sub_nets:
+         if "input_blocks" in net[0]:
+             cross_att_count += register_recr(net[1], 0, "down")
+         elif "output_blocks" in net[0]:
+             cross_att_count += register_recr(net[1], 0, "up")
+         elif "middle_block" in net[0]:
+             cross_att_count += register_recr(net[1], 0, "mid")
+
+     controller.num_att_layers = cross_att_count
+
+
+ class AttentionControl(abc.ABC):
+
+     def step_callback(self, x_t):
+         return x_t
+
+     def between_steps(self):
+         return
+
+     @property
+     def num_uncond_att_layers(self):
+         return 0
+
+     @abc.abstractmethod
+     def forward(self, attn, is_cross: bool, place_in_unet: str):
+         raise NotImplementedError
+
+     def __call__(self, attn, is_cross: bool, place_in_unet: str):
+         attn = self.forward(attn, is_cross, place_in_unet)
+         return attn
+
+     def reset(self):
+         self.cur_step = 0
+         self.cur_att_layer = 0
+
+     def __init__(self):
+         self.cur_step = 0
+         self.num_att_layers = -1
+         self.cur_att_layer = 0
+
+
+ class AttentionStore(AttentionControl):
+     @staticmethod
+     def get_empty_store():
+         return {"down_cross": [], "mid_cross": [], "up_cross": [],
+                 "down_self": [], "mid_self": [], "up_self": []}
+
+     def forward(self, attn, is_cross: bool, place_in_unet: str):
+         key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
+         if attn.shape[1] <= (self.max_size) ** 2:  # avoid memory overhead
+             self.step_store[key].append(attn)
+         return attn
+
+     def between_steps(self):
+         if len(self.attention_store) == 0:
+             self.attention_store = self.step_store
+         else:
+             for key in self.attention_store:
+                 for i in range(len(self.attention_store[key])):
+                     self.attention_store[key][i] += self.step_store[key][i]
+         self.step_store = self.get_empty_store()
+
+     def get_average_attention(self):
+         average_attention = {key: [item for item in self.step_store[key]] for key in self.step_store}
+         return average_attention
+
+     def reset(self):
+         super(AttentionStore, self).reset()
+         self.step_store = self.get_empty_store()
+         self.attention_store = {}
+
+     def __init__(self, base_size=64, max_size=None):
+         super(AttentionStore, self).__init__()
+         self.step_store = self.get_empty_store()
+         self.attention_store = {}
+         self.base_size = base_size
+         if max_size is None:
+             self.max_size = self.base_size // 2
+         else:
+             self.max_size = max_size
+
+
+ def register_hier_output(model):
+     self = model.diffusion_model
+     from ldm.modules.diffusionmodules.util import checkpoint, timestep_embedding
+
+     def forward(x, timesteps=None, context=None, y=None, **kwargs):
+         """
+         Apply the model to an input batch.
+         :param x: an [N x C x ...] Tensor of inputs.
+         :param timesteps: a 1-D batch of timesteps.
+         :param context: conditioning plugged in via crossattn
+         :param y: an [N] Tensor of labels, if class-conditional.
+         :return: an [N x C x ...] Tensor of outputs.
+         """
+         assert (y is not None) == (
+             self.num_classes is not None
+         ), "must specify y if and only if the model is class-conditional"
+         hs = []
+         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+         emb = self.time_embed(t_emb)
+
+         if self.num_classes is not None:
+             assert y.shape == (x.shape[0],)
+             emb = emb + self.label_emb(y)
+
+         h = x.type(self.dtype)
+         for module in self.input_blocks:
+             # import pdb; pdb.set_trace()
+             if context.shape[1] == 2:
+                 h = module(h, emb, context[:, 0, :].unsqueeze(1))
+             else:
+                 h = module(h, emb, context)
+             hs.append(h)
+         if context.shape[1] == 2:
+             h = self.middle_block(h, emb, context[:, 0, :].unsqueeze(1))
+         else:
+             h = self.middle_block(h, emb, context)
+         out_list = []
+
+         for i_out, module in enumerate(self.output_blocks):
+             h = th.cat([h, hs.pop()], dim=1)
+             if context.shape[1] == 2:
+                 h = module(h, emb, context[:, 1, :].unsqueeze(1))
+             else:
+                 h = module(h, emb, context)
+             if i_out in [1, 4, 7]:
+                 out_list.append(h)
+         h = h.type(x.dtype)
+
+         out_list.append(h)
+         return out_list
+
+     self.forward = forward
+
+
+ class UNetWrapper(nn.Module):
+     def __init__(self, unet, use_attn=True, base_size=512, max_attn_size=None, attn_selector='up_cross+down_cross') -> None:
+         super().__init__()
+         self.unet = unet
+         self.attention_store = AttentionStore(base_size=base_size // 8, max_size=max_attn_size)
+         self.size16 = base_size // 32
+         self.size32 = base_size // 16
+         self.size64 = base_size // 8
+         self.use_attn = use_attn
+         if self.use_attn:
+             register_attention_control(unet, self.attention_store)
+         register_hier_output(unet)
+         self.attn_selector = attn_selector.split('+')
+
+     def forward(self, *args, **kwargs):
+         if self.use_attn:
+             self.attention_store.reset()
+         out_list = self.unet(*args, **kwargs)
+         if self.use_attn:
+             avg_attn = self.attention_store.get_average_attention()
+             attn16, attn32, attn64 = self.process_attn(avg_attn)
+             out_list[1] = torch.cat([out_list[1], attn16], dim=1)
+             out_list[2] = torch.cat([out_list[2], attn32], dim=1)
+             if attn64 is not None:
+                 out_list[3] = torch.cat([out_list[3], attn64], dim=1)
+         return out_list[::-1]
+
+     def process_attn(self, avg_attn):
+         attns = {self.size16: [], self.size32: [], self.size64: []}
+         for k in self.attn_selector:
+             for up_attn in avg_attn[k]:
+                 size = int(math.sqrt(up_attn.shape[1]))
+                 attns[size].append(rearrange(up_attn, 'b (h w) c -> b c h w', h=size))
+         attn16 = torch.stack(attns[self.size16]).mean(0)
+         attn32 = torch.stack(attns[self.size32]).mean(0)
+         if len(attns[self.size64]) > 0:
+             attn64 = torch.stack(attns[self.size64]).mean(0)
+         else:
+             attn64 = None
+         return attn16, attn32, attn64
+
+
+ class TextAdapter(nn.Module):
+     def __init__(self, text_dim=768, hidden_dim=None):
+         super().__init__()
+         if hidden_dim is None:
+             hidden_dim = text_dim
+         self.fc = nn.Sequential(
+             nn.Linear(text_dim, hidden_dim),
+             nn.GELU(),
+             nn.Linear(hidden_dim, text_dim)
+         )
+
+     def forward(self, latents, texts, gamma):
+         n_class, channel = texts.shape
+         bs = latents.shape[0]
+
+         texts_after = self.fc(texts)
+         texts = texts + gamma * texts_after
+         texts = repeat(texts, 'n c -> b n c', b=bs)
+         return texts
+
+
+ class TextAdapterRefer(nn.Module):
+     def __init__(self, text_dim=768):
+         super().__init__()
+
+         self.fc = nn.Sequential(
+             nn.Linear(text_dim, text_dim),
+             nn.GELU(),
+             nn.Linear(text_dim, text_dim)
+         )
+
+     def forward(self, latents, texts, gamma):
+         texts_after = self.fc(texts)
+         texts = texts + gamma * texts_after
+         return texts
+
+
+ class TextAdapterDepth(nn.Module):
+     def __init__(self, text_dim=768):
+         super().__init__()
+
+         self.fc = nn.Sequential(
+             nn.Linear(text_dim, text_dim),
+             nn.GELU(),
+             nn.Linear(text_dim, text_dim)
+         )
+
+     def forward(self, latents, texts, gamma):
+         # use the gamma to blend
+         n_sen, channel = texts.shape
+         bs = latents.shape[0]
+
+         texts_after = self.fc(texts)
+         texts = texts + gamma * texts_after
+         texts = repeat(texts, 'n c -> n b c', b=1)
+         return texts
+
+
+ class FrozenCLIPEmbedder(nn.Module):
+     """Uses the CLIP transformer encoder for text (from Hugging Face)"""
+     def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77, pool=True):
+         super().__init__()
+         self.tokenizer = CLIPTokenizer.from_pretrained(version)
+         self.transformer = CLIPTextModel.from_pretrained(version)
+         self.device = device
+         self.max_length = max_length
+         self.freeze()
+
+         self.pool = pool
+
+     def freeze(self):
+         self.transformer = self.transformer.eval()
+         for param in self.parameters():
+             param.requires_grad = False
+
+     def forward(self, text):
+         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
+                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
+         tokens = batch_encoding["input_ids"].to(self.device)
+         outputs = self.transformer(input_ids=tokens)
+
+         if self.pool:
+             z = outputs.pooler_output
+         else:
+             z = outputs.last_hidden_state
+         return z
+
+     def encode(self, text):
+         return self(text)