HarryLee committed on
Commit e87479b
1 Parent(s): 45da9c7

Add tools support

Files changed (43)
  1. models/__init__.py +1 -0
  2. models/ofa/__init__.py +1 -0
  3. models/ofa/ofa.py +434 -0
  4. models/ofa/resnet.py +225 -0
  5. models/ofa/unify_multihead_attention.py +518 -0
  6. models/ofa/unify_transformer.py +1512 -0
  7. models/ofa/unify_transformer_layer.py +542 -0
  8. models/search.py +814 -0
  9. models/sequence_generator.py +1053 -0
  10. ofa_module/__init__.py +5 -0
  11. run_scripts/caption/coco_eval.py +42 -0
  12. run_scripts/caption/evaluate_caption.sh +35 -0
  13. run_scripts/caption/evaluate_caption_base.sh +33 -0
  14. run_scripts/caption/train_caption_stage1.sh +108 -0
  15. run_scripts/caption/train_caption_stage1_base.sh +108 -0
  16. run_scripts/caption/train_caption_stage1_el.sh +109 -0
  17. run_scripts/caption/train_caption_stage1_el_db.sh +111 -0
  18. run_scripts/caption/train_caption_stage2.sh +105 -0
  19. run_scripts/caption/train_caption_stage2_base.sh +105 -0
  20. tasks/__init__.py +6 -0
  21. tasks/mm_tasks/__init__.py +5 -0
  22. tasks/mm_tasks/caption.py +249 -0
  23. tasks/mm_tasks/image_gen.py +329 -0
  24. tasks/mm_tasks/refcoco.py +160 -0
  25. tasks/mm_tasks/snli_ve.py +197 -0
  26. tasks/mm_tasks/vqa_gen.py +278 -0
  27. tasks/ofa_task.py +337 -0
  28. utils/BPE/__init__.py +0 -0
  29. utils/BPE/dict.txt +0 -0
  30. utils/BPE/encoder.json +0 -0
  31. utils/BPE/vocab.bpe +0 -0
  32. utils/__init__.py +0 -0
  33. utils/checkpoint_utils.py +875 -0
  34. utils/cider/pyciderevalcap/__init__.py +1 -0
  35. utils/cider/pyciderevalcap/cider/__init__.py +1 -0
  36. utils/cider/pyciderevalcap/cider/cider.py +65 -0
  37. utils/cider/pyciderevalcap/cider/cider_scorer.py +207 -0
  38. utils/cider/pyciderevalcap/ciderD/__init__.py +1 -0
  39. utils/cider/pyciderevalcap/ciderD/ciderD.py +58 -0
  40. utils/cider/pyciderevalcap/ciderD/ciderD_scorer.py +222 -0
  41. utils/eval_utils.py +349 -0
  42. utils/transforms.py +513 -0
  43. utils/trie.py +30 -0
models/__init__.py ADDED
@@ -0,0 +1 @@
 
1
+ from .ofa import OFAModel, ofa_base_architecture, ofa_large_architecture, ofa_huge_architecture
models/ofa/__init__.py ADDED
@@ -0,0 +1 @@
 
1
+ from .ofa import OFAModel, ofa_base_architecture, ofa_large_architecture, ofa_huge_architecture
models/ofa/ofa.py ADDED
@@ -0,0 +1,434 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ """
7
+ OFA
8
+ """
9
+ from typing import Optional
10
+
11
+ import logging
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from fairseq import utils
17
+ from fairseq.models import register_model, register_model_architecture
18
+ from fairseq.modules.transformer_sentence_encoder import init_bert_params
19
+
20
+ from .unify_transformer import TransformerModel
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ @register_model("ofa")
26
+ class OFAModel(TransformerModel):
27
+ __jit_unused_properties__ = ["supported_targets"]
28
+
29
+ def __init__(self, args, encoder, decoder):
30
+ super().__init__(args, encoder, decoder)
31
+
32
+ # We follow BERT's random weight initialization
33
+ self.apply(init_bert_params)
34
+
35
+ self.classification_heads = nn.ModuleDict()
36
+ if hasattr(self.encoder, "dictionary"):
37
+ self.eos: int = self.encoder.dictionary.eos()
38
+
39
+ @staticmethod
40
+ def add_args(parser):
41
+ super(OFAModel, OFAModel).add_args(parser)
42
+ parser.add_argument(
43
+ "--pooler-dropout",
44
+ type=float,
45
+ metavar="D",
46
+ help="dropout probability in the masked_lm pooler layers",
47
+ )
48
+ parser.add_argument(
49
+ "--pooler-classifier",
50
+ type=str,
51
+ choices=['mlp', 'linear'],
52
+ help="type of pooler classifier",
53
+ )
54
+ parser.add_argument(
55
+ "--pooler-activation-fn",
56
+ choices=utils.get_available_activation_fns(),
57
+ help="activation function to use for pooler layer",
58
+ )
59
+ parser.add_argument(
60
+ "--spectral-norm-classification-head",
61
+ action="store_true",
62
+ help="Apply spectral normalization on the classification head",
63
+ )
64
+
65
+ @property
66
+ def supported_targets(self):
67
+ return {"self"}
68
+
69
+ def forward(
70
+ self,
71
+ src_tokens,
72
+ src_lengths,
73
+ prev_output_tokens,
74
+ patch_images: Optional[torch.Tensor] = None,
75
+ patch_images_2: Optional[torch.Tensor] = None,
76
+ patch_masks: Optional[torch.Tensor] = None,
77
+ code_masks: Optional[torch.Tensor] = None,
78
+ sample_patch_num: Optional[int] = None,
79
+ features_only: bool = False,
80
+ classification_head_name: Optional[str] = None,
81
+ token_embeddings: Optional[torch.Tensor] = None,
82
+ return_all_hiddens: bool = False,
83
+ alignment_layer: Optional[int] = None,
84
+ alignment_heads: Optional[int] = None,
85
+ ):
86
+ if classification_head_name is not None:
87
+ features_only = True
88
+
89
+ encoder_out = self.encoder(
90
+ src_tokens,
91
+ src_lengths=src_lengths,
92
+ patch_images=patch_images,
93
+ patch_masks=patch_masks,
94
+ patch_images_2=patch_images_2,
95
+ token_embeddings=token_embeddings,
96
+ return_all_hiddens=return_all_hiddens,
97
+ sample_patch_num=sample_patch_num
98
+ )
99
+ x, extra = self.decoder(
100
+ prev_output_tokens,
101
+ code_masks=code_masks,
102
+ encoder_out=encoder_out,
103
+ features_only=features_only,
104
+ alignment_layer=alignment_layer,
105
+ alignment_heads=alignment_heads,
106
+ src_lengths=src_lengths,
107
+ return_all_hiddens=return_all_hiddens,
108
+ )
109
+
110
+ pad = self.encoder.padding_idx
111
+ if classification_head_name is not None:
112
+ prev_lengths = prev_output_tokens.ne(pad).sum(1)
113
+ gather_index = prev_lengths[:, None, None].expand(x.size(0), 1, x.size(2)) - 1
114
+ sentence_representation = x.gather(1, gather_index).squeeze()
115
+ if self.classification_heads[classification_head_name].use_two_images:
116
+ hidden_size = sentence_representation.size(1)
117
+ sentence_representation = sentence_representation.view(-1, hidden_size * 2)
118
+ for k, head in self.classification_heads.items():
119
+ # TorchScript only supports iterating over a ModuleDict, so select the head inside the loop
120
+ if k == classification_head_name:
121
+ x = head(sentence_representation)
122
+ break
123
+
124
+ return x, extra
125
+
126
+ def register_embedding_tokens(self, ans2label_dict, src_dict, bpe):
127
+ """Register embedding tokens"""
128
+ logger.info("Registering embedding tokens")
129
+ self.ans_tensor_list = []
130
+ for i in range(len(ans2label_dict)):
131
+ ans = src_dict[-len(ans2label_dict)+i]
132
+ ans = ans[5:-1].replace('_', ' ')
133
+ ans_tensor = src_dict.encode_line(
134
+ line=bpe.encode(' {}'.format(ans.lower())),
135
+ add_if_not_exist=False,
136
+ append_eos=False
137
+ ).long()
138
+ self.ans_tensor_list.append(ans_tensor)
139
+
140
+ def register_classification_head(
141
+ self, name, num_classes=None, inner_dim=None, use_two_images=False, **kwargs
142
+ ):
143
+ """Register a classification head."""
144
+ logger.info("Registering classification head: {0}".format(name))
145
+ if name in self.classification_heads:
146
+ prev_num_classes = self.classification_heads[name].out_proj.out_features
147
+ prev_inner_dim = self.classification_heads[name].dense.out_features
148
+ if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
149
+ logger.warning(
150
+ 're-registering head "{}" with num_classes {} (prev: {}) '
151
+ "and inner_dim {} (prev: {})".format(
152
+ name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
153
+ )
154
+ )
155
+ self.classification_heads[name] = OFAClassificationHead(
156
+ input_dim=self.args.encoder_embed_dim,
157
+ inner_dim=inner_dim or self.args.encoder_embed_dim,
158
+ num_classes=num_classes,
159
+ activation_fn=self.args.pooler_activation_fn,
160
+ pooler_dropout=self.args.pooler_dropout,
161
+ pooler_classifier=self.args.pooler_classifier,
162
+ use_two_images=use_two_images,
163
+ do_spectral_norm=getattr(
164
+ self.args, "spectral_norm_classification_head", False
165
+ ),
166
+ )
167
+
168
+ def upgrade_state_dict_named(self, state_dict, name):
169
+ super().upgrade_state_dict_named(state_dict, name)
170
+
171
+ prefix = name + "." if name != "" else ""
172
+ current_head_names = (
173
+ []
174
+ if not hasattr(self, "classification_heads")
175
+ else self.classification_heads.keys()
176
+ )
177
+
178
+ # Handle new classification heads present in the state dict.
179
+ keys_to_delete = []
180
+ for k in state_dict.keys():
181
+ if not k.startswith(prefix + "classification_heads."):
182
+ continue
183
+
184
+ head_name = k[len(prefix + "classification_heads.") :].split(".")[0]
185
+ num_classes = state_dict[
186
+ prefix + "classification_heads." + head_name + ".out_proj.weight"
187
+ ].size(0)
188
+ inner_dim = state_dict[
189
+ prefix + "classification_heads." + head_name + ".dense.weight"
190
+ ].size(0)
191
+
192
+ if getattr(self.args, "load_checkpoint_heads", False):
193
+ if head_name not in current_head_names:
194
+ self.register_classification_head(head_name, num_classes, inner_dim)
195
+ else:
196
+ if head_name not in current_head_names:
197
+ logger.warning(
198
+ "deleting classification head ({}) from checkpoint "
199
+ "not present in current model: {}".format(head_name, k)
200
+ )
201
+ keys_to_delete.append(k)
202
+ elif (
203
+ num_classes
204
+ != self.classification_heads[head_name].out_proj.out_features
205
+ or inner_dim
206
+ != self.classification_heads[head_name].dense.out_features
207
+ ):
208
+ logger.warning(
209
+ "deleting classification head ({}) from checkpoint "
210
+ "with different dimensions than current model: {}".format(
211
+ head_name, k
212
+ )
213
+ )
214
+ keys_to_delete.append(k)
215
+ for k in keys_to_delete:
216
+ del state_dict[k]
217
+
218
+ def truncate_emb(key):
219
+ if key in state_dict:
220
+ state_dict[key] = state_dict[key][:-1, :]
221
+
222
+ # When finetuning on translation task, remove last row of
223
+ # embedding matrix that corresponds to mask_idx token.
224
+ loaded_dict_size = state_dict["encoder.embed_tokens.weight"].size(0)
225
+ if (
226
+ loaded_dict_size == len(self.encoder.dictionary) + 1
227
+ and "<mask>" not in self.encoder.dictionary
228
+ ):
229
+ truncate_emb("encoder.embed_tokens.weight")
230
+ truncate_emb("decoder.embed_tokens.weight")
231
+ truncate_emb("encoder.output_projection.weight")
232
+ truncate_emb("decoder.output_projection.weight")
233
+
234
+ if loaded_dict_size < len(self.encoder.dictionary):
235
+ num_langids_to_add = len(self.encoder.dictionary) - loaded_dict_size
236
+ embed_dim = state_dict["encoder.embed_tokens.weight"].size(1)
237
+
238
+ new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim)
239
+ if getattr(self, "ans_tensor_list", None):
240
+ assert len(new_lang_embed_to_add) == len(self.ans_tensor_list)
241
+ for i, ans_tensor in enumerate(self.ans_tensor_list):
242
+ ans_embed = F.embedding(ans_tensor, state_dict["encoder.embed_tokens.weight"])
243
+ ans_embed = ans_embed.sum(0) / ans_embed.size(0)
244
+ new_lang_embed_to_add[i] = ans_embed
245
+ else:
246
+ nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim ** -0.5)
247
+ new_lang_embed_to_add = new_lang_embed_to_add.to(
248
+ dtype=state_dict["encoder.embed_tokens.weight"].dtype,
249
+ )
250
+
251
+ state_dict["encoder.embed_tokens.weight"] = torch.cat(
252
+ [state_dict["encoder.embed_tokens.weight"], new_lang_embed_to_add]
253
+ )
254
+ state_dict["decoder.embed_tokens.weight"] = torch.cat(
255
+ [state_dict["decoder.embed_tokens.weight"], new_lang_embed_to_add]
256
+ )
257
+ state_dict["decoder.output_projection.weight"] = torch.cat(
258
+ [state_dict["decoder.output_projection.weight"], new_lang_embed_to_add]
259
+ )
260
+
261
+ # Copy any newly-added classification heads into the state dict
262
+ # with their current weights.
263
+ if hasattr(self, "classification_heads"):
264
+ cur_state = self.classification_heads.state_dict()
265
+ for k, v in cur_state.items():
266
+ if prefix + "classification_heads." + k not in state_dict:
267
+ logger.info("Overwriting " + prefix + "classification_heads." + k)
268
+ state_dict[prefix + "classification_heads." + k] = v
269
+
270
+
271
+ class OFAClassificationHead(nn.Module):
272
+ """Head for sentence-level classification tasks."""
273
+
274
+ def __init__(
275
+ self,
276
+ input_dim,
277
+ inner_dim,
278
+ num_classes,
279
+ activation_fn,
280
+ pooler_dropout,
281
+ pooler_classifier,
282
+ use_two_images=False,
283
+ do_spectral_norm=False,
284
+ ):
285
+ super().__init__()
286
+ self.pooler_classifier = pooler_classifier
287
+ self.use_two_images = use_two_images
288
+ input_dim = input_dim * 2 if use_two_images else input_dim
289
+ if pooler_classifier == "mlp":
290
+ self.dense = nn.Linear(input_dim, inner_dim)
291
+ self.activation_fn = utils.get_activation_fn(activation_fn)
292
+ self.dropout = nn.Dropout(p=pooler_dropout)
293
+ self.out_proj = nn.Linear(inner_dim, num_classes)
294
+ elif pooler_classifier == "linear":
295
+ self.dropout = nn.Dropout(p=pooler_dropout)
296
+ self.out_proj = nn.Linear(input_dim, num_classes)
297
+ else:
298
+ raise NotImplementedError
299
+
300
+ if do_spectral_norm:
301
+ self.out_proj = torch.nn.utils.spectral_norm(self.out_proj)
302
+
303
+ def forward(self, features, **kwargs):
304
+ if self.pooler_classifier == 'mlp':
305
+ x = features
306
+ x = self.dropout(x)
307
+ x = self.dense(x)
308
+ x = self.activation_fn(x)
309
+ x = self.dropout(x)
310
+ x = self.out_proj(x)
311
+ elif self.pooler_classifier == 'linear':
312
+ x = features
313
+ x = self.dropout(x)
314
+ x = self.out_proj(x)
315
+ else:
316
+ raise NotImplementedError
317
+ return x
318
+
319
+
320
+ @register_model_architecture("ofa", "ofa_large")
321
+ def ofa_large_architecture(args):
322
+ args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
323
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
324
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 1024)
325
+ args.encoder_layers = getattr(args, "encoder_layers", 12)
326
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
327
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
328
+ args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True)
329
+ args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
330
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
331
+ args.decoder_ffn_embed_dim = getattr(
332
+ args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
333
+ )
334
+ args.decoder_layers = getattr(args, "decoder_layers", 12)
335
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
336
+ args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
337
+ args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True)
338
+ args.attention_dropout = getattr(args, "attention_dropout", 0.0)
339
+ args.relu_dropout = getattr(args, "relu_dropout", 0.0)
340
+ args.dropout = getattr(args, "dropout", 0.0)
341
+ args.max_target_positions = getattr(args, "max_target_positions", 1024)
342
+ args.max_source_positions = getattr(args, "max_source_positions", 1024)
343
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
344
+ args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
345
+ args.share_decoder_input_output_embed = getattr(
346
+ args, "share_decoder_input_output_embed", True
347
+ )
348
+ args.share_all_embeddings = getattr(args, "share_all_embeddings", True)
349
+
350
+ args.decoder_output_dim = getattr(
351
+ args, "decoder_output_dim", args.decoder_embed_dim
352
+ )
353
+ args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
354
+
355
+ args.no_scale_embedding = getattr(args, "no_scale_embedding", True)
356
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", True)
357
+
358
+ args.activation_fn = getattr(args, "activation_fn", "gelu")
359
+ args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh")
360
+ args.pooler_dropout = getattr(args, "pooler_dropout", 0.0)
361
+ args.pooler_classifier = getattr(args, "pooler_classifier", "mlp")
362
+
363
+ args.resnet_drop_path_rate = getattr(args, "resnet_drop_path_rate", 0.0)
364
+ args.encoder_drop_path_rate = getattr(args, "encoder_drop_path_rate", 0.0)
365
+ args.decoder_drop_path_rate = getattr(args, "decoder_drop_path_rate", 0.0)
366
+
367
+ args.resnet_type = getattr(args, "resnet_type", "resnet152")
368
+ args.token_bucket_size = getattr(args, "token_bucket_size", 256)
369
+ args.image_bucket_size = getattr(args, "image_bucket_size", 42)
370
+
371
+ args.freeze_encoder_embedding = getattr(args, "freeze_encoder_embedding", False)
372
+ args.freeze_decoder_embedding = getattr(args, "freeze_decoder_embedding", False)
373
+ args.add_type_embedding = getattr(args, "add_type_embedding", True)
374
+ args.attn_scale_factor = getattr(args, "attn_scale_factor", 2)
375
+
376
+ args.code_image_size = getattr(args, "code_image_size", 128)
377
+ args.patch_layernorm_embedding = getattr(args, "patch_layernorm_embedding", True)
378
+ args.code_layernorm_embedding = getattr(args, "code_layernorm_embedding", True)
379
+ args.entangle_position_embedding = getattr(args, "entangle_position_embedding", False)
380
+ args.disable_entangle = getattr(args, "disable_entangle", False)
381
+ args.sync_bn = getattr(args, "sync_bn", False)
382
+
383
+ args.scale_attn = getattr(args, "scale_attn", False)
384
+ args.scale_fc = getattr(args, "scale_fc", False)
385
+ args.scale_heads = getattr(args, "scale_heads", False)
386
+ args.scale_resids = getattr(args, "scale_resids", False)
387
+
388
+
389
+ @register_model_architecture("ofa", "ofa_base")
390
+ def ofa_base_architecture(args):
391
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
392
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 768)
393
+ args.encoder_layers = getattr(args, "encoder_layers", 6)
394
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
395
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
396
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12)
397
+ args.resnet_type = getattr(args, "resnet_type", "resnet101")
398
+ ofa_large_architecture(args)
399
+
400
+
401
+ @register_model_architecture("ofa", "ofa_huge")
402
+ def ofa_huge_architecture(args):
403
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1280)
404
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 1280)
405
+ args.encoder_layers = getattr(args, "encoder_layers", 24)
406
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
407
+ args.decoder_layers = getattr(args, "decoder_layers", 12)
408
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
409
+ args.resnet_type = getattr(args, "resnet_type", "resnet152")
410
+ ofa_large_architecture(args)
411
+
412
+
413
+ @register_model_architecture("ofa", "ofa_medium")
414
+ def ofa_medium_architecture(args):
415
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
416
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 512)
417
+ args.encoder_layers = getattr(args, "encoder_layers", 4)
418
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
419
+ args.decoder_layers = getattr(args, "decoder_layers", 4)
420
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
421
+ args.resnet_type = getattr(args, "resnet_type", "resnet101")
422
+ ofa_large_architecture(args)
423
+
424
+
425
+ @register_model_architecture("ofa", "ofa_tiny")
426
+ def ofa_tiny_architecture(args):
427
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
428
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 256)
429
+ args.encoder_layers = getattr(args, "encoder_layers", 4)
430
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
431
+ args.decoder_layers = getattr(args, "decoder_layers", 4)
432
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
433
+ args.resnet_type = getattr(args, "resnet_type", "resnet50")
434
+ ofa_large_architecture(args)
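The architecture functions registered above only fill in missing hyper-parameters on an argparse namespace via getattr, so a quick way to inspect a variant's defaults is to call one on an empty Namespace. A minimal sketch, assuming fairseq and torch are installed and the repository root is on the Python path:

from argparse import Namespace
from models.ofa.ofa import ofa_base_architecture

args = Namespace()
ofa_base_architecture(args)          # fills in every hyper-parameter that is not already set
print(args.encoder_embed_dim)        # 768
print(args.encoder_layers)           # 6
print(args.resnet_type)              # 'resnet101'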
models/ofa/resnet.py ADDED
@@ -0,0 +1,225 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
6
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
7
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
8
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
9
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
10
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
11
+ 'survival rate' as the argument.
12
+ """
13
+ if drop_prob == 0. or not training:
14
+ return x
15
+ keep_prob = 1 - drop_prob
16
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
17
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
18
+ random_tensor.floor_() # binarize
19
+ output = x.div(keep_prob) * random_tensor
20
+ return output
21
+
22
+
23
+ class DropPath(nn.Module):
24
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
25
+ """
26
+ def __init__(self, drop_prob=None):
27
+ super(DropPath, self).__init__()
28
+ self.drop_prob = drop_prob
29
+
30
+ def forward(self, x):
31
+ return drop_path(x, self.drop_prob, self.training)
32
+
33
+
34
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
35
+ """3x3 convolution with padding"""
36
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
37
+ padding=dilation, groups=groups, bias=False, dilation=dilation)
38
+
39
+
40
+ def conv1x1(in_planes, out_planes, stride=1):
41
+ """1x1 convolution"""
42
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
43
+
44
+
45
+ class BasicBlock(nn.Module):
46
+ expansion = 1
47
+
48
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
49
+ base_width=64, dilation=1, norm_layer=None):
50
+ super(BasicBlock, self).__init__()
51
+ if norm_layer is None:
52
+ norm_layer = nn.BatchNorm2d
53
+ if groups != 1 or base_width != 64:
54
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
55
+ if dilation > 1:
56
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
57
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
58
+ self.conv1 = conv3x3(inplanes, planes, stride)
59
+ self.bn1 = norm_layer(planes)
60
+ self.relu = nn.ReLU(inplace=True)
61
+ self.conv2 = conv3x3(planes, planes)
62
+ self.bn2 = norm_layer(planes)
63
+ self.downsample = downsample
64
+ self.stride = stride
65
+
66
+ def forward(self, x):
67
+ assert False
68
+ identity = x
69
+
70
+ out = self.conv1(x)
71
+ out = self.bn1(out)
72
+ out = self.relu(out)
73
+
74
+ out = self.conv2(out)
75
+ out = self.bn2(out)
76
+
77
+ if self.downsample is not None:
78
+ identity = self.downsample(x)
79
+
80
+ out += identity
81
+ out = self.relu(out)
82
+
83
+ return out
84
+
85
+
86
+ class Bottleneck(nn.Module):
87
+ # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
88
+ # while original implementation places the stride at the first 1x1 convolution(self.conv1)
89
+ # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
90
+ # This variant is also known as ResNet V1.5 and improves accuracy according to
91
+ # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
92
+
93
+ expansion = 4
94
+
95
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
96
+ base_width=64, dilation=1, norm_layer=None, drop_path_rate=0.0):
97
+ super(Bottleneck, self).__init__()
98
+ if norm_layer is None:
99
+ norm_layer = nn.BatchNorm2d
100
+ width = int(planes * (base_width / 64.)) * groups
101
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
102
+ self.conv1 = conv1x1(inplanes, width)
103
+ self.bn1 = norm_layer(width)
104
+ self.conv2 = conv3x3(width, width, stride, groups, dilation)
105
+ self.bn2 = norm_layer(width)
106
+ self.conv3 = conv1x1(width, planes * self.expansion)
107
+ self.bn3 = norm_layer(planes * self.expansion)
108
+ self.relu = nn.ReLU(inplace=True)
109
+ self.downsample = downsample
110
+ self.stride = stride
111
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
112
+
113
+ def forward(self, x):
114
+ identity = x
115
+
116
+ out = self.conv1(x)
117
+ out = self.bn1(out)
118
+ out = self.relu(out)
119
+
120
+ out = self.conv2(out)
121
+ out = self.bn2(out)
122
+ out = self.relu(out)
123
+
124
+ out = self.conv3(out)
125
+ out = self.bn3(out)
126
+
127
+ if self.downsample is not None:
128
+ identity = self.downsample(x)
129
+
130
+ out = identity + self.drop_path(out)
131
+ out = self.relu(out)
132
+
133
+ return out
134
+
135
+
136
+ class ResNet(nn.Module):
137
+
138
+ def __init__(self, layers, zero_init_residual=False,
139
+ groups=1, width_per_group=64, replace_stride_with_dilation=None,
140
+ norm_layer=None, drop_path_rate=0.0):
141
+ super(ResNet, self).__init__()
142
+ if norm_layer is None:
143
+ norm_layer = nn.BatchNorm2d
144
+ self._norm_layer = norm_layer
145
+
146
+ self.inplanes = 64
147
+ self.dilation = 1
148
+ if replace_stride_with_dilation is None:
149
+ # each element in the tuple indicates if we should replace
150
+ # the 2x2 stride with a dilated convolution instead
151
+ replace_stride_with_dilation = [False, False, False]
152
+ if len(replace_stride_with_dilation) != 3:
153
+ raise ValueError("replace_stride_with_dilation should be None "
154
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
155
+ self.groups = groups
156
+ self.base_width = width_per_group
157
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
158
+ bias=False)
159
+ self.bn1 = norm_layer(self.inplanes)
160
+ self.relu = nn.ReLU(inplace=True)
161
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
162
+ self.layer1 = self._make_layer(Bottleneck, 64, layers[0], drop_path_rate=drop_path_rate)
163
+ self.layer2 = self._make_layer(Bottleneck, 128, layers[1], stride=2,
164
+ dilate=replace_stride_with_dilation[0], drop_path_rate=drop_path_rate)
165
+ self.layer3 = self._make_layer(Bottleneck, 256, layers[2], stride=2,
166
+ dilate=replace_stride_with_dilation[1], drop_path_rate=drop_path_rate)
167
+
168
+ for m in self.modules():
169
+ if isinstance(m, nn.Conv2d):
170
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
171
+ elif isinstance(m, (nn.SyncBatchNorm, nn.BatchNorm2d, nn.GroupNorm)):
172
+ nn.init.constant_(m.weight, 1)
173
+ nn.init.constant_(m.bias, 0)
174
+
175
+ # Zero-initialize the last BN in each residual branch,
176
+ # so that the residual branch starts with zeros, and each residual block behaves like an identity.
177
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
178
+ if zero_init_residual:
179
+ for m in self.modules():
180
+ if isinstance(m, Bottleneck):
181
+ nn.init.constant_(m.bn3.weight, 0)
182
+ elif isinstance(m, BasicBlock):
183
+ nn.init.constant_(m.bn2.weight, 0)
184
+
185
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False, drop_path_rate=0.0):
186
+ norm_layer = self._norm_layer
187
+ downsample = None
188
+ previous_dilation = self.dilation
189
+ if dilate:
190
+ self.dilation *= stride
191
+ stride = 1
192
+ if stride != 1 or self.inplanes != planes * block.expansion:
193
+ downsample = nn.Sequential(
194
+ conv1x1(self.inplanes, planes * block.expansion, stride),
195
+ norm_layer(planes * block.expansion),
196
+ )
197
+
198
+ layers = []
199
+ layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
200
+ self.base_width, previous_dilation, norm_layer))
201
+ self.inplanes = planes * block.expansion
202
+
203
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, blocks)]
204
+ for i in range(1, blocks):
205
+ layers.append(block(self.inplanes, planes, groups=self.groups,
206
+ base_width=self.base_width, dilation=self.dilation,
207
+ norm_layer=norm_layer, drop_path_rate=dpr[i]))
208
+
209
+ return nn.Sequential(*layers)
210
+
211
+ def _forward_impl(self, x):
212
+ # See note [TorchScript super()]
213
+ x = self.conv1(x)
214
+ x = self.bn1(x)
215
+ x = self.relu(x)
216
+ x = self.maxpool(x)
217
+
218
+ x = self.layer1(x)
219
+ x = self.layer2(x)
220
+ x = self.layer3(x)
221
+
222
+ return x
223
+
224
+ def forward(self, x):
225
+ return self._forward_impl(x)
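This ResNet is used as a patch-feature backbone: it only builds conv1 through layer3 and returns a feature map rather than logits, so the output keeps a spatial grid at 1/16 of the input resolution with 1024 channels. A minimal shape check, assuming torch is installed and the repository root is on the Python path:

import torch
from models.ofa.resnet import ResNet

backbone = ResNet([3, 4, 6])          # ResNet-50-style stage sizes; ResNet-101 would use [3, 4, 23]
backbone.eval()                       # avoid batch-norm statistics updates for the dummy batch
x = torch.randn(1, 3, 224, 224)       # dummy image batch
feats = backbone(x)
print(feats.shape)                    # torch.Size([1, 1024, 14, 14]): 224 / 16 = 14, 256 * expansion(4) = 1024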
models/ofa/unify_multihead_attention.py ADDED
@@ -0,0 +1,518 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import math
7
+ from typing import Dict, Optional, Tuple
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from fairseq import utils
12
+ from fairseq.incremental_decoding_utils import with_incremental_state
13
+ from fairseq.modules.fairseq_dropout import FairseqDropout
14
+ from fairseq.modules.quant_noise import quant_noise
15
+ from torch import Tensor, nn
16
+ from torch.nn import Parameter
17
+
18
+
19
+ @with_incremental_state
20
+ class MultiheadAttention(nn.Module):
21
+ """Multi-headed attention.
22
+
23
+ See "Attention Is All You Need" for more details.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ embed_dim,
29
+ num_heads,
30
+ kdim=None,
31
+ vdim=None,
32
+ dropout=0.0,
33
+ bias=True,
34
+ add_bias_kv=False,
35
+ add_zero_attn=False,
36
+ self_attention=False,
37
+ encoder_decoder_attention=False,
38
+ q_noise=0.0,
39
+ qn_block_size=8,
40
+ scale_factor=2,
41
+ scale_heads=False
42
+ ):
43
+ super().__init__()
44
+ self.embed_dim = embed_dim
45
+ self.kdim = kdim if kdim is not None else embed_dim
46
+ self.vdim = vdim if vdim is not None else embed_dim
47
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
48
+
49
+ self.num_heads = num_heads
50
+ self.dropout_module = FairseqDropout(
51
+ dropout, module_name=self.__class__.__name__
52
+ )
53
+
54
+ self.head_dim = embed_dim // num_heads
55
+ assert (
56
+ self.head_dim * num_heads == self.embed_dim
57
+ ), "embed_dim must be divisible by num_heads"
58
+ self.scaling = float(self.head_dim * scale_factor) ** -0.5
59
+
60
+ self.self_attention = self_attention
61
+ self.encoder_decoder_attention = encoder_decoder_attention
62
+ self.c_attn = nn.Parameter(torch.ones((self.num_heads,)), requires_grad=True) if scale_heads else None
63
+
64
+ assert not self.self_attention or self.qkv_same_dim, (
65
+ "Self-attention requires query, key and " "value to be of the same size"
66
+ )
67
+
68
+ self.k_proj = quant_noise(
69
+ nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
70
+ )
71
+ self.v_proj = quant_noise(
72
+ nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
73
+ )
74
+ self.q_proj = quant_noise(
75
+ nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
76
+ )
77
+
78
+ self.out_proj = quant_noise(
79
+ nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
80
+ )
81
+
82
+ if add_bias_kv:
83
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
84
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
85
+ else:
86
+ self.bias_k = self.bias_v = None
87
+
88
+ self.add_zero_attn = add_zero_attn
89
+
90
+ self.reset_parameters()
91
+
92
+ self.onnx_trace = False
93
+
94
+ def prepare_for_onnx_export_(self):
95
+ self.onnx_trace = True
96
+
97
+ def reset_parameters(self):
98
+ if self.qkv_same_dim:
99
+ # Empirically observed the convergence to be much better with
100
+ # the scaled initialization
101
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
102
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
103
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
104
+ else:
105
+ nn.init.xavier_uniform_(self.k_proj.weight)
106
+ nn.init.xavier_uniform_(self.v_proj.weight)
107
+ nn.init.xavier_uniform_(self.q_proj.weight)
108
+
109
+ nn.init.xavier_uniform_(self.out_proj.weight)
110
+ if self.out_proj.bias is not None:
111
+ nn.init.constant_(self.out_proj.bias, 0.0)
112
+ if self.bias_k is not None:
113
+ nn.init.xavier_normal_(self.bias_k)
114
+ if self.bias_v is not None:
115
+ nn.init.xavier_normal_(self.bias_v)
116
+
117
+ def forward(
118
+ self,
119
+ query,
120
+ key: Optional[Tensor],
121
+ value: Optional[Tensor],
122
+ key_padding_mask: Optional[Tensor] = None,
123
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
124
+ need_weights: bool = True,
125
+ static_kv: bool = False,
126
+ attn_mask: Optional[Tensor] = None,
127
+ self_attn_mask: Optional[Tensor] = None,
128
+ before_softmax: bool = False,
129
+ need_head_weights: bool = False,
130
+ attn_bias: Optional[Tensor] = None
131
+ ) -> Tuple[Tensor, Optional[Tensor]]:
132
+ """Input shape: Time x Batch x Channel
133
+
134
+ Args:
135
+ key_padding_mask (ByteTensor, optional): mask to exclude
136
+ keys that are pads, of shape `(batch, src_len)`, where
137
+ padding elements are indicated by 1s.
138
+ need_weights (bool, optional): return the attention weights,
139
+ averaged over heads (default: True).
140
+ attn_mask (ByteTensor, optional): typically used to
141
+ implement causal attention, where the mask prevents the
142
+ attention from looking forward in time (default: None).
143
+ before_softmax (bool, optional): return the raw attention
144
+ weights and values before the attention softmax.
145
+ need_head_weights (bool, optional): return the attention
146
+ weights for each head. Implies *need_weights*. Default:
147
+ return the average attention weights over all heads.
148
+ """
149
+ if need_head_weights:
150
+ need_weights = True
151
+
152
+ is_tpu = query.device.type == "xla"
153
+
154
+ tgt_len, bsz, embed_dim = query.size()
155
+ src_len = tgt_len
156
+ assert embed_dim == self.embed_dim, f"query dim {embed_dim} != {self.embed_dim}"
157
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
158
+ if key is not None:
159
+ src_len, key_bsz, _ = key.size()
160
+ if not torch.jit.is_scripting():
161
+ assert key_bsz == bsz
162
+ assert value is not None
163
+ assert (src_len, bsz) == value.shape[:2]
164
+
165
+ if (
166
+ not self.onnx_trace
167
+ and not is_tpu # don't use PyTorch version on TPUs
168
+ and incremental_state is None
169
+ and not static_kv
170
+ # A workaround for quantization to work. Otherwise JIT compilation
171
+ # treats bias in linear module as method.
172
+ and not torch.jit.is_scripting()
173
+ and self_attn_mask is None
174
+ and attn_bias is None
175
+ ):
176
+ assert key is not None and value is not None
177
+ return F.multi_head_attention_forward(
178
+ query,
179
+ key,
180
+ value,
181
+ self.embed_dim,
182
+ self.num_heads,
183
+ torch.empty([0]),
184
+ torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
185
+ self.bias_k,
186
+ self.bias_v,
187
+ self.add_zero_attn,
188
+ self.dropout_module.p,
189
+ self.out_proj.weight,
190
+ self.out_proj.bias,
191
+ self.training or self.dropout_module.apply_during_inference,
192
+ key_padding_mask,
193
+ need_weights,
194
+ attn_mask,
195
+ use_separate_proj_weight=True,
196
+ q_proj_weight=self.q_proj.weight,
197
+ k_proj_weight=self.k_proj.weight,
198
+ v_proj_weight=self.v_proj.weight,
199
+ )
200
+
201
+ if incremental_state is not None:
202
+ saved_state = self._get_input_buffer(incremental_state)
203
+ if saved_state is not None and "prev_key" in saved_state:
204
+ # previous time steps are cached - no need to recompute
205
+ # key and value if they are static
206
+ if static_kv:
207
+ assert self.encoder_decoder_attention and not self.self_attention
208
+ key = value = None
209
+ else:
210
+ saved_state = None
211
+
212
+ if self.self_attention and self_attn_mask is None:
213
+ q = self.q_proj(query)
214
+ k = self.k_proj(query)
215
+ v = self.v_proj(query)
216
+ elif self.encoder_decoder_attention:
217
+ # encoder-decoder attention
218
+ q = self.q_proj(query)
219
+ if key is None:
220
+ assert value is None
221
+ k = v = None
222
+ else:
223
+ k = self.k_proj(key)
224
+ v = self.v_proj(key)
225
+
226
+ else:
227
+ assert key is not None and value is not None
228
+ q = self.q_proj(query)
229
+ k = self.k_proj(key)
230
+ v = self.v_proj(value)
231
+ q *= self.scaling
232
+
233
+ if self.bias_k is not None:
234
+ assert self.bias_v is not None
235
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
236
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
237
+ if attn_mask is not None:
238
+ attn_mask = torch.cat(
239
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
240
+ )
241
+ if key_padding_mask is not None:
242
+ key_padding_mask = torch.cat(
243
+ [
244
+ key_padding_mask,
245
+ key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
246
+ ],
247
+ dim=1,
248
+ )
249
+
250
+ q = (
251
+ q.contiguous()
252
+ .view(tgt_len, bsz * self.num_heads, self.head_dim)
253
+ .transpose(0, 1)
254
+ )
255
+ if k is not None:
256
+ k = (
257
+ k.contiguous()
258
+ .view(-1, bsz * self.num_heads, self.head_dim)
259
+ .transpose(0, 1)
260
+ )
261
+ if v is not None:
262
+ v = (
263
+ v.contiguous()
264
+ .view(-1, bsz * self.num_heads, self.head_dim)
265
+ .transpose(0, 1)
266
+ )
267
+
268
+ if saved_state is not None:
269
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
270
+ if "prev_key" in saved_state:
271
+ _prev_key = saved_state["prev_key"]
272
+ assert _prev_key is not None
273
+ prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
274
+ if static_kv:
275
+ k = prev_key
276
+ else:
277
+ assert k is not None
278
+ k = torch.cat([prev_key, k], dim=1)
279
+ src_len = k.size(1)
280
+ if "prev_value" in saved_state:
281
+ _prev_value = saved_state["prev_value"]
282
+ assert _prev_value is not None
283
+ prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
284
+ if static_kv:
285
+ v = prev_value
286
+ else:
287
+ assert v is not None
288
+ v = torch.cat([prev_value, v], dim=1)
289
+ prev_key_padding_mask: Optional[Tensor] = None
290
+ if "prev_key_padding_mask" in saved_state:
291
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
292
+ assert k is not None and v is not None
293
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
294
+ key_padding_mask=key_padding_mask,
295
+ prev_key_padding_mask=prev_key_padding_mask,
296
+ batch_size=bsz,
297
+ src_len=k.size(1),
298
+ static_kv=static_kv,
299
+ )
300
+
301
+ saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
302
+ saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
303
+ saved_state["prev_key_padding_mask"] = key_padding_mask
304
+ # In this branch incremental_state is never None
305
+ assert incremental_state is not None
306
+ incremental_state = self._set_input_buffer(incremental_state, saved_state)
307
+ assert k is not None
308
+ assert k.size(1) == src_len
309
+
310
+ # This is part of a workaround to get around fork/join parallelism
311
+ # not supporting Optional types.
312
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
313
+ key_padding_mask = None
314
+
315
+ if key_padding_mask is not None:
316
+ assert key_padding_mask.size(0) == bsz
317
+ assert key_padding_mask.size(1) == src_len
318
+
319
+ if self.add_zero_attn:
320
+ assert v is not None
321
+ src_len += 1
322
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
323
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
324
+ if attn_mask is not None:
325
+ attn_mask = torch.cat(
326
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
327
+ )
328
+ if key_padding_mask is not None:
329
+ key_padding_mask = torch.cat(
330
+ [
331
+ key_padding_mask,
332
+ torch.zeros(key_padding_mask.size(0), 1).type_as(
333
+ key_padding_mask
334
+ ),
335
+ ],
336
+ dim=1,
337
+ )
338
+
339
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
340
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
341
+
342
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
343
+
344
+ if attn_bias is not None:
345
+ attn_weights += attn_bias
346
+
347
+ if attn_mask is not None:
348
+ attn_mask = attn_mask.unsqueeze(0)
349
+ if self.onnx_trace:
350
+ attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
351
+ attn_weights += attn_mask
352
+
353
+ if self_attn_mask is not None:
354
+ self_attn_mask = self_attn_mask.unsqueeze(1).expand(bsz, self.num_heads, tgt_len, src_len)
355
+ attn_weights += self_attn_mask.contiguous().view(bsz * self.num_heads, tgt_len, src_len)
356
+
357
+ if key_padding_mask is not None:
358
+ # don't attend to padding symbols
359
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
360
+ if not is_tpu:
361
+ attn_weights = attn_weights.masked_fill(
362
+ key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
363
+ float("-inf"),
364
+ )
365
+ else:
366
+ attn_weights = attn_weights.transpose(0, 2)
367
+ attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
368
+ attn_weights = attn_weights.transpose(0, 2)
369
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
370
+
371
+ if before_softmax:
372
+ return attn_weights, v
373
+
374
+ attn_weights_float = utils.softmax(
375
+ attn_weights, dim=-1, onnx_trace=self.onnx_trace
376
+ )
377
+ attn_weights = attn_weights_float.type_as(attn_weights)
378
+ attn_probs = self.dropout_module(attn_weights)
379
+
380
+ assert v is not None
381
+ attn = torch.bmm(attn_probs, v)
382
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
383
+ if self.onnx_trace and attn.size(1) == 1:
384
+ # when ONNX tracing a single decoder step (sequence length == 1)
385
+ # the transpose is a no-op copy before view, thus unnecessary
386
+ attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
387
+ else:
388
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
389
+ if self.c_attn is not None:
390
+ attn = attn.view(tgt_len, bsz, self.num_heads, self.head_dim)
391
+ attn = torch.einsum('tbhd,h->tbhd', attn, self.c_attn)
392
+ attn = attn.reshape(tgt_len, bsz, self.embed_dim)
393
+ attn = self.out_proj(attn)
394
+ attn_weights: Optional[Tensor] = None
395
+ if need_weights:
396
+ attn_weights = attn_weights_float.view(
397
+ bsz, self.num_heads, tgt_len, src_len
398
+ ).transpose(1, 0)
399
+ if not need_head_weights:
400
+ # average attention weights over heads
401
+ attn_weights = attn_weights.mean(dim=0)
402
+
403
+ return attn, attn_weights
404
+
405
+ @staticmethod
406
+ def _append_prev_key_padding_mask(
407
+ key_padding_mask: Optional[Tensor],
408
+ prev_key_padding_mask: Optional[Tensor],
409
+ batch_size: int,
410
+ src_len: int,
411
+ static_kv: bool,
412
+ ) -> Optional[Tensor]:
413
+ # saved key padding masks have shape (bsz, seq_len)
414
+ if prev_key_padding_mask is not None and static_kv:
415
+ new_key_padding_mask = prev_key_padding_mask
416
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
417
+ new_key_padding_mask = torch.cat(
418
+ [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
419
+ )
420
+ # During incremental decoding, as the padding token enters and
421
+ # leaves the frame, there will be a time when prev or current
422
+ # is None
423
+ elif prev_key_padding_mask is not None:
424
+ if src_len > prev_key_padding_mask.size(1):
425
+ filler = torch.zeros(
426
+ (batch_size, src_len - prev_key_padding_mask.size(1)),
427
+ device=prev_key_padding_mask.device,
428
+ )
429
+ new_key_padding_mask = torch.cat(
430
+ [prev_key_padding_mask.float(), filler.float()], dim=1
431
+ )
432
+ else:
433
+ new_key_padding_mask = prev_key_padding_mask.float()
434
+ elif key_padding_mask is not None:
435
+ if src_len > key_padding_mask.size(1):
436
+ filler = torch.zeros(
437
+ (batch_size, src_len - key_padding_mask.size(1)),
438
+ device=key_padding_mask.device,
439
+ )
440
+ new_key_padding_mask = torch.cat(
441
+ [filler.float(), key_padding_mask.float()], dim=1
442
+ )
443
+ else:
444
+ new_key_padding_mask = key_padding_mask.float()
445
+ else:
446
+ new_key_padding_mask = prev_key_padding_mask
447
+ return new_key_padding_mask
448
+
449
+ @torch.jit.export
450
+ def reorder_incremental_state(
451
+ self,
452
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
453
+ new_order: Tensor,
454
+ ):
455
+ """Reorder buffered internal state (for incremental generation)."""
456
+ input_buffer = self._get_input_buffer(incremental_state)
457
+ if input_buffer is not None:
458
+ for k in input_buffer.keys():
459
+ input_buffer_k = input_buffer[k]
460
+ if input_buffer_k is not None:
461
+ if self.encoder_decoder_attention and input_buffer_k.size(
462
+ 0
463
+ ) == new_order.size(0):
464
+ break
465
+ input_buffer[k] = input_buffer_k.index_select(0, new_order)
466
+ incremental_state = self._set_input_buffer(incremental_state, input_buffer)
467
+ return incremental_state
468
+
469
+ def _get_input_buffer(
470
+ self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
471
+ ) -> Dict[str, Optional[Tensor]]:
472
+ result = self.get_incremental_state(incremental_state, "attn_state")
473
+ if result is not None:
474
+ return result
475
+ else:
476
+ empty_result: Dict[str, Optional[Tensor]] = {}
477
+ return empty_result
478
+
479
+ def _set_input_buffer(
480
+ self,
481
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
482
+ buffer: Dict[str, Optional[Tensor]],
483
+ ):
484
+ return self.set_incremental_state(incremental_state, "attn_state", buffer)
485
+
486
+ def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
487
+ return attn_weights
488
+
489
+ def upgrade_state_dict_named(self, state_dict, name):
490
+ prefix = name + "." if name != "" else ""
491
+ items_to_add = {}
492
+ keys_to_remove = []
493
+ for k in state_dict.keys():
494
+ if k.endswith(prefix + "in_proj_weight"):
495
+ # in_proj_weight used to be q + k + v with same dimensions
496
+ dim = int(state_dict[k].shape[0] / 3)
497
+ items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
498
+ items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
499
+ items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
500
+
501
+ keys_to_remove.append(k)
502
+
503
+ k_bias = prefix + "in_proj_bias"
504
+ if k_bias in state_dict.keys():
505
+ dim = int(state_dict[k].shape[0] / 3)
506
+ items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
507
+ items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
508
+ dim : 2 * dim
509
+ ]
510
+ items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
511
+
512
+ keys_to_remove.append(prefix + "in_proj_bias")
513
+
514
+ for k in keys_to_remove:
515
+ del state_dict[k]
516
+
517
+ for key, value in items_to_add.items():
518
+ state_dict[key] = value
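This attention module keeps fairseq's (time, batch, channel) layout and returns the output together with optional attention weights averaged over heads. A minimal self-attention sketch, assuming torch and fairseq are installed and the repository root is on the Python path:

import torch
from models.ofa.unify_multihead_attention import MultiheadAttention

attn = MultiheadAttention(embed_dim=16, num_heads=4, self_attention=True)
x = torch.randn(5, 2, 16)              # (tgt_len, batch, embed_dim)
out, weights = attn(query=x, key=x, value=x, need_weights=True)
print(out.shape)                       # torch.Size([5, 2, 16])
print(weights.shape)                   # torch.Size([2, 5, 5]): per-batch weights averaged over heads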
models/ofa/unify_transformer.py ADDED
@@ -0,0 +1,1512 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import math
7
+ import random
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from fairseq import utils
14
+ from fairseq.distributed import fsdp_wrap
15
+ from fairseq.models import (
16
+ FairseqEncoder,
17
+ FairseqEncoderDecoderModel,
18
+ FairseqIncrementalDecoder,
19
+ register_model,
20
+ register_model_architecture,
21
+ )
22
+ from fairseq.modules import (
23
+ AdaptiveSoftmax,
24
+ BaseLayer,
25
+ FairseqDropout,
26
+ LayerDropModuleList,
27
+ LayerNorm,
28
+ SinusoidalPositionalEmbedding,
29
+ GradMultiply
30
+ )
31
+ from fairseq.modules.checkpoint_activations import checkpoint_wrapper
32
+ from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
33
+ from torch import Tensor
34
+
35
+ from .unify_transformer_layer import TransformerEncoderLayer, TransformerDecoderLayer
36
+ from .resnet import ResNet
37
+
38
+
39
+ DEFAULT_MAX_SOURCE_POSITIONS = 1024
40
+ DEFAULT_MAX_TARGET_POSITIONS = 1024
41
+
42
+
43
+ DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)
44
+
45
+
46
+ def BatchNorm2d(out_chan, momentum=0.1, eps=1e-3):
47
+ return nn.SyncBatchNorm.convert_sync_batchnorm(
48
+ nn.BatchNorm2d(out_chan, momentum=momentum, eps=eps)
49
+ )
50
+
51
+
52
+ def make_token_bucket_position(bucket_size, max_position=DEFAULT_MAX_SOURCE_POSITIONS):
53
+ context_pos = torch.arange(max_position, dtype=torch.long)[:, None]
54
+ memory_pos = torch.arange(max_position, dtype=torch.long)[None, :]
55
+ relative_pos = context_pos - memory_pos
56
+ sign = torch.sign(relative_pos)
57
+ mid = bucket_size // 2
58
+ abs_pos = torch.where((relative_pos<mid) & (relative_pos > -mid), mid-1, torch.abs(relative_pos))
59
+ log_pos = torch.ceil(torch.log(abs_pos/mid)/math.log((max_position-1)/mid) * (mid-1)) + mid
60
+ log_pos = log_pos.int()
61
+ bucket_pos = torch.where(abs_pos.le(mid), relative_pos, log_pos*sign).long()
62
+ return bucket_pos + bucket_size - 1
63
+
64
+
65
+ def make_image_bucket_position(bucket_size, num_relative_distance):
66
+ coords_h = torch.arange(bucket_size)
67
+ coords_w = torch.arange(bucket_size)
68
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
69
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
70
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
71
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
72
+ relative_coords[:, :, 0] += bucket_size - 1 # shift to start from 0
73
+ relative_coords[:, :, 1] += bucket_size - 1
74
+ relative_coords[:, :, 0] *= 2 * bucket_size - 1
75
+ relative_position_index = torch.zeros(size=(bucket_size * bucket_size + 1,) * 2, dtype=relative_coords.dtype)
76
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
77
+ relative_position_index[0, 0:] = num_relative_distance - 3
78
+ relative_position_index[0:, 0] = num_relative_distance - 2
79
+ relative_position_index[0, 0] = num_relative_distance - 1
80
+ return relative_position_index
81
+
82
+
83
+ @register_model("unify_transformer")
84
+ class TransformerModel(FairseqEncoderDecoderModel):
85
+ """
86
+ Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
87
+ <https://arxiv.org/abs/1706.03762>`_.
88
+
89
+ Args:
90
+ encoder (TransformerEncoder): the encoder
91
+ decoder (TransformerDecoder): the decoder
92
+
93
+ The Transformer model provides the following named architectures and
94
+ command-line arguments:
95
+
96
+ .. argparse::
97
+ :ref: fairseq.models.transformer_parser
98
+ :prog:
99
+ """
100
+
101
+ def __init__(self, args, encoder, decoder):
102
+ super().__init__(encoder, decoder)
103
+ self.args = args
104
+ self.supports_align_args = True
105
+
106
+ @staticmethod
107
+ def add_args(parser):
108
+ """Add model-specific arguments to the parser."""
109
+ # fmt: off
110
+ parser.add_argument('--activation-fn',
111
+ choices=utils.get_available_activation_fns(),
112
+ help='activation function to use')
113
+ parser.add_argument('--dropout', type=float, metavar='D',
114
+ help='dropout probability')
115
+ parser.add_argument('--attention-dropout', type=float, metavar='D',
116
+ help='dropout probability for attention weights')
117
+ parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
118
+ help='dropout probability after activation in FFN.')
119
+ parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
120
+ help='path to pre-trained encoder embedding')
121
+ parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
122
+ help='encoder embedding dimension')
123
+ parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
124
+ help='encoder embedding dimension for FFN')
125
+ parser.add_argument('--encoder-layers', type=int, metavar='N',
126
+ help='num encoder layers')
127
+ parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
128
+ help='num encoder attention heads')
129
+ parser.add_argument('--encoder-normalize-before', action='store_true',
130
+ help='apply layernorm before each encoder block')
131
+ parser.add_argument('--encoder-learned-pos', action='store_true',
132
+ help='use learned positional embeddings in the encoder')
133
+ parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
134
+ help='path to pre-trained decoder embedding')
135
+ parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
136
+ help='decoder embedding dimension')
137
+ parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
138
+ help='decoder embedding dimension for FFN')
139
+ parser.add_argument('--decoder-layers', type=int, metavar='N',
140
+ help='num decoder layers')
141
+ parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
142
+ help='num decoder attention heads')
143
+ parser.add_argument('--decoder-learned-pos', action='store_true',
144
+ help='use learned positional embeddings in the decoder')
145
+ parser.add_argument('--decoder-normalize-before', action='store_true',
146
+ help='apply layernorm before each decoder block')
147
+ parser.add_argument('--decoder-output-dim', type=int, metavar='N',
148
+ help='decoder output dimension (extra linear layer '
149
+ 'if different from decoder embed dim)')
150
+ parser.add_argument('--share-decoder-input-output-embed', action='store_true',
151
+ help='share decoder input and output embeddings')
152
+ parser.add_argument('--share-all-embeddings', action='store_true',
153
+ help='share encoder, decoder and output embeddings'
154
+ ' (requires shared dictionary and embed dim)')
155
+ parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
156
+ help='if set, disables positional embeddings (outside self attention)')
157
+ parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
158
+ help='comma separated list of adaptive softmax cutoff points. '
159
+ 'Must be used with adaptive_loss criterion')
160
+ parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
161
+ help='sets adaptive softmax dropout for the tail projections')
162
+ parser.add_argument('--layernorm-embedding', action='store_true',
163
+ help='add layernorm to embedding')
164
+ parser.add_argument('--no-scale-embedding', action='store_true',
165
+ help='if True, do not scale embeddings')
166
+ parser.add_argument('--checkpoint-activations', action='store_true',
167
+ help='checkpoint activations at each layer, which saves GPU '
168
+ 'memory usage at the cost of some additional compute')
169
+ parser.add_argument('--offload-activations', action='store_true',
170
+ help='checkpoint activations at each layer, then offload them to CPU. Sets --checkpoint-activations.')
171
+ # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
172
+ parser.add_argument('--no-cross-attention', default=False, action='store_true',
173
+ help='do not perform cross-attention')
174
+ parser.add_argument('--cross-self-attention', default=False, action='store_true',
175
+ help='perform cross+self-attention')
176
+ # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
177
+ parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
178
+ help='LayerDrop probability for encoder')
179
+ parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
180
+ help='LayerDrop probability for decoder')
181
+ parser.add_argument('--encoder-layers-to-keep', default=None,
182
+ help='which layers to *keep* when pruning as a comma-separated list')
183
+ parser.add_argument('--decoder-layers-to-keep', default=None,
184
+ help='which layers to *keep* when pruning as a comma-separated list')
185
+ # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
186
+ parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
187
+ help='iterative PQ quantization noise at training time')
188
+ parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
189
+ help='block size of quantization noise at training time')
190
+ parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
191
+ help='scalar quantization noise and scalar quantization at training time')
192
+ # args for Fully Sharded Data Parallel (FSDP) training
193
+ parser.add_argument(
194
+ '--min-params-to-wrap', type=int, metavar='D', default=DEFAULT_MIN_PARAMS_TO_WRAP,
195
+ help=(
196
+ 'minimum number of params for a layer to be wrapped with FSDP() when '
197
+ 'training with --ddp-backend=fully_sharded. Smaller values will '
198
+ 'improve memory efficiency, but may make torch.distributed '
199
+ 'communication less efficient due to smaller input sizes. This option '
200
+ 'is set to 0 (i.e., always wrap) when --checkpoint-activations or '
201
+ '--offload-activations are passed.'
202
+ )
203
+ )
204
+
205
+ parser.add_argument('--resnet-drop-path-rate', type=float,
206
+ help='resnet drop path rate')
207
+ parser.add_argument('--encoder-drop-path-rate', type=float,
208
+ help='encoder drop path rate')
209
+ parser.add_argument('--decoder-drop-path-rate', type=float,
210
+ help='decoder drop path rate')
211
+
212
+ parser.add_argument('--token-bucket-size', type=int,
213
+ help='token bucket size')
214
+ parser.add_argument('--image-bucket-size', type=int,
215
+ help='image bucket size')
216
+
217
+ parser.add_argument('--attn-scale-factor', type=float,
218
+ help='attention scale factor')
219
+ parser.add_argument('--freeze-resnet', action='store_true',
220
+ help='freeze resnet')
221
+ parser.add_argument('--freeze-encoder-embedding', action='store_true',
222
+ help='freeze encoder token embedding')
223
+ parser.add_argument('--freeze-decoder-embedding', action='store_true',
224
+ help='freeze decoder token embedding')
225
+ parser.add_argument('--add-type-embedding', action='store_true',
226
+ help='add source/region/patch type embedding')
227
+
228
+ parser.add_argument('--resnet-type', choices=['resnet50', 'resnet101', 'resnet152'],
229
+ help='resnet type')
230
+ parser.add_argument('--resnet-model-path', type=str, metavar='STR',
231
+ help='path to load resnet')
232
+ parser.add_argument('--code-image-size', type=int,
233
+ help='code image size')
234
+ parser.add_argument('--patch-layernorm-embedding', action='store_true',
235
+ help='add layernorm to patch embedding')
236
+ parser.add_argument('--code-layernorm-embedding', action='store_true',
237
+ help='add layernorm to code embedding')
238
+ parser.add_argument('--entangle-position-embedding', action='store_true',
239
+ help='entangle position embedding')
240
+ parser.add_argument('--disable-entangle', action='store_true',
241
+ help='disable entangle')
242
+ parser.add_argument('--sync-bn', action='store_true',
243
+ help='sync batchnorm')
244
+
245
+ parser.add_argument('--scale-attn', action='store_true',
246
+ help='scale attn')
247
+ parser.add_argument('--scale-fc', action='store_true',
248
+ help='scale fc')
249
+ parser.add_argument('--scale-heads', action='store_true',
250
+ help='scale heads')
251
+ parser.add_argument('--scale-resids', action='store_true',
252
+ help='scale resids')
253
+ # fmt: on
254
+
255
+ @classmethod
256
+ def build_model(cls, args, task):
257
+ """Build a new model instance."""
258
+
259
+ # make sure all arguments are present in older models
260
+ base_architecture(args)
261
+
262
+ if args.encoder_layers_to_keep:
263
+ args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
264
+ if args.decoder_layers_to_keep:
265
+ args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
266
+
267
+ if getattr(args, "max_source_positions", None) is None:
268
+ args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
269
+ if getattr(args, "max_target_positions", None) is None:
270
+ args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
271
+
272
+ src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
273
+
274
+ if args.share_all_embeddings:
275
+ if src_dict != tgt_dict:
276
+ raise ValueError("--share-all-embeddings requires a joined dictionary")
277
+ if args.encoder_embed_dim != args.decoder_embed_dim:
278
+ raise ValueError(
279
+ "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
280
+ )
281
+ if args.decoder_embed_path and (
282
+ args.decoder_embed_path != args.encoder_embed_path
283
+ ):
284
+ raise ValueError(
285
+ "--share-all-embeddings not compatible with --decoder-embed-path"
286
+ )
287
+ encoder_embed_tokens = cls.build_embedding(
288
+ args, src_dict, args.encoder_embed_dim, args.encoder_embed_path
289
+ )
290
+ decoder_embed_tokens = encoder_embed_tokens
291
+ args.share_decoder_input_output_embed = True
292
+ else:
293
+ encoder_embed_tokens = cls.build_embedding(
294
+ args, src_dict, args.encoder_embed_dim, args.encoder_embed_path
295
+ )
296
+ decoder_embed_tokens = cls.build_embedding(
297
+ args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
298
+ )
299
+ if getattr(args, "freeze_encoder_embedding", False):
300
+ encoder_embed_tokens.weight.requires_grad = False
301
+ if getattr(args, "freeze_decoder_embedding", False):
302
+ decoder_embed_tokens.weight.requires_grad = False
303
+ if getattr(args, "offload_activations", False):
304
+ args.checkpoint_activations = True # offloading implies checkpointing
305
+ encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
306
+ decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
307
+ if not args.share_all_embeddings:
308
+ min_params_to_wrap = getattr(
309
+ args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP
310
+ )
311
+ # fsdp_wrap is a no-op when --ddp-backend != fully_sharded
312
+ encoder = fsdp_wrap(encoder, min_num_params=min_params_to_wrap)
313
+ decoder = fsdp_wrap(decoder, min_num_params=min_params_to_wrap)
314
+ return cls(args, encoder, decoder)
315
+
316
+ @classmethod
317
+ def build_embedding(cls, args, dictionary, embed_dim, path=None):
318
+ num_embeddings = len(dictionary)
319
+ padding_idx = dictionary.pad()
320
+
321
+ emb = Embedding(num_embeddings, embed_dim, padding_idx)
322
+ # if provided, load from preloaded dictionaries
323
+ if path:
324
+ embed_dict = utils.parse_embedding(path)
325
+ utils.load_embedding(embed_dict, dictionary, emb)
326
+ return emb
327
+
328
+ @classmethod
329
+ def build_encoder(cls, args, src_dict, embed_tokens):
330
+ return TransformerEncoder(args, src_dict, embed_tokens)
331
+
332
+ @classmethod
333
+ def build_decoder(cls, args, tgt_dict, embed_tokens):
334
+ return TransformerDecoder(
335
+ args,
336
+ tgt_dict,
337
+ embed_tokens,
338
+ no_encoder_attn=getattr(args, "no_cross_attention", False),
339
+ )
340
+
341
+ # TorchScript doesn't support optional arguments with variable length (**kwargs).
342
+ # The current workaround is to list the union of all arguments used by the child classes.
343
+ def forward(
344
+ self,
345
+ src_tokens,
346
+ src_lengths,
347
+ prev_output_tokens,
348
+ return_all_hiddens: bool = True,
349
+ features_only: bool = False,
350
+ alignment_layer: Optional[int] = None,
351
+ alignment_heads: Optional[int] = None,
352
+ ):
353
+ """
354
+ Run the forward pass for an encoder-decoder model.
355
+
356
+ Copied from the base class, but without ``**kwargs``,
357
+ which are not supported by TorchScript.
358
+ """
359
+ encoder_out = self.encoder(
360
+ src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens
361
+ )
362
+ decoder_out = self.decoder(
363
+ prev_output_tokens,
364
+ encoder_out=encoder_out,
365
+ features_only=features_only,
366
+ alignment_layer=alignment_layer,
367
+ alignment_heads=alignment_heads,
368
+ src_lengths=src_lengths,
369
+ return_all_hiddens=return_all_hiddens,
370
+ )
371
+ return decoder_out
372
+
373
+ # Since get_normalized_probs is defined in the Fairseq base model, which is not scriptable,
374
+ # we override it here to call the scriptable
375
+ # helper function from the base class.
376
+ @torch.jit.export
377
+ def get_normalized_probs(
378
+ self,
379
+ net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
380
+ log_probs: bool,
381
+ sample: Optional[Dict[str, Tensor]] = None,
382
+ ):
383
+ """Get normalized probabilities (or log probs) from a net's output."""
384
+ return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
385
+
386
+
387
+ class TransformerEncoder(FairseqEncoder):
388
+ """
389
+ Transformer encoder consisting of *args.encoder_layers* layers. Each layer
390
+ is a :class:`TransformerEncoderLayer`.
391
+
392
+ Args:
393
+ args (argparse.Namespace): parsed command-line arguments
394
+ dictionary (~fairseq.data.Dictionary): encoding dictionary
395
+ embed_tokens (torch.nn.Embedding): input embedding
396
+ """
397
+
398
+ def __init__(self, args, dictionary, embed_tokens):
399
+ self.args = args
400
+ super().__init__(dictionary)
401
+ self.register_buffer("version", torch.Tensor([3]))
402
+
403
+ self.dropout_module = FairseqDropout(
404
+ args.dropout, module_name=self.__class__.__name__
405
+ )
406
+ self.encoder_layerdrop = args.encoder_layerdrop
407
+
408
+ embed_dim = embed_tokens.embedding_dim
409
+ self.padding_idx = embed_tokens.padding_idx
410
+ self.max_source_positions = args.max_source_positions
411
+ self.num_attention_heads = args.encoder_attention_heads
412
+
413
+ self.embed_tokens = embed_tokens
414
+
415
+ self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
416
+
417
+ if getattr(args, "layernorm_embedding", False):
418
+ self.layernorm_embedding = LayerNorm(embed_dim)
419
+ else:
420
+ self.layernorm_embedding = None
421
+
422
+ if getattr(args, "add_type_embedding", False):
423
+ self.type_embedding = Embedding(2, embed_dim, padding_idx=None)
424
+ else:
425
+ self.type_embedding = None
426
+
427
+ if getattr(args, "sync_bn", False):
428
+ norm_layer = BatchNorm2d
429
+ else:
430
+ norm_layer = None
431
+
432
+ if args.resnet_type == 'resnet101':
433
+ self.embed_images = ResNet([3, 4, 23], norm_layer=norm_layer, drop_path_rate=args.resnet_drop_path_rate)
434
+ elif args.resnet_type == 'resnet152':
435
+ self.embed_images = ResNet([3, 8, 36], norm_layer=norm_layer, drop_path_rate=args.resnet_drop_path_rate)
436
+ elif args.resnet_type == 'resnet50':
437
+ self.embed_images = ResNet([3, 4, 6], norm_layer=norm_layer, drop_path_rate=args.resnet_drop_path_rate)
438
+ else:
439
+ raise NotImplementedError
440
+ self.image_proj = Linear(1024, embed_dim)
441
+ if getattr(args, "resnet_model_path", None):
442
+ print("load resnet {}".format(args.resnet_model_path))
443
+ resnet_state_dict = torch.load(self.args.resnet_model_path)
444
+ self.embed_images.load_state_dict(resnet_state_dict)
445
+ if getattr(args, "patch_layernorm_embedding", False):
446
+ self.patch_layernorm_embedding = LayerNorm(embed_dim)
447
+ else:
448
+ self.patch_layernorm_embedding = None
449
+
450
+ self.embed_positions = Embedding(args.max_source_positions + 2, embed_dim)
451
+ self.embed_image_positions = Embedding(args.image_bucket_size ** 2 + 1, embed_dim)
452
+ self.pos_ln = LayerNorm(embed_dim)
453
+ self.image_pos_ln = LayerNorm(embed_dim)
454
+ self.pos_scaling = float(embed_dim / args.encoder_attention_heads * args.attn_scale_factor) ** -0.5
455
+ self.pos_q_linear = nn.Linear(embed_dim, embed_dim)
456
+ self.pos_k_linear = nn.Linear(embed_dim, embed_dim)
457
+
458
+ if not args.adaptive_input and args.quant_noise_pq > 0:
459
+ self.quant_noise = apply_quant_noise_(
460
+ nn.Linear(embed_dim, embed_dim, bias=False),
461
+ args.quant_noise_pq,
462
+ args.quant_noise_pq_block_size,
463
+ )
464
+ else:
465
+ self.quant_noise = None
466
+
467
+ if self.encoder_layerdrop > 0.0:
468
+ self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
469
+ else:
470
+ self.layers = nn.ModuleList([])
471
+
472
+ dpr = [x.item() for x in torch.linspace(0, args.encoder_drop_path_rate, args.encoder_layers)]
473
+ self.layers.extend(
474
+ [self.build_encoder_layer(args, drop_path_rate=dpr[i]) for i in range(args.encoder_layers)]
475
+ )
476
+ self.num_layers = len(self.layers)
477
+
478
+ if args.encoder_normalize_before:
479
+ self.layer_norm = LayerNorm(embed_dim)
480
+ else:
481
+ self.layer_norm = None
482
+
483
+ token_bucket_size = args.token_bucket_size
484
+ token_num_rel_dis = 2 * token_bucket_size - 1
485
+ token_rp_bucket = make_token_bucket_position(token_bucket_size)
486
+ self.token_rel_pos_table_list = nn.ModuleList(
487
+ [Embedding(token_num_rel_dis, self.num_attention_heads, zero_init=True) for _ in range(args.encoder_layers)]
488
+ )
489
+
490
+ image_bucket_size = args.image_bucket_size
491
+ image_num_rel_dis = (2 * image_bucket_size - 1) * (2 * image_bucket_size - 1) + 3
492
+ image_rp_bucket = make_image_bucket_position(image_bucket_size, image_num_rel_dis)
493
+ self.image_rel_pos_table_list = nn.ModuleList(
494
+ [Embedding(image_num_rel_dis, self.num_attention_heads, zero_init=True) for _ in range(args.encoder_layers)]
495
+ )
496
+
497
+ self.register_buffer("token_rp_bucket", token_rp_bucket)
498
+ self.register_buffer("image_rp_bucket", image_rp_bucket)
499
+ self.entangle_position_embedding = args.entangle_position_embedding
500
+
501
+ def train(self, mode=True):
502
+ super(TransformerEncoder, self).train(mode)
503
+ if getattr(self.args, "freeze_resnet", False):
504
+ for m in self.embed_images.modules():
505
+ if isinstance(m, nn.BatchNorm2d):
506
+ m.eval()
507
+ m.weight.requires_grad = False
508
+ m.bias.requires_grad = False
509
+
510
+ def build_encoder_layer(self, args, drop_path_rate=0.0):
511
+ layer = TransformerEncoderLayer(args, drop_path_rate=drop_path_rate)
512
+ checkpoint = getattr(args, "checkpoint_activations", False)
513
+ if checkpoint:
514
+ offload_to_cpu = getattr(args, "offload_activations", False)
515
+ layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
516
+ # if we are checkpointing, enforce that FSDP always wraps the
517
+ # checkpointed layer, regardless of layer size
518
+ min_params_to_wrap = (
519
+ getattr(args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP)
520
+ if not checkpoint else 0
521
+ )
522
+ layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
523
+ return layer
524
+
525
+ def get_rel_pos_bias(self, x, idx):
526
+ seq_len = x.size(1)
527
+ rp_bucket = self.token_rp_bucket[:seq_len, :seq_len]
528
+ values = F.embedding(rp_bucket, self.token_rel_pos_table_list[idx].weight)
529
+ values = values.unsqueeze(0).expand(x.size(0), -1, -1, -1)
530
+ values = values.permute([0, 3, 1, 2])
531
+ return values.contiguous()
532
+
533
+ def get_image_rel_pos_bias(self, image_position_ids, idx):
534
+ bsz, seq_len = image_position_ids.shape
535
+ rp_bucket_size = self.image_rp_bucket.size(1)
536
+
537
+ rp_bucket = self.image_rp_bucket.unsqueeze(0).expand(
538
+ bsz, rp_bucket_size, rp_bucket_size
539
+ ).gather(1, image_position_ids[:, :, None].expand(bsz, seq_len, rp_bucket_size)
540
+ ).gather(2, image_position_ids[:, None, :].expand(bsz, seq_len, seq_len))
541
+ values = F.embedding(rp_bucket, self.image_rel_pos_table_list[idx].weight)
542
+ values = values.permute(0, 3, 1, 2)
543
+ return values
544
+
545
+ def get_patch_images_info(self, patch_images, sample_patch_num, device):
546
+ image_embed = self.embed_images(patch_images)
547
+ h, w = image_embed.shape[-2:]
548
+ image_num_patches = h * w
549
+ image_padding_mask = patch_images.new_zeros((patch_images.size(0), image_num_patches)).bool()
550
+ image_position_idx = torch.arange(w).unsqueeze(0).expand(h, w) + \
551
+ torch.arange(h).unsqueeze(1) * self.args.image_bucket_size + 1
552
+ image_position_idx = image_position_idx.view(-1).to(device)
553
+ image_position_ids = image_position_idx[None, :].expand(patch_images.size(0), image_num_patches)
554
+
555
+ image_embed = image_embed.flatten(2).transpose(1, 2)
556
+ if sample_patch_num is not None:
557
+ patch_orders = [
558
+ random.sample(range(image_num_patches), k=sample_patch_num)
559
+ for _ in range(patch_images.size(0))
560
+ ]
561
+ patch_orders = torch.LongTensor(patch_orders).to(device)
562
+ image_embed = image_embed.gather(
563
+ 1, patch_orders.unsqueeze(2).expand(-1, -1, image_embed.size(2))
564
+ )
565
+ image_num_patches = sample_patch_num
566
+ image_padding_mask = image_padding_mask.gather(1, patch_orders)
567
+ image_position_ids = image_position_ids.gather(1, patch_orders)
568
+ image_pos_embed = self.embed_image_positions(image_position_ids)
569
+
570
+ return image_embed, image_num_patches, image_padding_mask, image_position_ids, image_pos_embed
571
+
572
+ def forward_embedding(
573
+ self,
574
+ src_tokens,
575
+ image_embed: Optional[torch.Tensor] = None,
576
+ image_embed_2: Optional[torch.Tensor] = None,
577
+ token_embedding: Optional[torch.Tensor] = None,
578
+ pos_embed: Optional[torch.Tensor] = None,
579
+ image_pos_embed: Optional[torch.Tensor] = None,
580
+ image_pos_embed_2: Optional[torch.Tensor] = None
581
+ ):
582
+ # embed tokens and positions
583
+ if token_embedding is None:
584
+ token_embedding = self.embed_tokens(src_tokens)
585
+ x = embed = self.embed_scale * token_embedding
586
+ if self.entangle_position_embedding and pos_embed is not None:
587
+ x += pos_embed
588
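+ # type id 0 marks text tokens; image patches below use id 1 (and a second image uses
+ # id 2, which would need a type table larger than the two entries created in __init__)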
+ if self.type_embedding is not None:
589
+ x += self.type_embedding(src_tokens.new_zeros(x.size()[:2]))
590
+ if self.layernorm_embedding is not None:
591
+ x = self.layernorm_embedding(x)
592
+ x = self.dropout_module(x)
593
+ if self.quant_noise is not None:
594
+ x = self.quant_noise(x)
595
+
596
+ # embed raw images
597
+ if image_embed is not None:
598
+ image_embed = self.image_proj(image_embed)
599
+ image_x = image_embed = self.embed_scale * image_embed
600
+ if self.entangle_position_embedding and image_pos_embed is not None:
601
+ image_x += image_pos_embed
602
+ if self.type_embedding is not None:
603
+ image_x += self.type_embedding(src_tokens.new_ones(image_x.size()[:2]))
604
+ if self.patch_layernorm_embedding is not None:
605
+ image_x = self.patch_layernorm_embedding(image_x)
606
+ image_x = self.dropout_module(image_x)
607
+ if self.quant_noise is not None:
608
+ image_x = self.quant_noise(image_x)
609
+ x = torch.cat([image_x, x], dim=1)
610
+ embed = torch.cat([image_embed, embed], dim=1)
611
+
612
+ if image_embed_2 is not None:
613
+ assert self.type_embedding is not None
614
+ image_embed_2 = self.image_proj(image_embed_2)
615
+ image_x_2 = image_embed_2 = self.embed_scale * image_embed_2
616
+ if self.entangle_position_embedding and image_pos_embed_2 is not None:
617
+ image_x_2 += image_pos_embed_2
618
+ if self.type_embedding is not None:
619
+ image_x_2 += self.type_embedding(src_tokens.new_full(image_x_2.size()[:2], fill_value=2))
620
+ if self.patch_layernorm_embedding is not None:
621
+ image_x_2 = self.patch_layernorm_embedding(image_x_2)
622
+ image_x_2 = self.dropout_module(image_x_2)
623
+ if self.quant_noise is not None:
624
+ image_x_2 = self.quant_noise(image_x_2)
625
+ x = torch.cat([image_x_2, x], dim=1)
626
+ embed = torch.cat([image_embed_2, embed], dim=1)
627
+
628
+ return x, embed
629
+
630
+ def forward(
631
+ self,
632
+ src_tokens,
633
+ src_lengths,
634
+ patch_images: Optional[torch.Tensor] = None,
635
+ patch_images_2: Optional[torch.Tensor] = None,
636
+ patch_masks: Optional[torch.Tensor] = None,
637
+ code_masks: Optional[torch.Tensor] = None,
638
+ return_all_hiddens: bool = False,
639
+ token_embeddings: Optional[torch.Tensor] = None,
640
+ sample_patch_num: Optional[int] = None
641
+ ):
642
+ """
643
+ Args:
644
+ src_tokens (LongTensor): tokens in the source language of shape
645
+ `(batch, src_len)`
646
+ src_lengths (torch.LongTensor): lengths of each source sentence of
647
+ shape `(batch)`
648
+ return_all_hiddens (bool, optional): also return all of the
649
+ intermediate hidden states (default: False).
650
+ token_embeddings (torch.Tensor, optional): precomputed embeddings
651
+ default `None` will recompute embeddings
652
+
653
+ Returns:
654
+ dict:
655
+ - **encoder_out** (Tensor): the last encoder layer's output of
656
+ shape `(src_len, batch, embed_dim)`
657
+ - **encoder_padding_mask** (ByteTensor): the positions of
658
+ padding elements of shape `(batch, src_len)`
659
+ - **encoder_embedding** (Tensor): the (scaled) embedding lookup
660
+ of shape `(batch, src_len, embed_dim)`
661
+ - **encoder_states** (List[Tensor]): all intermediate
662
+ hidden states of shape `(src_len, batch, embed_dim)`.
663
+ Only populated if *return_all_hiddens* is True.
664
+ """
665
+ return self.forward_scriptable(src_tokens,
666
+ src_lengths,
667
+ patch_images,
668
+ patch_images_2,
669
+ patch_masks,
670
+ return_all_hiddens,
671
+ token_embeddings,
672
+ sample_patch_num)
673
+
674
+ # TorchScript doesn't support super(), so the scriptable subclass
675
+ # can't access the base class implementation in TorchScript.
676
+ # The current workaround is to add a helper function with a different name and
677
+ # call that helper from the scriptable subclass.
678
+ def forward_scriptable(
679
+ self,
680
+ src_tokens,
681
+ src_lengths,
682
+ patch_images: Optional[torch.Tensor] = None,
683
+ patch_images_2: Optional[torch.Tensor] = None,
684
+ patch_masks: Optional[torch.Tensor] = None,
685
+ return_all_hiddens: bool = False,
686
+ token_embeddings: Optional[torch.Tensor] = None,
687
+ sample_patch_num: Optional[int] = None
688
+ ):
689
+ """
690
+ Args:
691
+ src_tokens (LongTensor): tokens in the source language of shape
692
+ `(batch, src_len)`
693
+ src_lengths (torch.LongTensor): lengths of each source sentence of
694
+ shape `(batch)`
695
+ return_all_hiddens (bool, optional): also return all of the
696
+ intermediate hidden states (default: False).
697
+ token_embeddings (torch.Tensor, optional): precomputed embeddings
698
+ default `None` will recompute embeddings
699
+
700
+ Returns:
701
+ dict:
702
+ - **encoder_out** (Tensor): the last encoder layer's output of
703
+ shape `(src_len, batch, embed_dim)`
704
+ - **encoder_padding_mask** (ByteTensor): the positions of
705
+ padding elements of shape `(batch, src_len)`
706
+ - **encoder_embedding** (Tensor): the (scaled) embedding lookup
707
+ of shape `(batch, src_len, embed_dim)`
708
+ - **encoder_states** (List[Tensor]): all intermediate
709
+ hidden states of shape `(src_len, batch, embed_dim)`.
710
+ Only populated if *return_all_hiddens* is True.
711
+ """
712
+ image_embed = None
713
+ image_embed_2 = None
714
+ image_pos_embed = None
715
+ image_pos_embed_2 = None
716
+ if patch_images is not None:
717
+ image_embed, image_num_patches, image_padding_mask, image_position_ids, image_pos_embed = \
718
+ self.get_patch_images_info(patch_images, sample_patch_num, src_tokens.device)
719
+ image_padding_mask[~patch_masks] = True
720
+ if patch_images_2 is not None:
721
+ image_embed_2, image_num_patches_2, image_padding_mask_2, image_position_ids_2, image_pos_embed_2 = \
722
+ self.get_patch_images_info(patch_images_2, sample_patch_num, src_tokens.device)
723
+ image_padding_mask_2[~patch_masks] = True
724
+
725
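+ # image patches are prepended to the token sequence below, so their padding masks are
+ # concatenated in front of the token padding mask in the same order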
+ encoder_padding_mask = src_tokens.eq(self.padding_idx)
726
+ if patch_images is not None:
727
+ encoder_padding_mask = torch.cat([image_padding_mask, encoder_padding_mask], dim=1)
728
+ if patch_images_2 is not None:
729
+ encoder_padding_mask = torch.cat([image_padding_mask_2, encoder_padding_mask], dim=1)
730
+ has_pads = (src_tokens.device.type == "xla" or encoder_padding_mask.any())
731
+
732
+ pos_embed = self.embed_positions(utils.new_arange(src_tokens))
733
+ x, encoder_embedding = self.forward_embedding(
734
+ src_tokens, image_embed, image_embed_2, token_embeddings,
735
+ pos_embed, image_pos_embed, image_pos_embed_2
736
+ )
737
+
738
+ # account for padding while computing the representation
739
+ if has_pads:
740
+ x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))
741
+
742
+ # B x T x C -> T x B x C
743
+ x = x.transpose(0, 1)
744
+
745
+ pos_embed = self.pos_ln(pos_embed)
746
+ if patch_images is not None:
747
+ image_pos_embed = self.image_pos_ln(image_pos_embed)
748
+ pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1)
749
+ if patch_images_2 is not None:
750
+ image_pos_embed_2 = self.image_pos_ln(image_pos_embed_2)
751
+ pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1)
752
+
753
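+ # absolute-position bias: the layer-normed position embeddings are projected to
+ # per-head queries/keys and multiplied, giving a bias that is added to the
+ # self-attention logits of every encoder layer (keeping positions separate from content)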
+ pos_q = self.pos_q_linear(pos_embed).view(
754
+ x.size(1), x.size(0), self.num_attention_heads, -1
755
+ ).transpose(1, 2) * self.pos_scaling
756
+ pos_k = self.pos_k_linear(pos_embed).view(
757
+ x.size(1), x.size(0), self.num_attention_heads, -1
758
+ ).transpose(1, 2)
759
+ abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3))
760
+
761
+ encoder_states = []
762
+
763
+ if return_all_hiddens:
764
+ encoder_states.append(x)
765
+
766
+ # encoder layers
767
+ for idx, layer in enumerate(self.layers):
768
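+ # the per-layer relative-position bias is added block-wise on top of the absolute bias:
+ # the text-text block gets the token bucket bias, the image-image block(s) get the
+ # image bucket bias, and text-image blocks keep only the absolute bias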
+ self_attn_bias = abs_pos_bias.clone()
769
+ self_attn_bias[:, :, -src_tokens.size(1):, -src_tokens.size(1):] += self.get_rel_pos_bias(src_tokens, idx)
770
+ if patch_images_2 is not None:
771
+ self_attn_bias[:, :, :image_num_patches_2, :image_num_patches_2] += \
772
+ self.get_image_rel_pos_bias(image_position_ids_2, idx)
773
+ self_attn_bias[:, :, image_num_patches_2:image_num_patches_2+image_num_patches, image_num_patches_2:image_num_patches_2+image_num_patches] += \
774
+ self.get_image_rel_pos_bias(image_position_ids, idx)
775
+ elif patch_images is not None:
776
+ self_attn_bias[:, :, :x.size(0) - src_tokens.size(1), :x.size(0) - src_tokens.size(1)] += \
777
+ self.get_image_rel_pos_bias(image_position_ids, idx)
778
+ self_attn_bias = self_attn_bias.reshape(-1, x.size(0), x.size(0))
779
+
780
+ x = layer(
781
+ x, encoder_padding_mask=encoder_padding_mask if has_pads else None, self_attn_bias=self_attn_bias
782
+ )
783
+ if return_all_hiddens:
784
+ assert encoder_states is not None
785
+ encoder_states.append(x)
786
+
787
+ if self.layer_norm is not None:
788
+ x = self.layer_norm(x)
789
+
790
+ # The PyTorch Mobile lite interpreter does not support returning NamedTuple in
791
+ # `forward` so we use a dictionary instead.
792
+ # TorchScript does not support mixed values so the values are all lists.
793
+ # The empty list is equivalent to None.
794
+ return {
795
+ "encoder_out": [x], # T x B x C
796
+ "encoder_padding_mask": [encoder_padding_mask], # B x T
797
+ "encoder_embedding": [], # B x T x C
798
+ "encoder_states": encoder_states, # List[T x B x C]
799
+ "src_tokens": [],
800
+ "src_lengths": [],
801
+ "position_embeddings": [pos_embed], # B x T x C
802
+ }
803
+
804
+ @torch.jit.export
805
+ def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
806
+ """
807
+ Reorder encoder output according to *new_order*.
808
+
809
+ Args:
810
+ encoder_out: output from the ``forward()`` method
811
+ new_order (LongTensor): desired order
812
+
813
+ Returns:
814
+ *encoder_out* rearranged according to *new_order*
815
+ """
816
+ if len(encoder_out["encoder_out"]) == 0:
817
+ new_encoder_out = []
818
+ else:
819
+ new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)]
820
+ if len(encoder_out["encoder_padding_mask"]) == 0:
821
+ new_encoder_padding_mask = []
822
+ else:
823
+ new_encoder_padding_mask = [
824
+ encoder_out["encoder_padding_mask"][0].index_select(0, new_order)
825
+ ]
826
+ if len(encoder_out["encoder_embedding"]) == 0:
827
+ new_encoder_embedding = []
828
+ else:
829
+ new_encoder_embedding = [
830
+ encoder_out["encoder_embedding"][0].index_select(0, new_order)
831
+ ]
832
+
833
+ if len(encoder_out["src_tokens"]) == 0:
834
+ new_src_tokens = []
835
+ else:
836
+ new_src_tokens = [(encoder_out["src_tokens"][0]).index_select(0, new_order)]
837
+
838
+ if len(encoder_out["src_lengths"]) == 0:
839
+ new_src_lengths = []
840
+ else:
841
+ new_src_lengths = [(encoder_out["src_lengths"][0]).index_select(0, new_order)]
842
+
843
+ if len(encoder_out["position_embeddings"]) == 0:
844
+ new_position_embeddings = []
845
+ else:
846
+ new_position_embeddings = [(encoder_out["position_embeddings"][0]).index_select(0, new_order)]
847
+
848
+ encoder_states = encoder_out["encoder_states"]
849
+ if len(encoder_states) > 0:
850
+ for idx, state in enumerate(encoder_states):
851
+ encoder_states[idx] = state.index_select(1, new_order)
852
+
853
+ return {
854
+ "encoder_out": new_encoder_out, # T x B x C
855
+ "encoder_padding_mask": new_encoder_padding_mask, # B x T
856
+ "encoder_embedding": new_encoder_embedding, # B x T x C
857
+ "encoder_states": encoder_states, # List[T x B x C]
858
+ "src_tokens": new_src_tokens, # B x T
859
+ "src_lengths": new_src_lengths, # B x 1
860
+ "position_embeddings": new_position_embeddings, # B x T x C
861
+ }
862
+
863
+ def max_positions(self):
864
+ """Maximum input length supported by the encoder."""
865
+ return self.max_source_positions
868
+
869
+ def upgrade_state_dict_named(self, state_dict, name):
870
+ """Upgrade a (possibly old) state dict for new versions of fairseq."""
871
+ if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
872
+ weights_key = "{}.embed_positions.weights".format(name)
873
+ if weights_key in state_dict:
874
+ print("deleting {0}".format(weights_key))
875
+ del state_dict[weights_key]
876
+ state_dict[
877
+ "{}.embed_positions._float_tensor".format(name)
878
+ ] = torch.FloatTensor(1)
879
+ for i in range(self.num_layers):
880
+ # update layer norms
881
+ self.layers[i].upgrade_state_dict_named(
882
+ state_dict, "{}.layers.{}".format(name, i)
883
+ )
884
+
885
+ # version_key = "{}.version".format(name)
886
+ # if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2:
887
+ # # earlier checkpoints did not normalize after the stack of layers
888
+ # self.layer_norm = None
889
+ # self.normalize = False
890
+ # state_dict[version_key] = torch.Tensor([1])
891
+
892
+ prefix = name + "." if name != "" else ""
893
+ for param_name, param_tensor in self.state_dict().items():
894
+ if (prefix + param_name) not in state_dict:
895
+ state_dict[prefix + param_name] = self.state_dict()[param_name]
896
+
897
+ if len(state_dict["encoder.embed_image_positions.weight"]) < len(self.state_dict()["embed_image_positions.weight"]):
898
+ num_posids_to_add = len(self.state_dict()["embed_image_positions.weight"]) - len(state_dict["encoder.embed_image_positions.weight"])
899
+ embed_dim = state_dict["encoder.embed_image_positions.weight"].size(1)
900
+ new_pos_embed_to_add = torch.zeros(num_posids_to_add, embed_dim)
901
+ nn.init.normal_(new_pos_embed_to_add, mean=0, std=embed_dim ** -0.5)
902
+ new_pos_embed_to_add = new_pos_embed_to_add.to(
903
+ dtype=state_dict["encoder.embed_image_positions.weight"].dtype,
904
+ )
905
+ state_dict["encoder.embed_image_positions.weight"] = torch.cat(
906
+ [state_dict["encoder.embed_image_positions.weight"], new_pos_embed_to_add]
907
+ )
908
+ return state_dict
909
+
910
+
911
+ class TransformerDecoder(FairseqIncrementalDecoder):
912
+ """
913
+ Transformer decoder consisting of *args.decoder_layers* layers. Each layer
914
+ is a :class:`TransformerDecoderLayer`.
915
+
916
+ Args:
917
+ args (argparse.Namespace): parsed command-line arguments
918
+ dictionary (~fairseq.data.Dictionary): decoding dictionary
919
+ embed_tokens (torch.nn.Embedding): output embedding
920
+ no_encoder_attn (bool, optional): whether to attend to encoder outputs
921
+ (default: False).
922
+ """
923
+
924
+ def __init__(
925
+ self,
926
+ args,
927
+ dictionary,
928
+ embed_tokens,
929
+ no_encoder_attn=False,
930
+ output_projection=None,
931
+ ):
932
+ self.args = args
933
+ super().__init__(dictionary)
934
+ self.register_buffer("version", torch.Tensor([3]))
935
+ self._future_mask = torch.empty(0)
936
+
937
+ self.dropout_module = FairseqDropout(
938
+ args.dropout, module_name=self.__class__.__name__
939
+ )
940
+ self.decoder_layerdrop = args.decoder_layerdrop
941
+ self.share_input_output_embed = args.share_decoder_input_output_embed
942
+ self.num_attention_heads = args.decoder_attention_heads
943
+
944
+ input_embed_dim = embed_tokens.embedding_dim
945
+ embed_dim = args.decoder_embed_dim
946
+ self.embed_dim = embed_dim
947
+ self.output_embed_dim = args.decoder_output_dim
948
+
949
+ self.padding_idx = embed_tokens.padding_idx
950
+ self.max_target_positions = args.max_target_positions
951
+
952
+ self.embed_tokens = embed_tokens
953
+
954
+ self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
955
+
956
+ if not args.adaptive_input and args.quant_noise_pq > 0:
957
+ self.quant_noise = apply_quant_noise_(
958
+ nn.Linear(embed_dim, embed_dim, bias=False),
959
+ args.quant_noise_pq,
960
+ args.quant_noise_pq_block_size,
961
+ )
962
+ else:
963
+ self.quant_noise = None
964
+
965
+ self.project_in_dim = (
966
+ Linear(input_embed_dim, embed_dim, bias=False)
967
+ if embed_dim != input_embed_dim
968
+ else None
969
+ )
970
+
971
+ if getattr(args, "layernorm_embedding", False):
972
+ self.layernorm_embedding = LayerNorm(embed_dim)
973
+ else:
974
+ self.layernorm_embedding = None
975
+
976
+ self.window_size = args.code_image_size // 8
977
+
978
+ self.embed_positions = Embedding(args.max_target_positions + 2, embed_dim)
979
+ self.embed_image_positions = Embedding(args.image_bucket_size ** 2 + 1, embed_dim)
980
+ self.pos_ln = LayerNorm(embed_dim)
981
+ self.image_pos_ln = LayerNorm(embed_dim)
982
+ self.pos_scaling = float(embed_dim / self.num_attention_heads * args.attn_scale_factor) ** -0.5
983
+ self.self_pos_q_linear = nn.Linear(embed_dim, embed_dim)
984
+ self.self_pos_k_linear = nn.Linear(embed_dim, embed_dim)
985
+ self.cross_pos_q_linear = nn.Linear(embed_dim, embed_dim)
986
+ self.cross_pos_k_linear = nn.Linear(embed_dim, embed_dim)
987
+
988
+ if getattr(args, "code_layernorm_embedding", False):
989
+ self.code_layernorm_embedding = LayerNorm(embed_dim)
990
+ else:
991
+ self.code_layernorm_embedding = None
992
+
993
+ self.cross_self_attention = getattr(args, "cross_self_attention", False)
994
+
995
+ if self.decoder_layerdrop > 0.0:
996
+ self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
997
+ else:
998
+ self.layers = nn.ModuleList([])
999
+
1000
+ dpr = [x.item() for x in torch.linspace(0, args.decoder_drop_path_rate, args.decoder_layers)]
1001
+ self.layers.extend(
1002
+ [
1003
+ self.build_decoder_layer(args, no_encoder_attn, drop_path_rate=dpr[i])
1004
+ for i in range(args.decoder_layers)
1005
+ ]
1006
+ )
1007
+ self.num_layers = len(self.layers)
1008
+
1009
+ if args.decoder_normalize_before:
1010
+ self.layer_norm = LayerNorm(embed_dim)
1011
+ else:
1012
+ self.layer_norm = None
1013
+
1014
+ self.project_out_dim = (
1015
+ Linear(embed_dim, self.output_embed_dim, bias=False)
1016
+ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
1017
+ else None
1018
+ )
1019
+
1020
+ self.adaptive_softmax = None
1021
+ self.output_projection = output_projection
1022
+ if self.output_projection is None:
1023
+ self.build_output_projection(args, dictionary, embed_tokens)
1024
+
1025
+ token_bucket_size = args.token_bucket_size
1026
+ token_num_rel_dis = 2 * token_bucket_size - 1
1027
+ token_rp_bucket = make_token_bucket_position(token_bucket_size)
1028
+ self.token_rel_pos_table_list = nn.ModuleList(
1029
+ [Embedding(token_num_rel_dis, self.num_attention_heads, zero_init=True) for _ in range(args.decoder_layers)]
1030
+ )
1031
+
1032
+ image_bucket_size = args.image_bucket_size
1033
+ image_num_rel_dis = (2 * image_bucket_size - 1) * (2 * image_bucket_size - 1) + 3
1034
+ image_rp_bucket = make_image_bucket_position(image_bucket_size, image_num_rel_dis)
1035
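+ # raster-order position ids for the code (image-generation) window, with index 0 for
+ # the BOS slot; the tail is padded with a fixed id (the 1024 / 768 constants appear to
+ # be hard-coded for the default image_bucket_size and maximum code length)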
+ image_position_idx = torch.arange(self.window_size).unsqueeze(0).expand(self.window_size, self.window_size) + \
1036
+ torch.arange(self.window_size).unsqueeze(1) * image_bucket_size + 1
1037
+ image_position_idx = torch.cat([torch.tensor([0]), image_position_idx.view(-1)])
1038
+ image_position_idx = torch.cat([image_position_idx, torch.tensor([1024] * 768)])
1039
+ self.image_rel_pos_table_list = nn.ModuleList(
1040
+ [Embedding(image_num_rel_dis, self.num_attention_heads, zero_init=True) for _ in range(args.decoder_layers)]
1041
+ )
1042
+
1043
+ self.register_buffer("token_rp_bucket", token_rp_bucket)
1044
+ self.register_buffer("image_rp_bucket", image_rp_bucket)
1045
+ self.register_buffer("image_position_idx", image_position_idx)
1046
+ self.entangle_position_embedding = args.entangle_position_embedding
1047
+
1048
+ def build_output_projection(self, args, dictionary, embed_tokens):
1049
+ if args.adaptive_softmax_cutoff is not None:
1050
+ self.adaptive_softmax = AdaptiveSoftmax(
1051
+ len(dictionary),
1052
+ self.output_embed_dim,
1053
+ utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
1054
+ dropout=args.adaptive_softmax_dropout,
1055
+ adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
1056
+ factor=args.adaptive_softmax_factor,
1057
+ tie_proj=args.tie_adaptive_proj,
1058
+ )
1059
+ elif self.share_input_output_embed:
1060
+ self.output_projection = nn.Linear(
1061
+ self.embed_tokens.weight.shape[1],
1062
+ self.embed_tokens.weight.shape[0],
1063
+ bias=False,
1064
+ )
1065
+ self.output_projection.weight = self.embed_tokens.weight
1066
+ else:
1067
+ self.output_projection = nn.Linear(
1068
+ self.output_embed_dim, len(dictionary), bias=False
1069
+ )
1070
+ nn.init.normal_(
1071
+ self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
1072
+ )
1073
+ num_base_layers = getattr(args, "base_layers", 0)
1074
+ for i in range(num_base_layers):
1075
+ self.layers.insert(((i+1) * args.decoder_layers) // (num_base_layers + 1), BaseLayer(args))
1076
+
1077
+ def build_decoder_layer(self, args, no_encoder_attn=False, drop_path_rate=0.0):
1078
+ layer = TransformerDecoderLayer(args, no_encoder_attn, drop_path_rate=drop_path_rate)
1079
+ checkpoint = getattr(args, "checkpoint_activations", False)
1080
+ if checkpoint:
1081
+ offload_to_cpu = getattr(args, "offload_activations", False)
1082
+ layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
1083
+ # if we are checkpointing, enforce that FSDP always wraps the
1084
+ # checkpointed layer, regardless of layer size
1085
+ min_params_to_wrap = (
1086
+ getattr(args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP)
1087
+ if not checkpoint else 0
1088
+ )
1089
+ layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
1090
+ return layer
1091
+
1092
+ def get_rel_pos_bias(self, x, idx):
1093
+ seq_len = x.size(1)
1094
+ rp_bucket = self.token_rp_bucket[:seq_len, :seq_len]
1095
+ values = F.embedding(rp_bucket, self.token_rel_pos_table_list[idx].weight)
1096
+ values = values.permute([2, 0, 1])
1097
+ return values.contiguous()
1098
+
1099
+ def get_image_rel_pos_bias(self, x, idx):
1100
+ seq_len = x.size(1)
1101
+ image_position_idx = self.image_position_idx[:seq_len]
1102
+ rp_bucket = self.image_rp_bucket[image_position_idx][:, image_position_idx]
1103
+ values = F.embedding(rp_bucket, self.image_rel_pos_table_list[idx].weight)
1104
+ values = values.permute(2, 0, 1)
1105
+ return values
1106
+
1107
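+ # builds the absolute-position attention bias for the decoder: self-attention pairs the
+ # target position embeddings with themselves, while cross-attention pairs target
+ # positions (query) with the encoder's position embeddings (key)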
+ def get_pos_info(self, tokens, tgt_pos_embed, src_pos_embed=None, use_image=False):
1108
+ batch_size = tokens.size(0)
1109
+ tgt_len = tokens.size(1)
1110
+ tgt_pos_embed = self.image_pos_ln(tgt_pos_embed) if use_image else self.pos_ln(tgt_pos_embed)
1111
+ if src_pos_embed is not None:
1112
+ src_len = src_pos_embed.size(1)
1113
+ pos_q = self.cross_pos_q_linear(tgt_pos_embed).view(
1114
+ batch_size, tgt_len, self.num_attention_heads, -1
1115
+ ).transpose(1, 2) * self.pos_scaling
1116
+ pos_k = self.cross_pos_k_linear(src_pos_embed).view(
1117
+ batch_size, src_len, self.num_attention_heads, -1
1118
+ ).transpose(1, 2)
1119
+ else:
1120
+ src_len = tgt_pos_embed.size(1)
1121
+ pos_q = self.self_pos_q_linear(tgt_pos_embed).view(
1122
+ batch_size, tgt_len, self.num_attention_heads, -1
1123
+ ).transpose(1, 2) * self.pos_scaling
1124
+ pos_k = self.self_pos_k_linear(tgt_pos_embed).view(
1125
+ batch_size, src_len, self.num_attention_heads, -1
1126
+ ).transpose(1, 2)
1127
+ abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3))
1128
+ return abs_pos_bias
1129
+
1130
+ def forward(
1131
+ self,
1132
+ prev_output_tokens,
1133
+ code_masks: Optional[torch.Tensor] = None,
1134
+ encoder_out: Optional[Dict[str, List[Tensor]]] = None,
1135
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
1136
+ features_only: bool = False,
1137
+ full_context_alignment: bool = False,
1138
+ alignment_layer: Optional[int] = None,
1139
+ alignment_heads: Optional[int] = None,
1140
+ src_lengths: Optional[Any] = None,
1141
+ return_all_hiddens: bool = False,
1142
+ ):
1143
+ """
1144
+ Args:
1145
+ prev_output_tokens (LongTensor): previous decoder outputs of shape
1146
+ `(batch, tgt_len)`, for teacher forcing
1147
+ encoder_out (optional): output from the encoder, used for
1148
+ encoder-side attention, should be of size T x B x C
1149
+ incremental_state (dict): dictionary used for storing state during
1150
+ :ref:`Incremental decoding`
1151
+ features_only (bool, optional): only return features without
1152
+ applying output layer (default: False).
1153
+ full_context_alignment (bool, optional): don't apply
1154
+ auto-regressive mask to self-attention (default: False).
1155
+
1156
+ Returns:
1157
+ tuple:
1158
+ - the decoder's output of shape `(batch, tgt_len, vocab)`
1159
+ - a dictionary with any model-specific outputs
1160
+ """
1161
+
1162
+ x, extra = self.extract_features(
1163
+ prev_output_tokens,
1164
+ code_masks=code_masks,
1165
+ encoder_out=encoder_out,
1166
+ incremental_state=incremental_state,
1167
+ full_context_alignment=full_context_alignment,
1168
+ alignment_layer=alignment_layer,
1169
+ alignment_heads=alignment_heads,
1170
+ )
1171
+
1172
+ if not features_only:
1173
+ x = self.output_layer(x)
1174
+ return x, extra
1175
+
1176
+ def extract_features(
1177
+ self,
1178
+ prev_output_tokens,
1179
+ code_masks: Optional[torch.Tensor],
1180
+ encoder_out: Optional[Dict[str, List[Tensor]]],
1181
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
1182
+ full_context_alignment: bool = False,
1183
+ alignment_layer: Optional[int] = None,
1184
+ alignment_heads: Optional[int] = None,
1185
+ ):
1186
+ return self.extract_features_scriptable(
1187
+ prev_output_tokens,
1188
+ code_masks,
1189
+ encoder_out,
1190
+ incremental_state,
1191
+ full_context_alignment,
1192
+ alignment_layer,
1193
+ alignment_heads,
1194
+ )
1195
+
1196
+ """
1197
+ A scriptable subclass of this class has an extract_features method and calls
1198
+ super().extract_features, but super() is not supported in torchscript. A copy of
1199
+ this function is made to be used in the subclass instead.
1200
+ """
1201
+
1202
+ def extract_features_scriptable(
1203
+ self,
1204
+ prev_output_tokens,
1205
+ code_masks: Optional[torch.Tensor],
1206
+ encoder_out: Optional[Dict[str, List[Tensor]]],
1207
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
1208
+ full_context_alignment: bool = False,
1209
+ alignment_layer: Optional[int] = None,
1210
+ alignment_heads: Optional[int] = None,
1211
+ ):
1212
+ """
1213
+ Similar to *forward* but only return features.
1214
+
1215
+ Includes several features from "Jointly Learning to Align and
1216
+ Translate with Transformer Models" (Garg et al., EMNLP 2019).
1217
+
1218
+ Args:
1219
+ full_context_alignment (bool, optional): don't apply
1220
+ auto-regressive mask to self-attention (default: False).
1221
+ alignment_layer (int, optional): return mean alignment over
1222
+ heads at this layer (default: last layer).
1223
+ alignment_heads (int, optional): only average alignment over
1224
+ this many heads (default: all heads).
1225
+
1226
+ Returns:
1227
+ tuple:
1228
+ - the decoder's features of shape `(batch, tgt_len, embed_dim)`
1229
+ - a dictionary with any model-specific outputs
1230
+ """
1231
+ bs, slen = prev_output_tokens.size()
1232
+ if alignment_layer is None:
1233
+ alignment_layer = self.num_layers - 1
1234
+
1235
+ enc: Optional[Tensor] = None
1236
+ padding_mask: Optional[Tensor] = None
1237
+ if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:
1238
+ enc = encoder_out["encoder_out"][0]
1239
+ assert (
1240
+ enc.size()[1] == bs
1241
+ ), f"Expected enc.shape == (t, {bs}, c) got {enc.shape}"
1242
+ if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
1243
+ padding_mask = encoder_out["encoder_padding_mask"][0]
1244
+
1245
+ bsz, tgt_len = prev_output_tokens.shape
1246
+ token_position_idx = utils.new_arange(prev_output_tokens)
1247
+ tgt_pos_embed = self.embed_positions(token_position_idx)
1248
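+ # for image-code targets, positions are taken from the image position table instead of
+ # the ordinary token position table, selected per sample via code_masks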
+ if code_masks is not None and torch.any(code_masks):
1249
+ image_position_idx = self.image_position_idx[:prev_output_tokens.size(1)].unsqueeze(0).expand(bsz, tgt_len)
1250
+ tgt_pos_embed[code_masks] = self.embed_image_positions(image_position_idx)[code_masks]
1251
+
1252
+ # self attn position bias
1253
+ self_abs_pos_bias = self.get_pos_info(prev_output_tokens, tgt_pos_embed, use_image=False)
1254
+ if code_masks is not None and torch.any(code_masks):
1255
+ self_image_abs_pos_bias = self.get_pos_info(prev_output_tokens, tgt_pos_embed, use_image=True)
1256
+ self_abs_pos_bias[code_masks] = self_image_abs_pos_bias[code_masks]
1257
+ # cross attn position bias
1258
+ src_pos_embed = encoder_out['position_embeddings'][0]
1259
+ cross_abs_pos_bias = self.get_pos_info(prev_output_tokens, tgt_pos_embed, src_pos_embed=src_pos_embed)
1260
+ if code_masks is not None and torch.any(code_masks):
1261
+ cross_image_abs_pos_bias = self.get_pos_info(prev_output_tokens, tgt_pos_embed, src_pos_embed=src_pos_embed, use_image=True)
1262
+ cross_abs_pos_bias[code_masks] = cross_image_abs_pos_bias[code_masks]
1263
+ cross_abs_pos_bias = cross_abs_pos_bias.reshape(-1, *cross_abs_pos_bias.size()[-2:])
1264
+
1265
+ all_prev_output_tokens = prev_output_tokens.clone()
1266
+ if incremental_state is not None:
1267
+ prev_output_tokens = prev_output_tokens[:, -1:]
1268
+ cross_abs_pos_bias = cross_abs_pos_bias[:, -1:, :]
1269
+ tgt_pos_embed = tgt_pos_embed[:, -1:, :]
1270
+
1271
+ # embed tokens and positions
1272
+ x = self.embed_scale * self.embed_tokens(prev_output_tokens)
1273
+
1274
+ if self.quant_noise is not None:
1275
+ x = self.quant_noise(x)
1276
+
1277
+ if self.project_in_dim is not None:
1278
+ x = self.project_in_dim(x)
1279
+
1280
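+ # note: entangle_position_embedding is stored as a bool here, so the `is not None`
+ # check below always passes; in effect the decoder adds tgt_pos_embed whenever
+ # --disable-entangle is not set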
+ if self.entangle_position_embedding is not None and not self.args.disable_entangle:
1281
+ x += tgt_pos_embed
1282
+
1283
+ if self.layernorm_embedding is not None:
1284
+ if code_masks is None or not code_masks.any() or not getattr(self, "code_layernorm_embedding", False):
1285
+ x = self.layernorm_embedding(x)
1286
+ elif code_masks is not None and code_masks.all():
1287
+ x = self.code_layernorm_embedding(x)
1288
+ else:
1289
+ x[~code_masks] = self.layernorm_embedding(x[~code_masks])
1290
+ x[code_masks] = self.code_layernorm_embedding(x[code_masks])
1291
+
1292
+ x = self.dropout_module(x)
1293
+
1294
+ # B x T x C -> T x B x C
1295
+ x = x.transpose(0, 1)
1296
+
1297
+ self_attn_padding_mask: Optional[Tensor] = None
1298
+ if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
1299
+ self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
1300
+
1301
+ # decoder layers
1302
+ attn: Optional[Tensor] = None
1303
+ inner_states: List[Optional[Tensor]] = [x]
1304
+ for idx, layer in enumerate(self.layers):
1305
+ if incremental_state is None and not full_context_alignment:
1306
+ self_attn_mask = self.buffered_future_mask(x)
1307
+ else:
1308
+ self_attn_mask = None
1309
+
1310
+ self_attn_bias = self_abs_pos_bias.clone()
1311
+ if code_masks is None or not code_masks.any():
1312
+ self_attn_bias += self.get_rel_pos_bias(all_prev_output_tokens, idx).unsqueeze(0)
1313
+ elif code_masks is not None and code_masks.all():
1314
+ self_attn_bias += self.get_image_rel_pos_bias(all_prev_output_tokens, idx).unsqueeze(0)
1315
+ else:
1316
+ self_attn_bias[~code_masks] += self.get_rel_pos_bias(all_prev_output_tokens, idx).unsqueeze(0)
1317
+ self_attn_bias[code_masks] += self.get_image_rel_pos_bias(all_prev_output_tokens, idx).unsqueeze(0)
1318
+ self_attn_bias = self_attn_bias.reshape(-1, *self_attn_bias.size()[-2:])
1319
+ if incremental_state is not None:
1320
+ self_attn_bias = self_attn_bias[:, -1:, :]
1321
+
1322
+ x, layer_attn, _ = layer(
1323
+ x,
1324
+ enc,
1325
+ padding_mask,
1326
+ incremental_state,
1327
+ self_attn_mask=self_attn_mask,
1328
+ self_attn_padding_mask=self_attn_padding_mask,
1329
+ need_attn=bool((idx == alignment_layer)),
1330
+ need_head_weights=bool((idx == alignment_layer)),
1331
+ self_attn_bias=self_attn_bias,
1332
+ cross_attn_bias=cross_abs_pos_bias
1333
+ )
1334
+ inner_states.append(x)
1335
+ if layer_attn is not None and idx == alignment_layer:
1336
+ attn = layer_attn.float().to(x)
1337
+
1338
+ if attn is not None:
1339
+ if alignment_heads is not None:
1340
+ attn = attn[:alignment_heads]
1341
+
1342
+ # average probabilities over heads
1343
+ attn = attn.mean(dim=0)
1344
+
1345
+ if self.layer_norm is not None:
1346
+ x = self.layer_norm(x)
1347
+
1348
+ # T x B x C -> B x T x C
1349
+ x = x.transpose(0, 1)
1350
+
1351
+ if self.project_out_dim is not None:
1352
+ x = self.project_out_dim(x)
1353
+
1354
+ return x, {"attn": [attn], "inner_states": inner_states}
1355
+
1356
+ def output_layer(self, features):
1357
+ """Project features to the vocabulary size."""
1358
+ if self.adaptive_softmax is None:
1359
+ # project back to size of vocabulary
1360
+ return self.output_projection(features)
1361
+ else:
1362
+ return features
1363
+
1364
+ def max_positions(self):
1365
+ """Maximum output length supported by the decoder."""
1366
+ return self.max_target_positions
1369
+
1370
+ def buffered_future_mask(self, tensor):
1371
+ dim = tensor.size(0)
1372
+ # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
1373
+ if (
1374
+ self._future_mask.size(0) == 0
1375
+ or (not self._future_mask.device == tensor.device)
1376
+ or self._future_mask.size(0) < dim
1377
+ ):
1378
+ self._future_mask = torch.triu(
1379
+ utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1
1380
+ )
1381
+ self._future_mask = self._future_mask.to(tensor)
1382
+ return self._future_mask[:dim, :dim]
1383
+
1384
+ def upgrade_state_dict_named(self, state_dict, name):
1385
+ """Upgrade a (possibly old) state dict for new versions of fairseq."""
1386
+ if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
1387
+ weights_key = "{}.embed_positions.weights".format(name)
1388
+ if weights_key in state_dict:
1389
+ del state_dict[weights_key]
1390
+ state_dict[
1391
+ "{}.embed_positions._float_tensor".format(name)
1392
+ ] = torch.FloatTensor(1)
1393
+
1394
+ if f"{name}.output_projection.weight" not in state_dict:
1395
+ if self.share_input_output_embed:
1396
+ embed_out_key = f"{name}.embed_tokens.weight"
1397
+ else:
1398
+ embed_out_key = f"{name}.embed_out"
1399
+ if embed_out_key in state_dict:
1400
+ state_dict[f"{name}.output_projection.weight"] = state_dict[
1401
+ embed_out_key
1402
+ ]
1403
+ if not self.share_input_output_embed:
1404
+ del state_dict[embed_out_key]
1405
+
1406
+ for i in range(self.num_layers):
1407
+ # update layer norms
1408
+ self.layers[i].upgrade_state_dict_named(
1409
+ state_dict, "{}.layers.{}".format(name, i)
1410
+ )
1411
+
1412
+ # version_key = "{}.version".format(name)
1413
+ # if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
1414
+ # # earlier checkpoints did not normalize after the stack of layers
1415
+ # self.layer_norm = None
1416
+ # self.normalize = False
1417
+ # state_dict[version_key] = torch.Tensor([1])
1418
+
1419
+ prefix = name + "." if name != "" else ""
1420
+ image_params = ["image_position_idx"]
1421
+ for image_param in image_params:
1422
+ state_dict[prefix + image_param] = self.state_dict()[image_param]
1423
+ for param_name, param_tensor in self.state_dict().items():
1424
+ if (prefix + param_name) not in state_dict:
1425
+ state_dict[prefix + param_name] = self.state_dict()[param_name]
1426
+
1427
+ if len(state_dict["decoder.embed_image_positions.weight"]) < len(self.state_dict()["embed_image_positions.weight"]):
1428
+ num_posids_to_add = len(self.state_dict()["embed_image_positions.weight"]) - len(state_dict["decoder.embed_image_positions.weight"])
1429
+ embed_dim = state_dict["decoder.embed_image_positions.weight"].size(1)
1430
+ new_pos_embed_to_add = torch.zeros(num_posids_to_add, embed_dim)
1431
+ nn.init.normal_(new_pos_embed_to_add, mean=0, std=embed_dim ** -0.5)
1432
+ new_pos_embed_to_add = new_pos_embed_to_add.to(
1433
+ dtype=state_dict["decoder.embed_image_positions.weight"].dtype,
1434
+ )
1435
+ state_dict["decoder.embed_image_positions.weight"] = torch.cat(
1436
+ [state_dict["decoder.embed_image_positions.weight"], new_pos_embed_to_add]
1437
+ )
1438
+ return state_dict
1439
+
1440
+
1441
+ def Embedding(num_embeddings, embedding_dim, padding_idx=None, zero_init=False):
1442
+ m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
1443
+ nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
1444
+ if padding_idx is not None:
1445
+ nn.init.constant_(m.weight[padding_idx], 0)
1446
+ if zero_init:
1447
+ nn.init.constant_(m.weight, 0)
1448
+ return m
1449
+
1450
+
1451
+ def Linear(in_features, out_features, bias=True):
1452
+ m = nn.Linear(in_features, out_features, bias)
1453
+ nn.init.xavier_uniform_(m.weight)
1454
+ if bias:
1455
+ nn.init.constant_(m.bias, 0.0)
1456
+ return m
1457
+
1458
+
1459
+ @register_model_architecture("unify_transformer", "unify_transformer")
1460
+ def base_architecture(args):
1461
+ args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
1462
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
1463
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
1464
+ args.encoder_layers = getattr(args, "encoder_layers", 6)
1465
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
1466
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
1467
+ args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
1468
+ args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
1469
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
1470
+ args.decoder_ffn_embed_dim = getattr(
1471
+ args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
1472
+ )
1473
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
1474
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
1475
+ args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
1476
+ args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
1477
+ args.attention_dropout = getattr(args, "attention_dropout", 0.0)
1478
+ args.activation_dropout = getattr(args, "activation_dropout", 0.0)
1479
+ args.activation_fn = getattr(args, "activation_fn", "relu")
1480
+ args.dropout = getattr(args, "dropout", 0.1)
1481
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
1482
+ args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
1483
+ args.share_decoder_input_output_embed = getattr(
1484
+ args, "share_decoder_input_output_embed", False
1485
+ )
1486
+ args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
1487
+ args.no_token_positional_embeddings = getattr(
1488
+ args, "no_token_positional_embeddings", False
1489
+ )
1490
+ args.adaptive_input = getattr(args, "adaptive_input", False)
1491
+ args.no_cross_attention = getattr(args, "no_cross_attention", False)
1492
+ args.cross_self_attention = getattr(args, "cross_self_attention", False)
1493
+
1494
+ args.decoder_output_dim = getattr(
1495
+ args, "decoder_output_dim", args.decoder_embed_dim
1496
+ )
1497
+ args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
1498
+
1499
+ args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
1500
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
1501
+ args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
1502
+ args.checkpoint_activations = getattr(args, "checkpoint_activations", False)
1503
+ args.offload_activations = getattr(args, "offload_activations", False)
1504
+ if args.offload_activations:
1505
+ args.checkpoint_activations = True
1506
+ args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
1507
+ args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None)
1508
+ args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
1509
+ args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0)
1510
+ args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
1511
+ args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8)
1512
+ args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0)
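The `base_architecture` hook above fills in every hyperparameter that was not supplied on the command line using the `getattr(args, name, default)` idiom. A minimal standalone sketch of that defaulting pattern, using a plain argparse.Namespace and a hypothetical subset of field names rather than the full fairseq argument set:

from argparse import Namespace

def fill_defaults(args):
    # Keep a user-supplied value if present, otherwise fall back to the
    # architecture default (same idiom as base_architecture above).
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
    args.encoder_layers = getattr(args, "encoder_layers", 6)
    return args

args = fill_defaults(Namespace(encoder_layers=12))
print(args.encoder_layers, args.encoder_embed_dim)  # -> 12 512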
models/ofa/unify_transformer_layer.py ADDED
@@ -0,0 +1,542 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from typing import Dict, List, Optional
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from fairseq import utils
11
+ from fairseq.modules import LayerNorm
12
+ from fairseq.modules.fairseq_dropout import FairseqDropout
13
+ from fairseq.modules.quant_noise import quant_noise
14
+ from torch import Tensor
15
+
16
+ from .unify_multihead_attention import MultiheadAttention
17
+
18
+
19
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
20
+ """
21
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
22
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
23
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
24
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
25
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
26
+ argument.
27
+ """
28
+ if drop_prob == 0.0 or not training:
29
+ return x
30
+ keep_prob = 1 - drop_prob
31
+ shape = (1, x.shape[1], 1)
32
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
33
+ random_tensor.floor_() # binarize
34
+ output = x.div(keep_prob) * random_tensor
35
+ return output
36
+
37
+
38
+ class DropPath(nn.Module):
39
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
40
+
41
+ def __init__(self, drop_prob=None):
42
+ super().__init__()
43
+ self.drop_prob = drop_prob
44
+
45
+ def forward(self, x):
46
+ return drop_path(x, self.drop_prob, self.training)
47
+
48
+ def extra_repr(self) -> str:
49
+ return "p={}".format(self.drop_prob)
50
+
51
+
52
+ class TransformerEncoderLayer(nn.Module):
53
+ """Encoder layer block.
54
+
55
+ In the original paper each operation (multi-head attention or FFN) is
56
+ postprocessed with: `dropout -> add residual -> layernorm`. In the
57
+ tensor2tensor code they suggest that learning is more robust when
58
+ preprocessing each layer with layernorm and postprocessing with:
59
+ `dropout -> add residual`. We default to the approach in the paper, but the
60
+ tensor2tensor approach can be enabled by setting
61
+ *args.encoder_normalize_before* to ``True``.
62
+
63
+ Args:
64
+ args (argparse.Namespace): parsed command-line arguments
65
+ """
66
+
67
+ def __init__(self, args, drop_path_rate=0.0):
68
+ super().__init__()
69
+ self.args = args
70
+ self.embed_dim = args.encoder_embed_dim
71
+ self.quant_noise = getattr(args, 'quant_noise_pq', 0)
72
+ self.quant_noise_block_size = getattr(args, 'quant_noise_pq_block_size', 8) or 8
73
+ self.self_attn = self.build_self_attention(self.embed_dim, args)
74
+ self.self_attn_layer_norm = LayerNorm(self.embed_dim)
75
+ self.dropout_module = FairseqDropout(
76
+ args.dropout, module_name=self.__class__.__name__
77
+ )
78
+ self.activation_fn = utils.get_activation_fn(
79
+ activation=getattr(args, 'activation_fn', 'relu') or "relu"
80
+ )
81
+ activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
82
+ if activation_dropout_p == 0:
83
+ # for backwards compatibility with models that use args.relu_dropout
84
+ activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
85
+ self.activation_dropout_module = FairseqDropout(
86
+ float(activation_dropout_p), module_name=self.__class__.__name__
87
+ )
88
+ self.normalize_before = args.encoder_normalize_before
89
+ self.fc1 = self.build_fc1(
90
+ self.embed_dim,
91
+ args.encoder_ffn_embed_dim,
92
+ self.quant_noise,
93
+ self.quant_noise_block_size,
94
+ )
95
+ self.fc2 = self.build_fc2(
96
+ args.encoder_ffn_embed_dim,
97
+ self.embed_dim,
98
+ self.quant_noise,
99
+ self.quant_noise_block_size,
100
+ )
101
+
102
+ self.attn_ln = LayerNorm(self.embed_dim) if getattr(args, 'scale_attn', False) else None
103
+ self.nh = self.self_attn.num_heads
104
+ self.head_dim = self.self_attn.head_dim
105
+
106
+ self.ffn_layernorm = LayerNorm(args.encoder_ffn_embed_dim) if getattr(args, 'scale_fc', False) else None
107
+ self.w_resid = nn.Parameter(torch.ones(self.embed_dim, ), requires_grad=True) if getattr(args, 'scale_resids', False) else None
108
+
109
+ self.final_layer_norm = LayerNorm(self.embed_dim)
110
+
111
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
112
+
113
+ def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
114
+ return quant_noise(
115
+ nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
116
+ )
117
+
118
+ def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
119
+ return quant_noise(
120
+ nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
121
+ )
122
+
123
+ def build_self_attention(self, embed_dim, args):
124
+ return MultiheadAttention(
125
+ embed_dim,
126
+ args.encoder_attention_heads,
127
+ dropout=args.attention_dropout,
128
+ self_attention=True,
129
+ q_noise=self.quant_noise,
130
+ qn_block_size=self.quant_noise_block_size,
131
+ scale_factor=args.attn_scale_factor,
132
+ scale_heads=getattr(args, 'scale_heads', False)
133
+ )
134
+
135
+ def residual_connection(self, x, residual):
136
+ return residual + self.drop_path(x)
137
+
138
+ def upgrade_state_dict_named(self, state_dict, name):
139
+ """
140
+ Rename layer norm states from `...layer_norms.0.weight` to
141
+ `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
142
+ `...final_layer_norm.weight`
143
+ """
144
+ layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"}
145
+ for old, new in layer_norm_map.items():
146
+ for m in ("weight", "bias"):
147
+ k = "{}.layer_norms.{}.{}".format(name, old, m)
148
+ if k in state_dict:
149
+ state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k]
150
+ del state_dict[k]
151
+ if "{}.{}.{}".format(name, new, m) not in state_dict and "{}.{}".format(new, m) in self.state_dict():
152
+ state_dict[
153
+ "{}.{}.{}".format(name, new, m)
154
+ ] = self.state_dict()["{}.{}".format(new, m)]
155
+
156
+ prefix = name + "." if name != "" else ""
157
+ for param_name, param_tensor in self.state_dict().items():
158
+ if (prefix + param_name) not in state_dict:
159
+ state_dict[prefix + param_name] = self.state_dict()[param_name]
160
+
161
+ def forward(
162
+ self,
163
+ x,
164
+ encoder_padding_mask: Optional[Tensor],
165
+ attn_mask: Optional[Tensor] = None,
166
+ self_attn_bias: Optional[Tensor] = None
167
+ ):
168
+ """
169
+ Args:
170
+ x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
171
+ encoder_padding_mask (ByteTensor): binary ByteTensor of shape
172
+ `(batch, seq_len)` where padding elements are indicated by ``1``.
173
+ attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`,
174
+ where `tgt_len` is the length of output and `src_len` is the
175
+ length of input, though here both are equal to `seq_len`.
176
+ `attn_mask[tgt_i, src_j] = 1` means that when calculating the
177
+ embedding for `tgt_i`, we exclude (mask out) `src_j`. This is
178
+ useful for strided self-attention.
179
+
180
+ Returns:
181
+ encoded output of shape `(seq_len, batch, embed_dim)`
182
+ """
183
+ # anything in original attn_mask = 1, becomes -1e8
184
+ # anything in original attn_mask = 0, becomes 0
185
+ # Note that we cannot use -inf here, because at some edge cases,
186
+ # the attention weight (before softmax) for some padded element in query
187
+ # will become -inf, which results in NaN in model parameters
188
+ if attn_mask is not None:
189
+ attn_mask = attn_mask.masked_fill(
190
+ attn_mask.to(torch.bool),
191
+ -1e8 if x.dtype == torch.float32 else -1e4
192
+ )
193
+
194
+ residual = x
195
+ if self.normalize_before:
196
+ x = self.self_attn_layer_norm(x)
197
+ x, _ = self.self_attn(
198
+ query=x,
199
+ key=x,
200
+ value=x,
201
+ key_padding_mask=encoder_padding_mask,
202
+ need_weights=False,
203
+ attn_mask=attn_mask,
204
+ attn_bias=self_attn_bias
205
+ )
206
+ if self.attn_ln is not None:
207
+ x = self.attn_ln(x)
208
+ x = self.dropout_module(x)
209
+ x = self.residual_connection(x, residual)
210
+ if not self.normalize_before:
211
+ x = self.self_attn_layer_norm(x)
212
+
213
+ residual = x
214
+ if self.normalize_before:
215
+ x = self.final_layer_norm(x)
216
+ x = self.activation_fn(self.fc1(x))
217
+ x = self.activation_dropout_module(x)
218
+ if self.ffn_layernorm is not None:
219
+ x = self.ffn_layernorm(x)
220
+ x = self.fc2(x)
221
+ x = self.dropout_module(x)
222
+ if self.w_resid is not None:
223
+ residual = torch.mul(self.w_resid, residual)
224
+ x = self.residual_connection(x, residual)
225
+ if not self.normalize_before:
226
+ x = self.final_layer_norm(x)
227
+ return x
228
+
229
+
230
+ class TransformerDecoderLayer(nn.Module):
231
+ """Decoder layer block.
232
+
233
+ In the original paper each operation (multi-head attention, encoder
234
+ attention or FFN) is postprocessed with: `dropout -> add residual ->
235
+ layernorm`. In the tensor2tensor code they suggest that learning is more
236
+ robust when preprocessing each layer with layernorm and postprocessing with:
237
+ `dropout -> add residual`. We default to the approach in the paper, but the
238
+ tensor2tensor approach can be enabled by setting
239
+ *args.decoder_normalize_before* to ``True``.
240
+
241
+ Args:
242
+ args (argparse.Namespace): parsed command-line arguments
243
+ no_encoder_attn (bool, optional): whether to attend to encoder outputs
244
+ (default: False).
245
+ """
246
+
247
+ def __init__(
248
+ self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False, drop_path_rate=0.0
249
+ ):
250
+ super().__init__()
251
+ self.embed_dim = args.decoder_embed_dim
252
+ self.dropout_module = FairseqDropout(
253
+ args.dropout, module_name=self.__class__.__name__
254
+ )
255
+ self.quant_noise = getattr(args, "quant_noise_pq", 0)
256
+ self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)
257
+
258
+ self.cross_self_attention = getattr(args, "cross_self_attention", False)
259
+
260
+ self.self_attn = self.build_self_attention(
261
+ self.embed_dim,
262
+ args,
263
+ add_bias_kv=add_bias_kv,
264
+ add_zero_attn=add_zero_attn,
265
+ )
266
+ self.self_attn_ln = LayerNorm(self.embed_dim) if getattr(args, 'scale_attn', False) else None
267
+ self.cross_attn_ln = LayerNorm(self.embed_dim) if getattr(args, 'scale_attn', False) else None
268
+ self.nh = self.self_attn.num_heads
269
+ self.head_dim = self.self_attn.head_dim
270
+
271
+ self.activation_fn = utils.get_activation_fn(
272
+ activation=str(args.activation_fn)
273
+ if getattr(args, "activation_fn", None) is not None
274
+ else "relu"
275
+ )
276
+ activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
277
+ if activation_dropout_p == 0:
278
+ # for backwards compatibility with models that use args.relu_dropout
279
+ activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
280
+ self.activation_dropout_module = FairseqDropout(
281
+ float(activation_dropout_p), module_name=self.__class__.__name__
282
+ )
283
+ self.normalize_before = args.decoder_normalize_before
284
+
285
+ # use layerNorm rather than FusedLayerNorm for exporting.
286
+ # char_inputs can be used to determine this.
287
+ # TODO remove this once we update apex with the fix
288
+ export = getattr(args, "char_inputs", False)
289
+ self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
290
+
291
+ if no_encoder_attn:
292
+ self.encoder_attn = None
293
+ self.encoder_attn_layer_norm = None
294
+ else:
295
+ self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
296
+ self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
297
+
298
+ self.ffn_layernorm = LayerNorm(args.decoder_ffn_embed_dim) if getattr(args, 'scale_fc', False) else None
299
+ self.w_resid = nn.Parameter(torch.ones(self.embed_dim, ), requires_grad=True) if getattr(args, 'scale_resids', False) else None
300
+
301
+ self.fc1 = self.build_fc1(
302
+ self.embed_dim,
303
+ args.decoder_ffn_embed_dim,
304
+ self.quant_noise,
305
+ self.quant_noise_block_size,
306
+ )
307
+ self.fc2 = self.build_fc2(
308
+ args.decoder_ffn_embed_dim,
309
+ self.embed_dim,
310
+ self.quant_noise,
311
+ self.quant_noise_block_size,
312
+ )
313
+
314
+ self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
315
+ self.need_attn = True
316
+
317
+ self.onnx_trace = False
318
+
319
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
320
+
321
+ def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
322
+ return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
323
+
324
+ def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
325
+ return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
326
+
327
+ def build_self_attention(
328
+ self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
329
+ ):
330
+ return MultiheadAttention(
331
+ embed_dim,
332
+ args.decoder_attention_heads,
333
+ dropout=args.attention_dropout,
334
+ add_bias_kv=add_bias_kv,
335
+ add_zero_attn=add_zero_attn,
336
+ self_attention=not getattr(args, "cross_self_attention", False),
337
+ q_noise=self.quant_noise,
338
+ qn_block_size=self.quant_noise_block_size,
339
+ scale_factor=args.attn_scale_factor,
340
+ scale_heads=getattr(args, 'scale_heads', False)
341
+ )
342
+
343
+ def build_encoder_attention(self, embed_dim, args):
344
+ return MultiheadAttention(
345
+ embed_dim,
346
+ args.decoder_attention_heads,
347
+ kdim=getattr(args, "encoder_embed_dim", None),
348
+ vdim=getattr(args, "encoder_embed_dim", None),
349
+ dropout=args.attention_dropout,
350
+ encoder_decoder_attention=True,
351
+ q_noise=self.quant_noise,
352
+ qn_block_size=self.quant_noise_block_size,
353
+ scale_factor=args.attn_scale_factor,
354
+ scale_heads=getattr(args, 'scale_heads', False)
355
+ )
356
+
357
+ def prepare_for_onnx_export_(self):
358
+ self.onnx_trace = True
359
+
360
+ def residual_connection(self, x, residual):
361
+ return residual + self.drop_path(x)
362
+
363
+ def forward(
364
+ self,
365
+ x,
366
+ encoder_out: Optional[torch.Tensor] = None,
367
+ encoder_padding_mask: Optional[torch.Tensor] = None,
368
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
369
+ prev_self_attn_state: Optional[List[torch.Tensor]] = None,
370
+ prev_attn_state: Optional[List[torch.Tensor]] = None,
371
+ self_attn_mask: Optional[torch.Tensor] = None,
372
+ self_attn_padding_mask: Optional[torch.Tensor] = None,
373
+ need_attn: bool = False,
374
+ need_head_weights: bool = False,
375
+ self_attn_bias: Optional[Tensor] = None,
376
+ cross_attn_bias: Optional[Tensor] = None
377
+ ):
378
+ """
379
+ Args:
380
+ x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
381
+ encoder_padding_mask (ByteTensor, optional): binary
382
+ ByteTensor of shape `(batch, src_len)` where padding
383
+ elements are indicated by ``1``.
384
+ need_attn (bool, optional): return attention weights
385
+ need_head_weights (bool, optional): return attention weights
386
+ for each head (default: return average over heads).
387
+
388
+ Returns:
389
+ encoded output of shape `(seq_len, batch, embed_dim)`
390
+ """
391
+ if need_head_weights:
392
+ need_attn = True
393
+
394
+ residual = x
395
+ if self.normalize_before:
396
+ x = self.self_attn_layer_norm(x)
397
+ if prev_self_attn_state is not None:
398
+ prev_key, prev_value = prev_self_attn_state[:2]
399
+ saved_state: Dict[str, Optional[Tensor]] = {
400
+ "prev_key": prev_key,
401
+ "prev_value": prev_value,
402
+ }
403
+ if len(prev_self_attn_state) >= 3:
404
+ saved_state["prev_key_padding_mask"] = prev_self_attn_state[2]
405
+ assert incremental_state is not None
406
+ self.self_attn._set_input_buffer(incremental_state, saved_state)
407
+ _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state)
408
+ if self.cross_self_attention and not (
409
+ incremental_state is not None
410
+ and _self_attn_input_buffer is not None
411
+ and "prev_key" in _self_attn_input_buffer
412
+ ):
413
+ if self_attn_mask is not None:
414
+ assert encoder_out is not None
415
+ self_attn_mask = torch.cat(
416
+ (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1
417
+ )
418
+ if self_attn_padding_mask is not None:
419
+ if encoder_padding_mask is None:
420
+ assert encoder_out is not None
421
+ encoder_padding_mask = self_attn_padding_mask.new_zeros(
422
+ encoder_out.size(1), encoder_out.size(0)
423
+ )
424
+ self_attn_padding_mask = torch.cat(
425
+ (encoder_padding_mask, self_attn_padding_mask), dim=1
426
+ )
427
+ assert encoder_out is not None
428
+ y = torch.cat((encoder_out, x), dim=0)
429
+ else:
430
+ y = x
431
+
432
+ x, attn = self.self_attn(
433
+ query=x,
434
+ key=y,
435
+ value=y,
436
+ key_padding_mask=self_attn_padding_mask,
437
+ incremental_state=incremental_state,
438
+ need_weights=False,
439
+ attn_mask=self_attn_mask,
440
+ attn_bias=self_attn_bias
441
+ )
442
+ if self.self_attn_ln is not None:
443
+ x = self.self_attn_ln(x)
444
+ x = self.dropout_module(x)
445
+ x = self.residual_connection(x, residual)
446
+ if not self.normalize_before:
447
+ x = self.self_attn_layer_norm(x)
448
+
449
+ if self.encoder_attn is not None and encoder_out is not None:
450
+ residual = x
451
+ if self.normalize_before:
452
+ x = self.encoder_attn_layer_norm(x)
453
+ if prev_attn_state is not None:
454
+ prev_key, prev_value = prev_attn_state[:2]
455
+ saved_state: Dict[str, Optional[Tensor]] = {
456
+ "prev_key": prev_key,
457
+ "prev_value": prev_value,
458
+ }
459
+ if len(prev_attn_state) >= 3:
460
+ saved_state["prev_key_padding_mask"] = prev_attn_state[2]
461
+ assert incremental_state is not None
462
+ self.encoder_attn._set_input_buffer(incremental_state, saved_state)
463
+
464
+ x, attn = self.encoder_attn(
465
+ query=x,
466
+ key=encoder_out,
467
+ value=encoder_out,
468
+ key_padding_mask=encoder_padding_mask,
469
+ incremental_state=incremental_state,
470
+ static_kv=True,
471
+ need_weights=need_attn or (not self.training and self.need_attn),
472
+ need_head_weights=need_head_weights,
473
+ attn_bias=cross_attn_bias
474
+ )
475
+ if self.cross_attn_ln is not None:
476
+ x = self.cross_attn_ln(x)
477
+ x = self.dropout_module(x)
478
+ x = self.residual_connection(x, residual)
479
+ if not self.normalize_before:
480
+ x = self.encoder_attn_layer_norm(x)
481
+
482
+ residual = x
483
+ if self.normalize_before:
484
+ x = self.final_layer_norm(x)
485
+
486
+ x = self.activation_fn(self.fc1(x))
487
+ x = self.activation_dropout_module(x)
488
+ if self.ffn_layernorm is not None:
489
+ x = self.ffn_layernorm(x)
490
+ x = self.fc2(x)
491
+ x = self.dropout_module(x)
492
+ if self.w_resid is not None:
493
+ residual = torch.mul(self.w_resid, residual)
494
+ x = self.residual_connection(x, residual)
495
+ if not self.normalize_before:
496
+ x = self.final_layer_norm(x)
497
+ if self.onnx_trace and incremental_state is not None:
498
+ saved_state = self.self_attn._get_input_buffer(incremental_state)
499
+ assert saved_state is not None
500
+ if self_attn_padding_mask is not None:
501
+ self_attn_state = [
502
+ saved_state["prev_key"],
503
+ saved_state["prev_value"],
504
+ saved_state["prev_key_padding_mask"],
505
+ ]
506
+ else:
507
+ self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]]
508
+ return x, attn, self_attn_state
509
+ return x, attn, None
510
+
511
+ def make_generation_fast_(self, need_attn: bool = False, **kwargs):
512
+ self.need_attn = need_attn
513
+
514
+ def upgrade_state_dict_named(self, state_dict, name):
515
+ """
516
+ Rename layer norm states from `...layer_norms.0.weight` to
517
+ `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
518
+ `...final_layer_norm.weight`
519
+ """
520
+ # update layer norms
521
+ layer_norm_map = {
522
+ "0": "self_attn_layer_norm",
523
+ "1": "encoder_attn_layer_norm",
524
+ "2": "final_layer_norm",
525
+ }
526
+ for old, new in layer_norm_map.items():
527
+ for m in ("weight", "bias"):
528
+ k = "{}.layer_norms.{}.{}".format(name, old, m)
529
+ if k in state_dict:
530
+ state_dict[
531
+ "{}.{}.{}".format(name, new, m)
532
+ ] = state_dict[k]
533
+ del state_dict[k]
534
+ if "{}.{}.{}".format(name, new, m) not in state_dict and "{}.{}".format(new, m) in self.state_dict():
535
+ state_dict[
536
+ "{}.{}.{}".format(name, new, m)
537
+ ] = self.state_dict()["{}.{}".format(new, m)]
538
+
539
+ prefix = name + "." if name != "" else ""
540
+ for param_name, param_tensor in self.state_dict().items():
541
+ if (prefix + param_name) not in state_dict:
542
+ state_dict[prefix + param_name] = self.state_dict()[param_name]
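The encoder and decoder layers above wrap their residual connections in DropPath (stochastic depth): during training, a whole sample's residual branch is zeroed with probability `drop_prob` and the surviving samples are rescaled by `1 / keep_prob`. A self-contained sketch of that behaviour, assuming the same `(seq_len, batch, channels)` layout used in the layer code; the tensor sizes here are illustrative only:

import torch

def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    # Identity at inference time or when the rate is zero.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample along the batch dimension (dim 1).
    mask = torch.rand(1, x.shape[1], 1, dtype=x.dtype, device=x.device).add_(keep_prob).floor_()
    # Rescale the kept samples so the expected activation is unchanged.
    return x.div(keep_prob) * mask

x = torch.ones(5, 4, 8)                        # (seq_len, batch, embed_dim)
y = drop_path(x, drop_prob=0.5, training=True)
print(y[0, :, 0])                              # each batch element is either 0.0 or 2.0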
models/search.py ADDED
@@ -0,0 +1,814 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import math
7
+ from typing import List, Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from fairseq.token_generation_constraints import (
12
+ ConstraintState,
13
+ OrderedConstraintState,
14
+ UnorderedConstraintState,
15
+ )
16
+ from torch import Tensor
17
+
18
+
19
+ class Search(nn.Module):
20
+ def __init__(self, tgt_dict):
21
+ super().__init__()
22
+ self.pad = tgt_dict.pad()
23
+ self.unk = tgt_dict.unk()
24
+ self.eos = tgt_dict.eos()
25
+ self.vocab_size = len(tgt_dict)
26
+ self.src_lengths = torch.tensor(-1)
27
+ self.supports_constraints = False
28
+ self.stop_on_max_len = False
29
+
30
+ def step(
31
+ self, step, lprobs, scores, prev_output_tokens=None, original_batch_idxs=None
32
+ ):
33
+ """Take a single search step.
34
+
35
+ Args:
36
+ step: the current search step, starting at 0
37
+ lprobs: (bsz x input_beam_size x vocab_size)
38
+ the model's log-probabilities over the vocabulary at the current step
39
+ scores: (bsz x input_beam_size x step)
40
+ the historical model scores of each hypothesis up to this point
41
+ prev_output_tokens: (bsz x step)
42
+ the previously generated output tokens
43
+ original_batch_idxs: (bsz)
44
+ the tensor with the batch indices, in the range [0, bsz)
45
+ this is useful in case a re-ordering has been applied
+ and we need to know the original indices
47
+
48
+ Return: A tuple of (scores, indices, beams) where:
49
+ scores: (bsz x output_beam_size)
50
+ the scores of the chosen elements; output_beam_size can be
51
+ larger than input_beam_size, e.g., we may return
52
+ 2*input_beam_size to account for EOS
53
+ indices: (bsz x output_beam_size)
54
+ the indices of the chosen elements
55
+ beams: (bsz x output_beam_size)
56
+ the hypothesis ids of the chosen elements, in the range [0, input_beam_size)
57
+ """
58
+ raise NotImplementedError
59
+
60
+ @torch.jit.export
61
+ def set_src_lengths(self, src_lengths):
62
+ self.src_lengths = src_lengths
63
+
64
+ @torch.jit.export
65
+ def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
66
+ """Initialize constraint states for constrained decoding (if supported).
67
+
68
+ Args:
69
+ batch_constraints: (torch.Tensor, optional)
70
+ the list of constraints, in packed form
71
+ beam_size: (int)
72
+ the beam size
73
+ Returns:
74
+ *encoder_out* rearranged according to *new_order*
75
+ """
76
+ pass
77
+
78
+ def prune_sentences(self, batch_idxs: Tensor):
79
+ """
80
+ Removes constraint states for completed sentences (if supported).
81
+ This is called from sequence_generator._generate() when sentences are
82
+ deleted from the batch.
83
+
84
+ Args:
85
+ batch_idxs: Indices of *sentences* whose constraint state should be *kept*.
86
+ """
87
+ pass
88
+
89
+ def update_constraints(self, active_hypos: Tensor):
90
+ """
91
+ Updates the constraint states by selecting the beam items that are retained.
92
+ This is called at each time step of sequence_generator._generate() when
93
+ the set of 2 * {beam_size} candidate hypotheses are reduced to the beam size.
94
+
95
+ Args:
96
+ active_hypos: (batch size, beam size)
97
+ list of integers denoting, for each sentence, which beam candidate items
98
+ should be kept.
99
+ """
100
+ pass
101
+
102
+
103
+ class BeamSearch(Search):
104
+ def __init__(self, tgt_dict):
105
+ super().__init__(tgt_dict)
106
+ self.constraint_states = None
107
+
108
+ @torch.jit.export
109
+ def step(
110
+ self,
111
+ step: int,
112
+ lprobs,
113
+ scores: Optional[Tensor],
114
+ prev_output_tokens: Optional[Tensor] = None,
115
+ original_batch_idxs: Optional[Tensor] = None,
116
+ ):
117
+ bsz, beam_size, vocab_size = lprobs.size()
118
+
119
+ if step == 0:
120
+ # at the first step all hypotheses are equally likely, so use
121
+ # only the first beam
122
+ lprobs = lprobs[:, ::beam_size, :].contiguous()
123
+ else:
124
+ # make probs contain cumulative scores for each hypothesis
125
+ assert scores is not None
126
+ lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
127
+
128
+ top_prediction = torch.topk(
129
+ lprobs.view(bsz, -1),
130
+ k=min(
131
+ # Take the best 2 x beam_size predictions. We'll choose the first
132
+ # beam_size of these which don't predict eos to continue with.
133
+ beam_size * 2,
134
+ lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
135
+ ),
136
+ )
137
+ scores_buf = top_prediction[0]
138
+ indices_buf = top_prediction[1]
139
+ # Project back into relative indices and beams
140
+ beams_buf = indices_buf // vocab_size
141
+ indices_buf = indices_buf.fmod(vocab_size)
142
+
143
+ # At this point, beams_buf and indices_buf are single-dim and contain relative indices
144
+ return scores_buf, indices_buf, beams_buf
145
+
146
+
147
+ class PrefixConstrainedBeamSearch(Search):
148
+ def __init__(self, tgt_dict, prefix_allowed_tokens_fn):
149
+ super().__init__(tgt_dict)
150
+ self.prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
151
+ self.stop_on_max_len = True
152
+
153
+ @torch.jit.export
154
+ def apply_mask(self, x, prev_output_tokens, original_batch_idxs):
155
+ beam_size = x.shape[0] // original_batch_idxs.shape[0]
156
+ original_batch_idxs = (
157
+ original_batch_idxs.unsqueeze(-1).repeat((1, beam_size)).flatten().tolist()
158
+ )
159
+
160
+ mask = torch.full_like(x, -math.inf)
161
+ for sent_i, (sent, batch_i) in enumerate(
162
+ zip(prev_output_tokens, original_batch_idxs)
163
+ ):
164
+ mask[sent_i, :, self.prefix_allowed_tokens_fn(batch_i, sent)] = 0
165
+
166
+ return mask
167
+
168
+ @torch.jit.export
169
+ def step(
170
+ self,
171
+ step: int,
172
+ lprobs: Tensor,
173
+ scores: Tensor,
174
+ prev_output_tokens: Tensor,
175
+ original_batch_idxs: Tensor,
176
+ ):
177
+ bsz, beam_size, vocab_size = lprobs.size()
178
+
179
+ lprobs += self.apply_mask(
180
+ lprobs.view(bsz * beam_size, 1, vocab_size),
181
+ prev_output_tokens,
182
+ original_batch_idxs,
183
+ ).view(bsz, beam_size, vocab_size)
184
+
185
+ if step == 0:
186
+ # at the first step all hypotheses are equally likely, so use
187
+ # only the first beam
188
+ lprobs = lprobs[:, ::beam_size, :].contiguous()
189
+ else:
190
+ # make probs contain cumulative scores for each hypothesis
191
+ assert scores is not None
192
+ lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
193
+
194
+ top_prediction = torch.topk(
195
+ lprobs.view(bsz, -1),
196
+ k=min(
197
+ # Take the best beam_size predictions. We'll choose the first
198
+ # beam_size of these which don't predict eos to continue with.
199
+ beam_size,
200
+ lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
201
+ ),
202
+ )
203
+ scores_buf = top_prediction[0]
204
+ indices_buf = top_prediction[1]
205
+ beams_buf = indices_buf // vocab_size
206
+ indices_buf = indices_buf.fmod(vocab_size)
207
+ return scores_buf, indices_buf, beams_buf
208
+
209
+
210
+ class LexicallyConstrainedBeamSearch(Search):
211
+ """Implements lexically constrained beam search as described in
212
+
213
+ Fast Lexically Constrained Decoding with Dynamic Beam
214
+ Allocation for Neural Machine Translation. Post & Vilar,
215
+ NAACL 2018. https://www.aclweb.org/anthology/N18-1119/
216
+
217
+ and
218
+
219
+ Improved Lexically Constrained Decoding for Translation and
220
+ Monolingual Rewriting. Hu et al, NAACL
221
+ 2019. https://www.aclweb.org/anthology/N19-1090/
222
+
223
+ This is accomplished by maintaining, for each beam hypothesis, a
224
+ ConstraintState object (see constraints.py) that tracks which
225
+ constraints have been generated and using this information to
226
+ shape the beam for each input sentence.
227
+ """
228
+
229
+ def __init__(self, tgt_dict, representation):
230
+ super().__init__(tgt_dict)
231
+ self.representation = representation
232
+ self.vocab_size = len(tgt_dict)
233
+ self.num_cands = 0
234
+ self.supports_constraints = True
235
+
236
+ @torch.jit.export
237
+ def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
238
+ self.constraint_states = []
239
+ for constraint_tensor in batch_constraints:
240
+ if self.representation == "ordered":
241
+ constraint_state = OrderedConstraintState.create(constraint_tensor)
242
+ elif self.representation == "unordered":
243
+ constraint_state = UnorderedConstraintState.create(constraint_tensor)
244
+
245
+ self.constraint_states.append([constraint_state for i in range(beam_size)])
246
+
247
+ @torch.jit.export
248
+ def prune_sentences(self, batch_idxs: Tensor):
249
+ self.constraint_states = [
250
+ self.constraint_states[i] for i in batch_idxs.tolist()
251
+ ]
252
+
253
+ @torch.jit.export
254
+ def update_constraints(self, active_hypos: Tensor):
255
+ if self.constraint_states:
256
+ batch_size = active_hypos.size(0)
257
+ for sentid in range(batch_size):
258
+ self.constraint_states[sentid] = [
259
+ self.constraint_states[sentid][i] for i in active_hypos[sentid]
260
+ ]
261
+
262
+ @torch.jit.export
263
+ def step(
264
+ self,
265
+ step: int,
266
+ lprobs: Tensor,
267
+ scores: Optional[Tensor],
268
+ prev_output_tokens: Optional[Tensor] = None,
269
+ original_batch_idxs: Optional[Tensor] = None,
270
+ ):
271
+ """
272
+ A constrained step builds a large candidates list from the following:
273
+ - the top 2 * {beam_size} items over the whole beam
274
+ - for each item in the beam
275
+ - the top {each_k} (default 1)
276
+ - all next constraints
277
+ We then compute the constrained state of each beam item, and assign
278
+ stripe codes: 0 to the best in each bank, 1 to the 2nd-best, and so
279
+ on. We then sort by (stripe, score), and truncate the list at
280
+ 2 * beam size.
281
+
282
+ Args:
283
+ step: the decoder step
284
+ lprobs: (batch size, beam size, target vocab)
285
+ the target-vocab distributions for each item in the beam.
286
+ Return: A tuple of (scores, indices, beams, constraints) where:
287
+ scores: (batch, output beam size)
288
+ the scores of the chosen elements
289
+ indices: (batch, output beam size)
290
+ the target vocab indices of the chosen elements
291
+ beams: (batch, output beam size)
292
+ the 0-indexed hypothesis ids of the chosen elements
293
+ constraints: (batch, output beam size)
294
+ the new constraint states
295
+ """
296
+ each_k = 1
297
+ device = lprobs.device
298
+
299
+ batch_size, beam_size, vocab_size = lprobs.size()
300
+
301
+ self.num_cands = min(
302
+ # Just take the k-best. We'll get another k from the 1-best from each
303
+ # row, plus more from the constraints
304
+ beam_size * 2,
305
+ lprobs.view(batch_size, -1).size(1) - 1, # -1 so we never select pad
306
+ )
307
+
308
+ # STEP 0: Preliminary. Prevent EOS for unfinished hyps across all batch items
309
+ constraint_states = self.constraint_states
310
+ if constraint_states and step > 0:
311
+ not_finished_indices = []
312
+ for sentno, sent_constraints in enumerate(constraint_states):
313
+ for beamno, state in enumerate(sent_constraints):
314
+ index = sentno * beam_size + beamno
315
+ if not state.finished:
316
+ not_finished_indices.append(index)
317
+ not_finished_indices = torch.tensor(not_finished_indices)
318
+ if not_finished_indices.numel() > 0:
319
+ lprobs.view(batch_size * beam_size, -1)[
320
+ not_finished_indices, self.eos
321
+ ] = -math.inf
322
+
323
+ if step == 0:
324
+ # at the first step all hypotheses are equally likely, so use
325
+ # only the first beam entry for each batch item
326
+ lprobs = lprobs[:, ::beam_size, :].contiguous()
327
+ else:
328
+ # make probs contain cumulative scores for each hypothesis
329
+ assert scores is not None
330
+ lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
331
+
332
+ top_prediction = torch.topk(
333
+ lprobs.view(batch_size, -1),
334
+ self.num_cands,
335
+ )
336
+ scores_buf, indices_buf = top_prediction
337
+ # Project back into relative indices and beams
338
+ beams_buf = indices_buf // vocab_size
339
+ indices_buf = indices_buf.fmod(vocab_size)
340
+
341
+ # Short circuit if there are no constraints in this batch
342
+ if not constraint_states:
343
+ return scores_buf, indices_buf, beams_buf
344
+
345
+ # STEP 1: get top-1 from each hypothesis across all sentences in the batch
346
+ if step > 0:
347
+ top_scores, top_indices = torch.topk(
348
+ lprobs.view(batch_size * beam_size, -1),
349
+ k=each_k,
350
+ dim=1,
351
+ )
352
+ top_scores = top_scores.view(batch_size, -1)
353
+ top_indices = top_indices.view(batch_size, -1)
354
+ scores_buf = torch.cat((scores_buf, top_scores), dim=1)
355
+ indices_buf = torch.cat((indices_buf, top_indices), dim=1)
356
+ new_beams = torch.arange(0, beam_size, device=device).repeat(batch_size, 1)
357
+ beams_buf = torch.cat((beams_buf, new_beams), dim=1)
358
+
359
+ # Now, process sentences in the batch one by one.
360
+ new_scores_buf = torch.zeros((batch_size, 2 * beam_size), device=device)
361
+ new_indices_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long()
362
+ new_beams_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long()
363
+ for sentno, states in enumerate(constraint_states):
364
+ scores, indices, beams, new_states = self.step_sentence(
365
+ step,
366
+ sentno,
367
+ lprobs[sentno],
368
+ constraint_states[sentno],
369
+ beams_buf[sentno].clone(),
370
+ indices_buf[sentno].clone(),
371
+ scores_buf[sentno].clone(),
372
+ )
373
+ new_scores_buf[sentno] = scores
374
+ new_indices_buf[sentno] = indices
375
+ new_beams_buf[sentno] = beams
376
+ self.constraint_states[sentno] = new_states
377
+
378
+ return new_scores_buf, new_indices_buf, new_beams_buf
379
+
380
+ @torch.jit.export
381
+ def step_sentence(
382
+ self,
383
+ step: int,
384
+ sentno: int,
385
+ lprobs: Tensor,
386
+ constraint_states: List[List[ConstraintState]],
387
+ beams_buf: Tensor,
388
+ indices_buf: Tensor,
389
+ scores_buf: Tensor,
390
+ ):
391
+ """Does per-sentence processing. Adds all constraints for each
392
+ hypothesis to the list of candidates; then removes duplicates,
393
+ sorts, and dynamically stripes across the banks. All tensor inputs
394
+ are collapsed to those pertaining to a single input sentence.
395
+ """
396
+ device = lprobs.device
397
+
398
+ # STEP 2: Add all constraints for each beam item
399
+ for beamno, state in enumerate(constraint_states):
400
+ next_tokens = torch.tensor(list(state.next_tokens()), device=device).long()
401
+ if next_tokens.numel() != 0:
402
+ indices_buf = torch.cat((indices_buf, next_tokens))
403
+ next_beams = (
404
+ torch.tensor(beamno, device=device)
405
+ .repeat(next_tokens.size(0))
406
+ .long()
407
+ )
408
+ beams_buf = torch.cat((beams_buf, next_beams))
409
+ next_values = lprobs[beamno].take(next_tokens.view(-1))
410
+ scores_buf = torch.cat((scores_buf, next_values))
411
+
412
+ # At the 0th time step, there is just one beam item
413
+ if step == 0:
414
+ break
415
+
416
+ # STEP 3: Compute the "bank" for each candidate. This is the
417
+ # number of constraints it's generated. We need this so that
418
+ # we can do round-robin allocation of the beam across these
419
+ # banks. If C is the number of constraints, we select the best
420
+ # item in bank C, then the best in bank C-1, etc, followed by
421
+ # the 2nd-best in bank C, the 2nd-best in bank C-1, etc, and so
422
+ # on, until the maximum beam size. We accomplish this by
423
+ # creating a sort key and striping across the banks.
424
+
425
+ # Compute the new states for all candidates
426
+ cands_size = indices_buf.size(0)
427
+ constraint_states = [
428
+ constraint_states[beams_buf[i]].advance(indices_buf[i])
429
+ for i in range(cands_size)
430
+ ]
431
+
432
+ banks = torch.tensor([state.bank for state in constraint_states], device=device)
433
+
434
+ # STEP 4: Sort
435
+ num_constraint_tokens = len(state.tokens)
436
+
437
+ # Sort by keys (bank, score) (i.e., sort banks together, and scores
438
+ # within banks). AFAIK pytorch doesn't support either stable sort or
439
+ # multi-key sorting, so we have to hack this.
440
+ MAX_SCORE = -100
441
+ sort_key = (num_constraint_tokens - banks) * MAX_SCORE + scores_buf
442
+ sort_values, sort_indices = sort_key.sort(dim=0, descending=True)
443
+ scores_buf = scores_buf[sort_indices]
444
+ indices_buf = indices_buf[sort_indices]
445
+ beams_buf = beams_buf[sort_indices]
446
+ banks = banks[sort_indices]
447
+
448
+ # Sort the constraints to follow suit
449
+ constraint_states = [constraint_states[i] for i in sort_indices]
450
+
451
+ # STEP 5: Remove duplicates. The topk calls (overall and
452
+ # per-row) plus the per-row generation of constraints will
453
+ # produce duplicates. Here we remove them.
454
+
455
+ def roll(t):
456
+ """Rolls a 1d tensor left by 1.
457
+
458
+ [0, 1, 2, 3, 4] becomes [4, 0, 1, 2, 3]
459
+ """
460
+ return torch.cat((t[-1].unsqueeze(0), t[0:-1]), dim=0)
461
+
462
+ # We map candidates (beam, token_id) to a single dimension.
463
+ # This is then shifted by 1. We can then easily identify
464
+ # duplicates and create a mask that identifies unique
465
+ # extensions.
466
+ uniques_mask = beams_buf * (self.vocab_size + 1) + indices_buf
467
+ uniques_mask = roll(uniques_mask) != uniques_mask
468
+
469
+ # Use the mask to pare down the data structures
470
+ scores_buf = torch.masked_select(scores_buf, uniques_mask)
471
+ indices_buf = torch.masked_select(indices_buf, uniques_mask)
472
+ beams_buf = torch.masked_select(beams_buf, uniques_mask)
473
+ banks = torch.masked_select(banks, uniques_mask)
474
+ i = 1
475
+ for mask in uniques_mask[1:]:
476
+ if not mask:
477
+ constraint_states.pop(i)
478
+ i += mask
479
+
480
+ # STEP 6: Assign IDs round-robin across banks, sort, and
481
+ # truncate. Now that the candidates are sorted by (bank,
482
+ # score) and uniqed, we dynamically allocate the {beam_size}
483
+ # beam by striping across the candidates. These stripes will
484
+ # be used as sort keys to do round-robin selection. This is
485
+ # accomplished in a single pass with offsets. Sorting by
486
+ # highest-banks (furthest-along hypotheses) first ensures
487
+ # progress through the constraints.
488
+ #
489
+ # e.g., BANKS: 3 3 3 2 2 2 2 1 1 1 0 0
490
+ # OLD STRIPES: 0 1 2 0 1 2 3 0 1 2 0 1
491
+ # NEW STRIPES: 0 1+4 2+8 0+1 1+5 2+9 3+11 0+2 1+6 2+10 0+3 1+7
492
+ # = 0 5 10 1 6 11 13 2 7 12 3 8
493
+ #
494
+ # Sorting by this then gives the following banks:
495
+ #
496
+ # 3 2 1 0 3 2 1 0 3 2 1 2
497
+ #
498
+ # We'll take the top {beam_size} of these.
499
+ stripe_offsets = [offset * (len(banks) + 1) for offset in range(len(banks) + 1)]
500
+ stripes = torch.zeros_like(banks)
501
+ cur_bank_count = -1
502
+ cur_bank = banks[0]
503
+ for i, bank in enumerate(banks):
504
+ if bank != cur_bank:
505
+ cur_bank_count = 0
506
+ cur_bank = bank
507
+ else:
508
+ cur_bank_count += 1
509
+ stripes[i] = num_constraint_tokens - bank + stripe_offsets[cur_bank_count]
510
+
511
+ # STEP 7: Sort by the stripes values
512
+ sort_values, sort_indices = stripes.sort(dim=0)
513
+ scores_buf = scores_buf[sort_indices]
514
+ indices_buf = indices_buf[sort_indices]
515
+ beams_buf = beams_buf[sort_indices]
516
+ constraint_states = [constraint_states[i] for i in sort_indices]
517
+
518
+ # STEP 8: Truncate to the candidates size!
519
+ scores_buf = scores_buf[: self.num_cands]
520
+ indices_buf = indices_buf[: self.num_cands]
521
+ beams_buf = beams_buf[: self.num_cands]
522
+
523
+ return scores_buf, indices_buf, beams_buf, constraint_states
524
+
525
+
526
+ class LengthConstrainedBeamSearch(Search):
527
+ def __init__(self, tgt_dict, min_len_a, min_len_b, max_len_a, max_len_b):
528
+ super().__init__(tgt_dict)
529
+ self.min_len_a = min_len_a
530
+ self.min_len_b = min_len_b
531
+ self.max_len_a = max_len_a
532
+ self.max_len_b = max_len_b
533
+ self.beam = BeamSearch(tgt_dict)
534
+ self.needs_src_lengths = True
535
+
536
+ def step(
537
+ self,
538
+ step: int,
539
+ lprobs,
540
+ scores,
541
+ prev_output_tokens: Optional[Tensor] = None,
542
+ original_batch_idxs: Optional[Tensor] = None,
543
+ ):
544
+ min_lens = self.min_len_a * self.src_lengths + self.min_len_b
545
+ max_lens = self.max_len_a * self.src_lengths + self.max_len_b
546
+ lprobs[step < min_lens, :, self.eos] = -math.inf
547
+ lprobs[step >= max_lens, :, self.eos] = 0
548
+ return self.beam.step(step, lprobs, scores)
549
+
550
+
551
+ class DiverseBeamSearch(Search):
552
+ """Diverse Beam Search.
553
+
554
+ See "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
555
+ Models" for details.
556
+
557
+ We only implement the Hamming Diversity penalty here, which performed best
558
+ in the original paper.
559
+ """
560
+
561
+ def __init__(self, tgt_dict, num_groups, diversity_strength):
562
+ super().__init__(tgt_dict)
563
+ self.num_groups = num_groups
564
+ self.diversity_strength = -diversity_strength
565
+ self.beam = BeamSearch(tgt_dict)
566
+
567
+ @torch.jit.export
568
+ def step(
569
+ self,
570
+ step: int,
571
+ lprobs,
572
+ scores,
573
+ prev_output_tokens: Optional[Tensor] = None,
574
+ original_batch_idxs: Optional[Tensor] = None,
575
+ ):
576
+ bsz, beam_size, vocab_size = lprobs.size()
577
+ if beam_size % self.num_groups != 0:
578
+ raise ValueError(
579
+ "DiverseBeamSearch requires --beam to be divisible by the number of groups"
580
+ )
581
+
582
+ # initialize diversity penalty
583
+ diversity_buf = torch.zeros(lprobs[:, 0, :].size()).to(lprobs)
584
+
585
+ scores_G, indices_G, beams_G = [], [], []
586
+ for g in range(self.num_groups):
587
+ lprobs_g = lprobs[:, g :: self.num_groups, :]
588
+ scores_g = scores[:, g :: self.num_groups, :] if step > 0 else None
589
+
590
+ # apply diversity penalty
591
+ if g > 0:
592
+ lprobs_g = torch.add(
593
+ lprobs_g,
594
+ other=diversity_buf.unsqueeze(1),
595
+ alpha=self.diversity_strength,
596
+ )
597
+ else:
598
+ lprobs_g = lprobs_g.contiguous()
599
+
600
+ scores_buf, indices_buf, beams_buf = self.beam.step(
601
+ step, lprobs_g, scores_g
602
+ )
603
+ beams_buf.mul_(self.num_groups).add_(g)
604
+
605
+ scores_G.append(scores_buf.clone())
606
+ indices_G.append(indices_buf.clone())
607
+ beams_G.append(beams_buf.clone())
608
+
609
+ # update diversity penalty
610
+ diversity_buf.scatter_add_(
611
+ 1, indices_buf, torch.ones(indices_buf.size()).to(diversity_buf)
612
+ )
613
+
614
+ # interleave results from different groups
615
+ scores_buf = torch.stack(scores_G, dim=2).view(bsz, -1)
616
+ indices_buf = torch.stack(indices_G, dim=2).view(bsz, -1)
617
+ beams_buf = torch.stack(beams_G, dim=2).view(bsz, -1)
618
+ return scores_buf, indices_buf, beams_buf
619
+
620
+
621
+ class Sampling(Search):
622
+ sampling_topk: int
623
+ sampling_topp: float
624
+
625
+ def __init__(self, tgt_dict, sampling_topk=-1, sampling_topp=-1.0):
626
+ super().__init__(tgt_dict)
627
+ self.sampling_topk = sampling_topk
628
+ self.sampling_topp = sampling_topp
629
+
630
+ def _sample_topp(self, lprobs):
631
+ """Sample among the smallest set of elements whose cumulative probability mass exceeds p.
632
+
633
+ See `"The Curious Case of Neural Text Degeneration"
634
+ (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_.
635
+
636
+ Args:
637
+ lprobs: (bsz x input_beam_size x vocab_size)
638
+ the model's log-probabilities over the vocabulary at the current step
639
+
640
+ Return: A tuple of (trimed_probs, truncated_indices) where:
641
+ trimed_probs: (bsz x input_beam_size x ?)
642
+ the model's probabilities over the elements selected to sample from. The
643
+ width of the third dimension is determined by top-P.
644
+ truncated_indices: (bsz x input_beam_size x ?)
645
+ the indices of the chosen elements.
646
+ """
647
+ probs = lprobs.exp_()
648
+
649
+ # sort the last dimension (vocab dimension) in descending order
650
+ sorted_probs, sorted_indices = probs.sort(descending=True)
651
+
652
+ # compute a mask to indicate the words to be included in the top-P set.
653
+ cumsum_probs = sorted_probs.cumsum(dim=2)
654
+ mask = cumsum_probs.lt(self.sampling_topp)
655
+
656
+ # note that mask was computed by 'lt'. One more word needs to be included
657
+ # so that the cumulative probability mass can exceed p.
658
+ cumsum_mask = mask.cumsum(dim=2)
659
+ last_included = cumsum_mask[:, :, -1:]
660
+ last_included.clamp_(0, mask.size()[2] - 1)
661
+ mask = mask.scatter_(2, last_included, 1)
662
+
663
+ # truncate unnecessary dims.
664
+ max_dim = last_included.max()
665
+ truncated_mask = mask[:, :, : max_dim + 1]
666
+ truncated_probs = sorted_probs[:, :, : max_dim + 1]
667
+ truncated_indices = sorted_indices[:, :, : max_dim + 1]
668
+
669
+ # trim the words that are not in top-P by setting their probabilities
670
+ # to 0, so that they would not be sampled later.
671
+ trim_mask = ~truncated_mask
672
+ trimed_probs = truncated_probs.masked_fill_(trim_mask, 0)
673
+ return trimed_probs, truncated_indices
674
+
675
+ @torch.jit.export
676
+ def step(
677
+ self,
678
+ step: int,
679
+ lprobs,
680
+ scores,
681
+ prev_output_tokens: Optional[Tensor] = None,
682
+ original_batch_idxs: Optional[Tensor] = None,
683
+ ):
684
+ bsz, beam_size, vocab_size = lprobs.size()
685
+
686
+ if step == 0:
687
+ # at the first step all hypotheses are equally likely, so use
688
+ # only the first beam
689
+ lprobs = lprobs[:, ::beam_size, :].contiguous()
690
+
691
+ if self.sampling_topp > 0:
692
+ # only sample from the smallest set of words whose cumulative probability mass exceeds p
693
+ probs, top_indices = self._sample_topp(lprobs)
694
+ elif self.sampling_topk > 0:
695
+ # only sample from top-k candidates
696
+ lprobs, top_indices = lprobs.topk(self.sampling_topk)
697
+ probs = lprobs.exp_()
698
+ else:
699
+ probs = lprobs.exp_()
700
+
701
+ # dummy data to be consistent with true branch for type check
702
+ top_indices = torch.empty(0).to(probs)
703
+ # sample
704
+ if step == 0:
705
+ indices_buf = torch.multinomial(
706
+ probs.view(bsz, -1),
707
+ beam_size,
708
+ replacement=True,
709
+ ).view(bsz, beam_size)
710
+ else:
711
+ indices_buf = torch.multinomial(
712
+ probs.view(bsz * beam_size, -1),
713
+ 1,
714
+ replacement=True,
715
+ ).view(bsz, beam_size)
716
+
717
+ if step == 0:
718
+ # expand to beam size
719
+ probs = probs.expand(bsz, beam_size, -1)
720
+
721
+ # gather scores
722
+ scores_buf = torch.gather(probs, dim=2, index=indices_buf.unsqueeze(-1))
723
+ scores_buf = scores_buf.log_().view(bsz, -1)
724
+
725
+ # remap indices if using top-k or top-P sampling
726
+ if self.sampling_topk > 0 or self.sampling_topp > 0:
727
+ indices_buf = torch.gather(
728
+ top_indices.expand(bsz, beam_size, -1),
729
+ dim=2,
730
+ index=indices_buf.unsqueeze(-1),
731
+ ).squeeze(2)
732
+
733
+ if step == 0:
734
+ beams_buf = indices_buf.new_zeros(bsz, beam_size)
735
+ else:
736
+ beams_buf = torch.arange(0, beam_size).to(indices_buf).repeat(bsz, 1)
737
+ # make scores cumulative
738
+ scores_buf.add_(
739
+ torch.gather(scores[:, :, step - 1], dim=1, index=beams_buf)
740
+ )
741
+
742
+ return scores_buf, indices_buf, beams_buf
743
+
744
+
745
+ class DiverseSiblingsSearch(Search):
746
+ """
747
+ Beam search with diverse siblings.
748
+
749
+ See "A Simple, Fast Diverse Decoding Algorithm for Neural Generation" for details.
750
+ https://arxiv.org/abs/1611.08562
751
+
752
+ 1/ Calculate hypotheses for each beam
753
+ 2/ Intra-sibling ordering
754
+ 3/ Rewrite scores
755
+ 4/ Choose top K hypotheses
756
+
757
+ if diversity_rate == 0, this is equivalent to BeamSearch
758
+ """
759
+
760
+ def __init__(self, tgt_dict, diversity_rate):
761
+ super().__init__(tgt_dict)
762
+ self.diversity_rate = diversity_rate
763
+ self.beam = BeamSearch(tgt_dict)
764
+
765
+ def step(
766
+ self,
767
+ step: int,
768
+ lprobs,
769
+ scores,
770
+ prev_output_tokens: Optional[Tensor] = None,
771
+ original_batch_idxs: Optional[Tensor] = None,
772
+ ):
773
+ bsz, beam_size, vocab_size = lprobs.size()
774
+ k = min(
775
+ # Take the best 2 x beam_size predictions. We'll choose the first
776
+ # beam_size of these which don't predict eos to continue with.
777
+ beam_size * 2,
778
+ lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
779
+ )
780
+ s_list: List[Tensor]
781
+ i_list: List[Tensor]
782
+ s_list = [torch.empty(0).to(lprobs) for i in range(beam_size)]
783
+ i_list = [torch.LongTensor().to(device=lprobs.device) for i in range(beam_size)]
784
+ sibling_score = torch.arange(1, k + 1).to(lprobs) * self.diversity_rate
785
+
786
+ if step == 0:
787
+ return self.beam.step(step, lprobs, scores)
788
+ lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))
789
+
790
+ # 1/ Calculate hypotheses for each beam
791
+ for i in range(beam_size):
792
+ torch.topk(lprobs[:, i, :].view(bsz, -1), k, out=(s_list[i], i_list[i]))
793
+ i_list[i].fmod_(vocab_size)
794
+
795
+ # 2/ Intra-sibling ordering by default from topk + 3/ Rewrite scores
796
+ s_list[i].sub_(sibling_score)
797
+
798
+ # 4/ Choose top K hypotheses
799
+ indices = torch.stack(i_list, dim=1).view(bsz, -1)
800
+
801
+ final_scores = torch.empty(0).to(lprobs)
802
+ final_indices = torch.LongTensor().to(device=lprobs.device)
803
+ final_beams = torch.LongTensor().to(device=lprobs.device)
804
+ (final_scores, final_indices) = torch.topk(
805
+ torch.stack(s_list, dim=1).view(bsz, -1),
806
+ k,
807
+ )
808
+
809
+ final_beams = final_indices // k
810
+
811
+ for i in range(bsz):
812
+ final_indices[i] = indices[i][final_indices[i]]
813
+
814
+ return final_scores, final_indices, final_beams
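
The Sampling and DiverseSiblingsSearch strategies above are consumed through the `search_strategy` argument of the SequenceGenerator defined in the next file. Below is a minimal sketch of wiring them together, assuming the Sampling constructor keeps fairseq's (tgt_dict, sampling_topk, sampling_topp) signature, that the module paths follow this repo's layout, and that a trained checkpoint is available (the path shown is a placeholder taken from the evaluation script).

# Sketch (not part of this commit): plugging a search strategy into the generator.
from fairseq import checkpoint_utils

from models.search import Sampling, DiverseSiblingsSearch
from models.sequence_generator import SequenceGenerator

# load an ensemble plus its task/dictionary; the checkpoint path is a placeholder
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    ["checkpoints/caption_large_best_clean.pt"]
)
tgt_dict = task.target_dictionary

# nucleus (top-p) sampling: keep the smallest token set whose cumulative
# probability exceeds 0.9, zero out the rest, then sample (see _sample_topp above)
sampling = Sampling(tgt_dict, sampling_topp=0.9)

# alternative: diverse siblings search with a sibling penalty of 0.5
diverse = DiverseSiblingsSearch(tgt_dict, diversity_rate=0.5)

generator = SequenceGenerator(
    models,
    tgt_dict,
    beam_size=5,
    max_len_b=16,
    search_strategy=sampling,  # or `diverse`
)
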
models/sequence_generator.py ADDED
@@ -0,0 +1,1053 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import math
7
+ from typing import Dict, List, Optional
8
+ import sys
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from fairseq import search, utils
13
+ from fairseq.models import FairseqIncrementalDecoder
14
+ from torch import Tensor
15
+ from fairseq.ngram_repeat_block import NGramRepeatBlock
16
+
17
+ from data import data_utils
18
+
19
+ class SequenceGenerator(nn.Module):
20
+ def __init__(
21
+ self,
22
+ models,
23
+ tgt_dict,
24
+ beam_size=1,
25
+ max_len_a=0,
26
+ max_len_b=200,
27
+ max_len=0,
28
+ min_len=1,
29
+ normalize_scores=True,
30
+ len_penalty=1.0,
31
+ unk_penalty=0.0,
32
+ temperature=1.0,
33
+ match_source_len=False,
34
+ no_repeat_ngram_size=0,
35
+ search_strategy=None,
36
+ eos=None,
37
+ symbols_to_strip_from_output=None,
38
+ lm_model=None,
39
+ lm_weight=1.0,
40
+ constraint_trie=None,
41
+ constraint_range=None,
42
+ gen_code=False,
43
+ gen_box=False,
44
+ ignore_eos=False,
45
+ zero_shot=False
46
+ ):
47
+ """Generates translations of a given source sentence.
48
+
49
+ Args:
50
+ models (List[~fairseq.models.FairseqModel]): ensemble of models,
51
+ currently support fairseq.models.TransformerModel for scripting
52
+ beam_size (int, optional): beam width (default: 1)
53
+ max_len_a/b (int, optional): generate sequences of maximum length
54
+ ax + b, where x is the source length
55
+ max_len (int, optional): the maximum length of the generated output
56
+ (not including end-of-sentence)
57
+ min_len (int, optional): the minimum length of the generated output
58
+ (not including end-of-sentence)
59
+ normalize_scores (bool, optional): normalize scores by the length
60
+ of the output (default: True)
61
+ len_penalty (float, optional): length penalty, where <1.0 favors
62
+ shorter, >1.0 favors longer sentences (default: 1.0)
63
+ unk_penalty (float, optional): unknown word penalty, where <0
64
+ produces more unks, >0 produces fewer (default: 0.0)
65
+ temperature (float, optional): temperature, where values
66
+ >1.0 produce more uniform samples and values <1.0 produce
67
+ sharper samples (default: 1.0)
68
+ match_source_len (bool, optional): outputs should match the source
69
+ length (default: False)
70
+ """
71
+ super().__init__()
72
+ if isinstance(models, EnsembleModel):
73
+ self.model = models
74
+ else:
75
+ self.model = EnsembleModel(models)
76
+ self.gen_code = gen_code
77
+ self.gen_box = gen_box
78
+ self.ignore_eos = ignore_eos
79
+ self.tgt_dict = tgt_dict
80
+ self.pad = tgt_dict.pad()
81
+ self.unk = tgt_dict.unk()
82
+ self.bos = tgt_dict.bos()
83
+ self.eos = tgt_dict.eos() if eos is None else eos
84
+ self.symbols_to_strip_from_output = (
85
+ symbols_to_strip_from_output.union({self.eos})
86
+ if symbols_to_strip_from_output is not None
87
+ else {self.bos, self.eos}
88
+ )
89
+ self.vocab_size = len(tgt_dict)
90
+ self.beam_size = beam_size
91
+ # the max beam size is the dictionary size - 1, since we never select pad
92
+ self.beam_size = min(beam_size, self.vocab_size - 1)
93
+ self.max_len_a = max_len_a
94
+ self.max_len_b = max_len_b
95
+ self.min_len = min_len
96
+ self.max_len = max_len or self.model.max_decoder_positions()
97
+
98
+ self.normalize_scores = normalize_scores
99
+ self.len_penalty = len_penalty
100
+ self.unk_penalty = unk_penalty
101
+ self.temperature = temperature
102
+ self.match_source_len = match_source_len
103
+ self.zero_shot = zero_shot
104
+
105
+ if no_repeat_ngram_size > 0:
106
+ self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
107
+ else:
108
+ self.repeat_ngram_blocker = None
109
+
110
+ assert temperature > 0, "--temperature must be greater than 0"
111
+
112
+ self.search = (
113
+ search.BeamSearch(tgt_dict) if search_strategy is None else search_strategy
114
+ )
115
+ # We only need to set src_lengths in LengthConstrainedBeamSearch.
116
+ # As a module attribute, setting it would break in multithread
117
+ # settings when the model is shared.
118
+ self.should_set_src_lengths = (
119
+ hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths
120
+ )
121
+
122
+ self.model.eval()
123
+
124
+ self.lm_model = lm_model
125
+ self.lm_weight = lm_weight
126
+ if self.lm_model is not None:
127
+ self.lm_model.eval()
128
+
129
+ self.constraint_trie = constraint_trie
130
+
131
+ self.constraint_start = None
132
+ self.constraint_end = None
133
+ if constraint_range is not None:
134
+ constraint_start, constraint_end = constraint_range.split(',')
135
+ self.constraint_start = int(constraint_start)
136
+ self.constraint_end = int(constraint_end)
137
+
138
+ def cuda(self):
139
+ self.model.cuda()
140
+ return self
141
+
142
+ @torch.no_grad()
143
+ def forward(
144
+ self,
145
+ sample: Dict[str, Dict[str, Tensor]],
146
+ prefix_tokens: Optional[Tensor] = None,
147
+ bos_token: Optional[int] = None,
148
+ ):
149
+ """Generate a batch of translations.
150
+
151
+ Args:
152
+ sample (dict): batch
153
+ prefix_tokens (torch.LongTensor, optional): force decoder to begin
154
+ with these tokens
155
+ bos_token (int, optional): beginning of sentence token
156
+ (default: self.eos)
157
+ """
158
+ return self._generate(sample, prefix_tokens, bos_token=bos_token)
159
+
160
+ # TODO(myleott): unused, deprecate after pytorch-translate migration
161
+ def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
162
+ """Iterate over a batched dataset and yield individual translations.
163
+ Args:
164
+ cuda (bool, optional): use GPU for generation
165
+ timer (StopwatchMeter, optional): time generations
166
+ """
167
+ for sample in data_itr:
168
+ s = utils.move_to_cuda(sample) if cuda else sample
169
+ if "net_input" not in s:
170
+ continue
171
+ input = s["net_input"]
172
+ # model.forward normally channels prev_output_tokens into the decoder
173
+ # separately, but SequenceGenerator directly calls model.encoder
174
+ encoder_input = {
175
+ k: v for k, v in input.items() if k != "prev_output_tokens"
176
+ }
177
+ if timer is not None:
178
+ timer.start()
179
+ with torch.no_grad():
180
+ hypos = self.generate(encoder_input)
181
+ if timer is not None:
182
+ timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
183
+ for i, id in enumerate(s["id"].data):
184
+ # remove padding
185
+ src = utils.strip_pad(input["src_tokens"].data[i, :], self.pad)
186
+ ref = (
187
+ utils.strip_pad(s["target"].data[i, :], self.pad)
188
+ if s["target"] is not None
189
+ else None
190
+ )
191
+ yield id, src, ref, hypos[i]
192
+
193
+ @torch.no_grad()
194
+ def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs) -> List[List[Dict[str, Tensor]]]:
195
+ """Generate translations. Match the api of other fairseq generators.
196
+
197
+ Args:
198
+ models (List[~fairseq.models.FairseqModel]): ensemble of models
199
+ sample (dict): batch
200
+ prefix_tokens (torch.LongTensor, optional): force decoder to begin
201
+ with these tokens
202
+ constraints (torch.LongTensor, optional): force decoder to include
203
+ the list of constraints
204
+ bos_token (int, optional): beginning of sentence token
205
+ (default: self.eos)
206
+ """
207
+ return self._generate(models, sample, **kwargs)
208
+
209
+ def _generate(
210
+ self,
211
+ models,
212
+ sample: Dict[str, Dict[str, Tensor]],
213
+ prefix_tokens: Optional[Tensor] = None,
214
+ constraints: Optional[Tensor] = None,
215
+ bos_token: Optional[int] = None,
216
+ ):
217
+ model = EnsembleModel(models)
218
+ incremental_states = torch.jit.annotate(
219
+ List[Dict[str, Dict[str, Optional[Tensor]]]],
220
+ [
221
+ torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
222
+ for i in range(model.models_size)
223
+ ],
224
+ )
225
+ net_input = sample["net_input"]
226
+
227
+ if "src_tokens" in net_input:
228
+ src_tokens = net_input["src_tokens"]
229
+ # length of the source text being the character length except EndOfSentence and pad
230
+ src_lengths = (
231
+ (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)
232
+ )
233
+ elif "source" in net_input:
234
+ src_tokens = net_input["source"]
235
+ src_lengths = (
236
+ net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
237
+ if net_input["padding_mask"] is not None
238
+ else torch.tensor(src_tokens.size(-1)).to(src_tokens)
239
+ )
240
+ elif "features" in net_input:
241
+ src_tokens = net_input["features"]
242
+ src_lengths = (
243
+ net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
244
+ if net_input["padding_mask"] is not None
245
+ else torch.tensor(src_tokens.size(-1)).to(src_tokens)
246
+ )
247
+ else:
248
+ raise Exception("expected src_tokens or source in net input. input keys: " + str(net_input.keys()))
249
+
250
+ # bsz: total number of sentences in beam
251
+ # Note that src_tokens may have more than 2 dimensions (i.e. audio features)
252
+ bsz, src_len = src_tokens.size()[:2]
253
+ beam_size = self.beam_size
254
+
255
+ if constraints is not None and not self.search.supports_constraints:
256
+ raise NotImplementedError(
257
+ "Target-side constraints were provided, but search method doesn't support them"
258
+ )
259
+
260
+ # Initialize constraints, when active
261
+ self.search.init_constraints(constraints, beam_size)
262
+
263
+ max_len: int = -1
264
+ if self.match_source_len:
265
+ max_len = src_lengths.max().item()
266
+ else:
267
+ max_len = int(self.max_len_a * src_len + self.max_len_b)
268
+ assert (
269
+ self.min_len <= max_len
270
+ ), "min_len cannot be larger than max_len, please adjust these!"
271
+ # compute the encoder output for each beam
272
+ with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"):
273
+ encoder_outs = model.forward_encoder(net_input)
274
+
275
+ # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores
276
+ new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
277
+ new_order = new_order.to(src_tokens.device).long()
278
+ encoder_outs = model.reorder_encoder_out(encoder_outs, new_order)
279
+ # ensure encoder_outs is a List.
280
+ assert encoder_outs is not None
281
+
282
+ # initialize buffers
283
+ scores = (
284
+ torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float()
285
+ ) # +1 for eos; pad is never chosen for scoring
286
+ tokens = (
287
+ torch.zeros(bsz * beam_size, max_len + 2)
288
+ .to(src_tokens)
289
+ .long()
290
+ .fill_(self.pad)
291
+ ) # +2 for eos and pad
292
+ # tokens[:, 0] = self.eos if bos_token is None else bos_token
293
+ tokens[:, 0] = self.bos
294
+ attn: Optional[Tensor] = None
295
+
296
+ # A list that indicates candidates that should be ignored.
297
+ # For example, suppose we're sampling and have already finalized 2/5
298
+ # samples. Then cands_to_ignore would mark 2 positions as being ignored,
299
+ # so that we only finalize the remaining 3 samples.
300
+ cands_to_ignore = (
301
+ torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
302
+ ) # forward and backward-compatible False mask
303
+
304
+ # list of completed sentences
305
+ finalized = torch.jit.annotate(
306
+ List[List[Dict[str, Tensor]]],
307
+ [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
308
+ ) # contains lists of dictionaries of information about the hypothesis being finalized at each step
309
+
310
+ # a boolean array indicating if the sentence at the index is finished or not
311
+ finished = [False for i in range(bsz)]
312
+ num_remaining_sent = bsz # number of sentences remaining
313
+
314
+ # number of candidate hypos per step
315
+ cand_size = 2 * beam_size # 2 x beam size in case half are EOS
316
+
317
+ # offset arrays for converting between different indexing schemes
318
+ bbsz_offsets = (
319
+ (torch.arange(0, bsz) * beam_size)
320
+ .unsqueeze(1)
321
+ .type_as(tokens)
322
+ .to(src_tokens.device)
323
+ )
324
+ cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device)
325
+
326
+ reorder_state: Optional[Tensor] = None
327
+ batch_idxs: Optional[Tensor] = None
328
+
329
+ original_batch_idxs: Optional[Tensor] = None
330
+ if "id" in sample and isinstance(sample["id"], Tensor):
331
+ original_batch_idxs = sample["id"]
332
+ else:
333
+ original_batch_idxs = torch.arange(0, bsz).type_as(tokens)
334
+
335
+ for step in range(max_len + 1): # one extra step for EOS marker
336
+ # reorder decoder internal states based on the prev choice of beams
337
+ if reorder_state is not None:
338
+ if batch_idxs is not None:
339
+ # update beam indices to take into account removed sentences
340
+ corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(
341
+ batch_idxs
342
+ )
343
+ reorder_state.view(-1, beam_size).add_(
344
+ corr.unsqueeze(-1) * beam_size
345
+ )
346
+ original_batch_idxs = original_batch_idxs[batch_idxs]
347
+ model.reorder_incremental_state(incremental_states, reorder_state)
348
+ encoder_outs = model.reorder_encoder_out(
349
+ encoder_outs, reorder_state
350
+ )
351
+ with torch.autograd.profiler.record_function("EnsembleModel: forward_decoder"):
352
+ lprobs, avg_attn_scores = model.forward_decoder(
353
+ tokens[:, : step + 1],
354
+ encoder_outs,
355
+ incremental_states,
356
+ self.temperature,
357
+ constraint_trie=self.constraint_trie,
358
+ constraint_start=self.constraint_start,
359
+ constraint_end=self.constraint_end,
360
+ gen_code=self.gen_code,
361
+ zero_shot=self.zero_shot,
362
+ prefix_tokens=prefix_tokens
363
+ )
364
+
365
+ if self.lm_model is not None:
366
+ lm_out = self.lm_model(tokens[:, : step + 1])
367
+ probs = self.lm_model.get_normalized_probs(
368
+ lm_out, log_probs=True, sample=None
369
+ )
370
+ probs = probs[:, -1, :] * self.lm_weight
371
+ lprobs += probs
372
+ # handle prefix tokens (possibly with different lengths)
373
+ if (
374
+ prefix_tokens is not None
375
+ and step < prefix_tokens.size(1)
376
+ and step < max_len
377
+ ):
378
+ lprobs, tokens, scores = self._prefix_tokens(
379
+ step, lprobs, scores, tokens, prefix_tokens, beam_size
380
+ )
381
+ elif step < self.min_len:
382
+ # minimum length constraint (does not apply if using prefix_tokens)
383
+ lprobs[:, self.eos] = -math.inf
384
+
385
+ lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)
386
+
387
+ lprobs[:, self.pad] = -math.inf # never select pad
388
+ lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty
389
+
390
+ if (self.gen_code or self.gen_box) and step < max_len:
391
+ lprobs[:, :4] = -math.inf
392
+ if self.gen_box:
393
+ lprobs[:, -1] = -math.inf
394
+ if (step + 1) % 5 == 0:
395
+ lprobs[:, self.constraint_start:59457] = -math.inf
396
+ else:
397
+ lprobs[:, 59457:] = -math.inf
398
+
399
+ # handle max length constraint
400
+ if step >= max_len:
401
+ lprobs[:, : self.eos] = -math.inf
402
+ lprobs[:, self.eos + 1 :] = -math.inf
403
+ if self.ignore_eos:
404
+ lprobs[:, self.eos] = 1
405
+
406
+ # Record attention scores, only support avg_attn_scores is a Tensor
407
+ if avg_attn_scores is not None:
408
+ if attn is None:
409
+ attn = torch.empty(
410
+ bsz * beam_size, avg_attn_scores.size(1), max_len + 2
411
+ ).to(scores)
412
+ attn[:, :, step + 1].copy_(avg_attn_scores)
413
+
414
+ scores = scores.type_as(lprobs)
415
+ eos_bbsz_idx = torch.empty(0).to(
416
+ tokens
417
+ ) # indices of hypothesis ending with eos (finished sentences)
418
+ eos_scores = torch.empty(0).to(
419
+ scores
420
+ ) # scores of hypothesis ending with eos (finished sentences)
421
+
422
+ if self.should_set_src_lengths:
423
+ self.search.set_src_lengths(src_lengths)
424
+
425
+ if self.repeat_ngram_blocker is not None:
426
+ lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step)
427
+
428
+ # Shape: (batch, cand_size)
429
+ cand_scores, cand_indices, cand_beams = self.search.step(
430
+ step,
431
+ lprobs.view(bsz, -1, self.vocab_size),
432
+ scores.view(bsz, beam_size, -1)[:, :, :step],
433
+ tokens[:, : step + 1],
434
+ original_batch_idxs,
435
+ )
436
+
437
+ # cand_bbsz_idx contains beam indices for the top candidate
438
+ # hypotheses, with a range of values: [0, bsz*beam_size),
439
+ # and dimensions: [bsz, cand_size]
440
+ cand_bbsz_idx = cand_beams.add(bbsz_offsets)
441
+
442
+ # finalize hypotheses that end in eos
443
+ # Shape of eos_mask: (batch size, beam size)
444
+ eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
445
+ eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask)
446
+
447
+ # only consider eos when it's among the top beam_size indices
448
+ # Now we know what beam item(s) to finish
449
+ # Shape: 1d list of absolute-numbered
450
+ eos_bbsz_idx = torch.masked_select(
451
+ cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]
452
+ )
453
+
454
+ finalized_sents: List[int] = []
455
+ if eos_bbsz_idx.numel() > 0:
456
+ eos_scores = torch.masked_select(
457
+ cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
458
+ )
459
+
460
+ finalized_sents = self.finalize_hypos(
461
+ step,
462
+ eos_bbsz_idx,
463
+ eos_scores,
464
+ tokens,
465
+ scores,
466
+ finalized,
467
+ finished,
468
+ beam_size,
469
+ attn,
470
+ src_lengths,
471
+ max_len,
472
+ )
473
+ num_remaining_sent -= len(finalized_sents)
474
+
475
+ assert num_remaining_sent >= 0
476
+ if num_remaining_sent == 0:
477
+ break
478
+ if self.search.stop_on_max_len and step >= max_len:
479
+ break
480
+ assert step < max_len, f"{step} < {max_len}"
481
+
482
+ # Remove finalized sentences (ones for which {beam_size}
483
+ # finished hypotheses have been generated) from the batch.
484
+ if len(finalized_sents) > 0:
485
+ new_bsz = bsz - len(finalized_sents)
486
+
487
+ # construct batch_idxs which holds indices of batches to keep for the next pass
488
+ batch_mask = torch.ones(
489
+ bsz, dtype=torch.bool, device=cand_indices.device
490
+ )
491
+ batch_mask[finalized_sents] = False
492
+ # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
493
+ batch_idxs = torch.arange(
494
+ bsz, device=cand_indices.device
495
+ ).masked_select(batch_mask)
496
+
497
+ # Choose the subset of the hypothesized constraints that will continue
498
+ self.search.prune_sentences(batch_idxs)
499
+
500
+ eos_mask = eos_mask[batch_idxs]
501
+ cand_beams = cand_beams[batch_idxs]
502
+ bbsz_offsets.resize_(new_bsz, 1)
503
+ cand_bbsz_idx = cand_beams.add(bbsz_offsets)
504
+ cand_scores = cand_scores[batch_idxs]
505
+ cand_indices = cand_indices[batch_idxs]
506
+
507
+ if prefix_tokens is not None:
508
+ prefix_tokens = prefix_tokens[batch_idxs]
509
+ src_lengths = src_lengths[batch_idxs]
510
+ cands_to_ignore = cands_to_ignore[batch_idxs]
511
+
512
+ scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
513
+ tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
514
+ if attn is not None:
515
+ attn = attn.view(bsz, -1)[batch_idxs].view(
516
+ new_bsz * beam_size, attn.size(1), -1
517
+ )
518
+ bsz = new_bsz
519
+ else:
520
+ batch_idxs = None
521
+
522
+ # Set active_mask so that values > cand_size indicate eos hypos
523
+ # and values < cand_size indicate candidate active hypos.
524
+ # After, the min values per row are the top candidate active hypos
525
+
526
+ # Rewrite the operator since the element wise or is not supported in torchscript.
527
+
528
+ eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size]))
529
+ active_mask = torch.add(
530
+ eos_mask.type_as(cand_offsets) * cand_size,
531
+ cand_offsets[: eos_mask.size(1)],
532
+ )
533
+
534
+ # get the top beam_size active hypotheses, which are just
535
+ # the hypos with the smallest values in active_mask.
536
+ # {active_hypos} indicates which {beam_size} hypotheses
537
+ # from the list of {2 * beam_size} candidates were
538
+ # selected. Shapes: (batch size, beam size)
539
+ new_cands_to_ignore, active_hypos = torch.topk(
540
+ active_mask, k=beam_size, dim=1, largest=False
541
+ )
542
+
543
+ # update cands_to_ignore to ignore any finalized hypos.
544
+ cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
545
+ # Make sure there is at least one active item for each sentence in the batch.
546
+ assert (~cands_to_ignore).any(dim=1).all()
547
+
548
+ # update cands_to_ignore to ignore any finalized hypos
549
+
550
+ # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam
551
+ # can be selected more than once).
552
+ active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
553
+ active_scores = torch.gather(cand_scores, dim=1, index=active_hypos)
554
+
555
+ active_bbsz_idx = active_bbsz_idx.view(-1)
556
+ active_scores = active_scores.view(-1)
557
+
558
+ # copy tokens and scores for active hypotheses
559
+
560
+ # Set the tokens for each beam (can select the same row more than once)
561
+ tokens[:, : step + 1] = torch.index_select(
562
+ tokens[:, : step + 1], dim=0, index=active_bbsz_idx
563
+ )
564
+ # Select the next token for each of them
565
+ tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(
566
+ cand_indices, dim=1, index=active_hypos
567
+ )
568
+ if step > 0:
569
+ scores[:, :step] = torch.index_select(
570
+ scores[:, :step], dim=0, index=active_bbsz_idx
571
+ )
572
+ scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(
573
+ cand_scores, dim=1, index=active_hypos
574
+ )
575
+
576
+ # Update constraints based on which candidates were selected for the next beam
577
+ self.search.update_constraints(active_hypos)
578
+
579
+ # copy attention for active hypotheses
580
+ if attn is not None:
581
+ attn[:, :, : step + 2] = torch.index_select(
582
+ attn[:, :, : step + 2], dim=0, index=active_bbsz_idx
583
+ )
584
+
585
+ # reorder incremental state in decoder
586
+ reorder_state = active_bbsz_idx
587
+
588
+ # sort by score descending
589
+ for sent in range(len(finalized)):
590
+ scores = torch.tensor(
591
+ [float(elem["score"].item()) for elem in finalized[sent]]
592
+ )
593
+ _, sorted_scores_indices = torch.sort(scores, descending=True)
594
+ finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices]
595
+ finalized[sent] = torch.jit.annotate(
596
+ List[Dict[str, Tensor]], finalized[sent]
597
+ )
598
+ return finalized
599
+
600
+ def _prefix_tokens(
601
+ self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int
602
+ ):
603
+ """Handle prefix tokens"""
604
+ prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1)
605
+ prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1))
606
+ prefix_mask = prefix_toks.ne(self.pad)
607
+ if self.constraint_trie is None:
608
+ lprobs[prefix_mask] = torch.min(prefix_lprobs) - 1
609
+ else:
610
+ lprobs[prefix_mask] = -math.inf
611
+ lprobs[prefix_mask] = lprobs[prefix_mask].scatter(
612
+ -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask]
613
+ )
614
+ # if prefix includes eos, then we should make sure tokens and
615
+ # scores are the same across all beams
616
+ eos_mask = prefix_toks.eq(self.eos)
617
+ if eos_mask.any():
618
+ # validate that the first beam matches the prefix
619
+ first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[
620
+ :, 0, 1 : step + 1
621
+ ]
622
+ eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]
623
+ target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
624
+ assert (first_beam == target_prefix).all()
625
+
626
+ # copy tokens, scores and lprobs from the first beam to all beams
627
+ tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size)
628
+ scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size)
629
+ lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size)
630
+ return lprobs, tokens, scores
631
+
632
+ def replicate_first_beam(self, tensor, mask, beam_size: int):
633
+ tensor = tensor.view(-1, beam_size, tensor.size(-1))
634
+ tensor[mask] = tensor[mask][:, :1, :]
635
+ return tensor.view(-1, tensor.size(-1))
636
+
637
+ def finalize_hypos(
638
+ self,
639
+ step: int,
640
+ bbsz_idx,
641
+ eos_scores,
642
+ tokens,
643
+ scores,
644
+ finalized: List[List[Dict[str, Tensor]]],
645
+ finished: List[bool],
646
+ beam_size: int,
647
+ attn: Optional[Tensor],
648
+ src_lengths,
649
+ max_len: int,
650
+ ):
651
+ """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly.
652
+ A sentence is finalized when {beam_size} finished items have been collected for it.
653
+
654
+ Returns number of sentences (not beam items) being finalized.
655
+ These will be removed from the batch and not processed further.
656
+ Args:
657
+ bbsz_idx (Tensor):
658
+ """
659
+ assert bbsz_idx.numel() == eos_scores.numel()
660
+
661
+ # clone relevant token and attention tensors.
662
+ # tokens is (batch * beam, max_len). So the index_select
663
+ # gets the newly EOS rows, then selects cols 1..{step + 2}
664
+ tokens_clone = tokens.index_select(0, bbsz_idx)[
665
+ :, 1 : step + 2
666
+ ] # skip the first index, which is EOS
667
+
668
+ tokens_clone[:, step] = self.eos
669
+ attn_clone = (
670
+ attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2]
671
+ if attn is not None
672
+ else None
673
+ )
674
+
675
+ # compute scores per token position
676
+ pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1]
677
+ pos_scores[:, step] = eos_scores
678
+ # convert from cumulative to per-position scores
679
+ pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
680
+
681
+ # normalize sentence-level scores
682
+ if self.normalize_scores:
683
+ eos_scores /= (step + 1) ** self.len_penalty
684
+
685
+ # cum_unfin records which sentences in the batch are finished.
686
+ # It helps match indexing between (a) the original sentences
687
+ # in the batch and (b) the current, possibly-reduced set of
688
+ # sentences.
689
+ cum_unfin: List[int] = []
690
+ prev = 0
691
+ for f in finished:
692
+ if f:
693
+ prev += 1
694
+ else:
695
+ cum_unfin.append(prev)
696
+ cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx)
697
+
698
+ unfin_idx = bbsz_idx // beam_size
699
+ sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx)
700
+
701
+ # Create a set of "{sent}{unfin_idx}", where
702
+ # "unfin_idx" is the index in the current (possibly reduced)
703
+ # list of sentences, and "sent" is the index in the original,
704
+ # unreduced batch
705
+ # For every finished beam item
706
+ # sentence index in the current (possibly reduced) batch
707
+ seen = (sent << 32) + unfin_idx
708
+ unique_seen: List[int] = torch.unique(seen).tolist()
709
+
710
+ if self.match_source_len:
711
+ condition = step > torch.index_select(src_lengths, 0, unfin_idx)
712
+ eos_scores = torch.where(condition, torch.tensor(-math.inf), eos_scores)
713
+ sent_list: List[int] = sent.tolist()
714
+ for i in range(bbsz_idx.size()[0]):
715
+ # An input sentence (among those in a batch) is finished when
716
+ # beam_size hypotheses have been collected for it
717
+ if len(finalized[sent_list[i]]) < beam_size:
718
+ if attn_clone is not None:
719
+ # remove padding tokens from attn scores
720
+ hypo_attn = attn_clone[i]
721
+ else:
722
+ hypo_attn = torch.empty(0)
723
+
724
+ finalized[sent_list[i]].append(
725
+ {
726
+ "tokens": tokens_clone[i],
727
+ "score": eos_scores[i],
728
+ "attention": hypo_attn, # src_len x tgt_len
729
+ "alignment": torch.empty(0),
730
+ "positional_scores": pos_scores[i],
731
+ }
732
+ )
733
+
734
+ newly_finished: List[int] = []
735
+ for unique_s in unique_seen:
736
+ # check termination conditions for this sentence
737
+ unique_sent: int = unique_s >> 32
738
+ unique_unfin_idx: int = unique_s - (unique_sent << 32)
739
+
740
+ if not finished[unique_sent] and self.is_finished(
741
+ step, unique_unfin_idx, max_len, len(finalized[unique_sent]), beam_size
742
+ ):
743
+ finished[unique_sent] = True
744
+ newly_finished.append(unique_unfin_idx)
745
+
746
+ return newly_finished
747
+
748
+ def is_finished(
749
+ self,
750
+ step: int,
751
+ unfin_idx: int,
752
+ max_len: int,
753
+ finalized_sent_len: int,
754
+ beam_size: int,
755
+ ):
756
+ """
757
+ Check whether decoding for a sentence is finished, which
758
+ occurs when the list of finalized sentences has reached the
759
+ beam size, or when we reach the maximum length.
760
+ """
761
+ assert finalized_sent_len <= beam_size
762
+ if finalized_sent_len == beam_size or step == max_len:
763
+ return True
764
+ return False
765
+
766
+
767
+ class EnsembleModel(nn.Module):
768
+ """A wrapper around an ensemble of models."""
769
+
770
+ def __init__(self, models):
771
+ super().__init__()
772
+ self.models_size = len(models)
773
+ # method '__len__' is not supported in ModuleList for torch script
774
+ self.single_model = models[0]
775
+ self.models = nn.ModuleList(models)
776
+
777
+ self.has_incremental: bool = False
778
+ if all(
779
+ hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder)
780
+ for m in models
781
+ ):
782
+ self.has_incremental = True
783
+
784
+ def forward(self):
785
+ pass
786
+
787
+ def has_encoder(self):
788
+ return hasattr(self.single_model, "encoder")
789
+
790
+ def has_incremental_states(self):
791
+ return self.has_incremental
792
+
793
+ def max_decoder_positions(self):
794
+ return min([m.max_decoder_positions() for m in self.models if hasattr(m, "max_decoder_positions")] + [sys.maxsize])
795
+
796
+ @torch.jit.export
797
+ def forward_encoder(self, net_input: Dict[str, Tensor]):
798
+ if not self.has_encoder():
799
+ return None
800
+ return [model.encoder.forward_torchscript(net_input) for model in self.models]
801
+
802
+ @torch.jit.export
803
+ def forward_decoder(
804
+ self,
805
+ tokens,
806
+ encoder_outs: List[Dict[str, List[Tensor]]],
807
+ incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
808
+ temperature: float = 1.0,
809
+ constraint_trie=None,
810
+ constraint_start=None,
811
+ constraint_end=None,
812
+ gen_code=False,
813
+ zero_shot=False,
814
+ prefix_tokens=None
815
+ ):
816
+ log_probs = []
817
+ avg_attn: Optional[Tensor] = None
818
+ encoder_out: Optional[Dict[str, List[Tensor]]] = None
819
+ code_mask = (tokens.new_ones(tokens.size(0))*gen_code).bool()
820
+ for i, model in enumerate(self.models):
821
+ if self.has_encoder():
822
+ encoder_out = encoder_outs[i]
823
+ # decode each model
824
+ if self.has_incremental_states():
825
+ decoder_out = model.decoder.forward(
826
+ tokens,
827
+ code_masks=code_mask,
828
+ encoder_out=encoder_out,
829
+ incremental_state=incremental_states[i],
830
+ )
831
+ else:
832
+ if hasattr(model, "decoder"):
833
+ decoder_out = model.decoder.forward(tokens, code_masks=code_mask, encoder_out=encoder_out)
834
+ else:
835
+ decoder_out = model.forward(tokens)
836
+
837
+ attn: Optional[Tensor] = None
838
+ decoder_len = len(decoder_out)
839
+ if decoder_len > 1 and decoder_out[1] is not None:
840
+ if isinstance(decoder_out[1], Tensor):
841
+ attn = decoder_out[1]
842
+ else:
843
+ attn_holder = decoder_out[1]["attn"]
844
+ if isinstance(attn_holder, Tensor):
845
+ attn = attn_holder
846
+ elif attn_holder is not None:
847
+ attn = attn_holder[0]
848
+ if attn is not None:
849
+ attn = attn[:, -1, :]
850
+
851
+ decoder_out_tuple = (
852
+ decoder_out[0][:, -1:, :].div_(temperature),
853
+ None if decoder_len <= 1 else decoder_out[1],
854
+ )
855
+
856
+ beam_size = decoder_out_tuple[0].size(0) // prefix_tokens.size(0) if prefix_tokens is not None else 0
857
+ if constraint_trie is not None and not zero_shot:
858
+ assert constraint_start is None and constraint_end is None
859
+ constraint_masks = decoder_out_tuple[0].new_zeros(decoder_out_tuple[0].size()).bool()
860
+ constraint_prefix_tokens = tokens.tolist()
861
+ for token_index, constraint_prefix_token in enumerate(constraint_prefix_tokens):
862
+ prefix_len = prefix_tokens[token_index // beam_size].ne(1).sum().item() if prefix_tokens is not None else 0
863
+ if len(constraint_prefix_token) > prefix_len:
864
+ constraint_prefix_token = [0] + constraint_prefix_token[prefix_len+1:]
865
+ constraint_nodes = constraint_trie.get_next_layer(constraint_prefix_token)
866
+ constraint_masks[token_index][:, constraint_nodes] = True
867
+ else:
868
+ constraint_masks[token_index] = True
869
+ decoder_out_tuple[0].masked_fill_(~constraint_masks, -math.inf)
870
+ if constraint_start is not None and constraint_end is not None and not zero_shot:
871
+ assert constraint_trie is None
872
+ decoder_out_tuple[0][:, :, 4:constraint_start] = -math.inf
873
+ decoder_out_tuple[0][:, :, constraint_end:] = -math.inf
874
+
875
+ probs = model.get_normalized_probs(
876
+ decoder_out_tuple, log_probs=True, sample=None
877
+ )
878
+ if constraint_trie is not None and zero_shot:
879
+ assert constraint_start is None and constraint_end is None
880
+ constraint_masks = decoder_out_tuple[0].new_zeros(decoder_out_tuple[0].size()).bool()
881
+ constraint_prefix_tokens = tokens.tolist()
882
+ for token_index, constraint_prefix_token in enumerate(constraint_prefix_tokens):
883
+ constraint_nodes = constraint_trie.get_next_layer(constraint_prefix_token)
884
+ constraint_masks[token_index][:, constraint_nodes] = True
885
+ probs.masked_fill_(~constraint_masks, -math.inf)
886
+ if constraint_start is not None and constraint_end is not None and zero_shot:
887
+ assert constraint_trie is None
888
+ probs[:, :, 4:constraint_start] = -math.inf
889
+ probs[:, :, constraint_end:] = -math.inf
890
+ probs = probs[:, -1, :]
891
+ if self.models_size == 1:
892
+ return probs, attn
893
+
894
+ log_probs.append(probs)
895
+ if attn is not None:
896
+ if avg_attn is None:
897
+ avg_attn = attn
898
+ else:
899
+ avg_attn.add_(attn)
900
+
901
+ avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(
902
+ self.models_size
903
+ )
904
+
905
+ if avg_attn is not None:
906
+ avg_attn.div_(self.models_size)
907
+ return avg_probs, avg_attn
908
+
909
+ @torch.jit.export
910
+ def reorder_encoder_out(
911
+ self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order
912
+ ):
913
+ """
914
+ Reorder encoder output according to *new_order*.
915
+
916
+ Args:
917
+ encoder_out: output from the ``forward()`` method
918
+ new_order (LongTensor): desired order
919
+
920
+ Returns:
921
+ *encoder_out* rearranged according to *new_order*
922
+ """
923
+ new_outs: List[Dict[str, List[Tensor]]] = []
924
+ if not self.has_encoder():
925
+ return new_outs
926
+ for i, model in enumerate(self.models):
927
+ assert encoder_outs is not None
928
+ new_outs.append(
929
+ model.encoder.reorder_encoder_out(encoder_outs[i], new_order)
930
+ )
931
+ return new_outs
932
+
933
+ @torch.jit.export
934
+ def reorder_incremental_state(
935
+ self,
936
+ incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
937
+ new_order,
938
+ ):
939
+ if not self.has_incremental_states():
940
+ return
941
+ for i, model in enumerate(self.models):
942
+ model.decoder.reorder_incremental_state_scripting(
943
+ incremental_states[i], new_order
944
+ )
945
+
946
+
947
+ class SequenceGeneratorWithAlignment(SequenceGenerator):
948
+ def __init__(
949
+ self, models, tgt_dict, left_pad_target=False, print_alignment="hard", **kwargs
950
+ ):
951
+ """Generates translations of a given source sentence.
952
+
953
+ Produces alignments following "Jointly Learning to Align and
954
+ Translate with Transformer Models" (Garg et al., EMNLP 2019).
955
+
956
+ Args:
957
+ left_pad_target (bool, optional): Whether or not the
958
+ hypothesis should be left padded or not when they are
959
+ teacher forced for generating alignments.
960
+ """
961
+ super().__init__(EnsembleModelWithAlignment(models), tgt_dict, **kwargs)
962
+ self.left_pad_target = left_pad_target
963
+
964
+ if print_alignment == "hard":
965
+ self.extract_alignment = utils.extract_hard_alignment
966
+ elif print_alignment == "soft":
967
+ self.extract_alignment = utils.extract_soft_alignment
968
+
969
+ @torch.no_grad()
970
+ def generate(self, models, sample, **kwargs):
971
+ finalized = super()._generate(sample, **kwargs)
972
+
973
+ src_tokens = sample["net_input"]["src_tokens"]
974
+ bsz = src_tokens.shape[0]
975
+ beam_size = self.beam_size
976
+ (
977
+ src_tokens,
978
+ src_lengths,
979
+ prev_output_tokens,
980
+ tgt_tokens,
981
+ ) = self._prepare_batch_for_alignment(sample, finalized)
982
+ if any(getattr(m, "full_context_alignment", False) for m in self.model.models):
983
+ attn = self.model.forward_align(src_tokens, src_lengths, prev_output_tokens)
984
+ else:
985
+ attn = [
986
+ finalized[i // beam_size][i % beam_size]["attention"].transpose(1, 0)
987
+ for i in range(bsz * beam_size)
988
+ ]
989
+
990
+ if src_tokens.device != "cpu":
991
+ src_tokens = src_tokens.to("cpu")
992
+ tgt_tokens = tgt_tokens.to("cpu")
993
+ attn = [i.to("cpu") for i in attn]
994
+
995
+ # Process the attn matrix to extract hard alignments.
996
+ for i in range(bsz * beam_size):
997
+ alignment = self.extract_alignment(
998
+ attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos
999
+ )
1000
+ finalized[i // beam_size][i % beam_size]["alignment"] = alignment
1001
+ return finalized
1002
+
1003
+ def _prepare_batch_for_alignment(self, sample, hypothesis):
1004
+ src_tokens = sample["net_input"]["src_tokens"]
1005
+ bsz = src_tokens.shape[0]
1006
+ src_tokens = (
1007
+ src_tokens[:, None, :]
1008
+ .expand(-1, self.beam_size, -1)
1009
+ .contiguous()
1010
+ .view(bsz * self.beam_size, -1)
1011
+ )
1012
+ src_lengths = sample["net_input"]["src_lengths"]
1013
+ src_lengths = (
1014
+ src_lengths[:, None]
1015
+ .expand(-1, self.beam_size)
1016
+ .contiguous()
1017
+ .view(bsz * self.beam_size)
1018
+ )
1019
+ prev_output_tokens = data_utils.collate_tokens(
1020
+ [beam["tokens"] for example in hypothesis for beam in example],
1021
+ self.pad,
1022
+ self.eos,
1023
+ self.left_pad_target,
1024
+ move_eos_to_beginning=True,
1025
+ )
1026
+ tgt_tokens = data_utils.collate_tokens(
1027
+ [beam["tokens"] for example in hypothesis for beam in example],
1028
+ self.pad,
1029
+ self.eos,
1030
+ self.left_pad_target,
1031
+ move_eos_to_beginning=False,
1032
+ )
1033
+ return src_tokens, src_lengths, prev_output_tokens, tgt_tokens
1034
+
1035
+
1036
+ class EnsembleModelWithAlignment(EnsembleModel):
1037
+ """A wrapper around an ensemble of models."""
1038
+
1039
+ def __init__(self, models):
1040
+ super().__init__(models)
1041
+
1042
+ def forward_align(self, src_tokens, src_lengths, prev_output_tokens):
1043
+ avg_attn = None
1044
+ for model in self.models:
1045
+ decoder_out = model(src_tokens, src_lengths, prev_output_tokens)
1046
+ attn = decoder_out[1]["attn"][0]
1047
+ if avg_attn is None:
1048
+ avg_attn = attn
1049
+ else:
1050
+ avg_attn.add_(attn)
1051
+ if len(self.models) > 1:
1052
+ avg_attn.div_(len(self.models))
1053
+ return avg_attn
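
For reference, a hedged sketch of invoking the generator above on one batch; `generator`, `models`, and `tgt_dict` are assumed to come from the previous sketch, and `sample` is assumed to be a batch from the task's dataloader with the "net_input" layout that _generate expects.

# Sketch (assumptions as above): decoding one batch and reading hypotheses.
import torch

with torch.no_grad():
    hypos = generator.generate(models, sample)  # List[List[Dict[str, Tensor]]]

for sent_id, beam_hypos in enumerate(hypos):
    best = beam_hypos[0]                 # finalized hypotheses are sorted best-first
    tokens = best["tokens"]              # generated token ids, ending in EOS
    score = best["score"].item()         # log-probability (length-normalized if normalize_scores)
    print(sent_id, score, tgt_dict.string(tokens))
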
ofa_module/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ import data
2
+ import models
3
+ import tasks
4
+ import criterions
5
+ import utils
run_scripts/caption/coco_eval.py ADDED
@@ -0,0 +1,42 @@
1
+ import json
2
+ import sys
3
+ import os.path as op
4
+
5
+ from pycocotools.coco import COCO
6
+ from pycocoevalcap.eval import COCOEvalCap
7
+
8
+
9
+ def evaluate_on_coco_caption(res_file, label_file, outfile=None):
10
+ """
11
+ res_file: txt file, each row is [image_key, json format list of captions].
12
+ Each caption is a dict, with fields "caption", "conf".
13
+ label_file: JSON file of ground truth captions in COCO format.
14
+ """
15
+ coco = COCO(label_file)
16
+ cocoRes = coco.loadRes(res_file)
17
+ cocoEval = COCOEvalCap(coco, cocoRes)
18
+
19
+ # evaluate on a subset of images by setting
20
+ # cocoEval.params['image_id'] = cocoRes.getImgIds()
21
+ # please remove this line when evaluating the full validation set
22
+ cocoEval.params['image_id'] = cocoRes.getImgIds()
23
+
24
+ # evaluate results
25
+ # SPICE will take a few minutes the first time, but speeds up due to caching
26
+ cocoEval.evaluate()
27
+ result = cocoEval.eval
28
+ if not outfile:
29
+ print(result)
30
+ else:
31
+ with open(outfile, 'w') as fp:
32
+ json.dump(result, fp, indent=4)
33
+ return result
34
+
35
+
36
+ if __name__ == "__main__":
37
+ if len(sys.argv) == 3:
38
+ evaluate_on_coco_caption(sys.argv[1], sys.argv[2])
39
+ elif len(sys.argv) == 4:
40
+ evaluate_on_coco_caption(sys.argv[1], sys.argv[2], sys.argv[3])
41
+ else:
42
+ raise NotImplementedError
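
The evaluator above can also be called directly from Python; a small sketch with placeholder paths follows (the output file name is hypothetical, and the metric keys follow pycocoevalcap's standard naming such as "CIDEr" and "Bleu_4").

# Sketch: programmatic use of evaluate_on_coco_caption (paths are placeholders).
from coco_eval import evaluate_on_coco_caption

metrics = evaluate_on_coco_caption(
    "../../results/caption/test_predict.json",
    "../../dataset/caption_data/test_caption_coco_format.json",
    outfile="../../results/caption/coco_metrics.json",
)
print(metrics.get("CIDEr"), metrics.get("Bleu_4"))
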
run_scripts/caption/evaluate_caption.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1081
6
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
7
+ export GPUS_PER_NODE=8
8
+
9
+ user_dir=../../ofa_module
10
+ bpe_dir=../../utils/BPE
11
+
12
+ data=../../dataset/caption_data/caption_test.tsv
13
+ path=../../checkpoints/caption_large_best_clean.pt
14
+ result_path=../../results/caption
15
+ selected_cols=1,4,2
16
+ split='test'
17
+
18
+ python3 -m torch.distributed.launch --nproc_per_node=${GPUS_PER_NODE} --master_port=${MASTER_PORT} ../../evaluate.py \
19
+ ${data} \
20
+ --path=${path} \
21
+ --user-dir=${user_dir} \
22
+ --task=caption \
23
+ --batch-size=16 \
24
+ --log-format=simple --log-interval=10 \
25
+ --seed=7 \
26
+ --gen-subset=${split} \
27
+ --results-path=${result_path} \
28
+ --beam=5 \
29
+ --max-len-b=16 \
30
+ --no-repeat-ngram-size=3 \
31
+ --fp16 \
32
+ --num-workers=0 \
33
+ --model-overrides="{\"data\":\"${data}\",\"bpe_dir\":\"${bpe_dir}\",\"eval_cider\":False,\"selected_cols\":\"${selected_cols}\"}"
34
+
35
+ python coco_eval.py ../../results/caption/test_predict.json ../../dataset/caption_data/test_caption_coco_format.json
run_scripts/caption/evaluate_caption_base.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1091
6
+
7
+ user_dir=../../ofa_module
8
+ bpe_dir=../../utils/BPE
9
+
10
+ data=../../dataset/caption_data/caption_test.tsv
11
+ path=../../checkpoints/caption_base_best.pt
12
+ result_path=../../results/caption
13
+ selected_cols=1,4,2
14
+ split='test'
15
+
16
+ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m torch.distributed.launch --nproc_per_node=4 --master_port=${MASTER_PORT} ../../evaluate.py \
17
+ ${data} \
18
+ --path=${path} \
19
+ --user-dir=${user_dir} \
20
+ --task=caption \
21
+ --batch-size=16 \
22
+ --log-format=simple --log-interval=10 \
23
+ --seed=7 \
24
+ --gen-subset=${split} \
25
+ --results-path=${result_path} \
26
+ --beam=5 \
27
+ --max-len-b=16 \
28
+ --no-repeat-ngram-size=3 \
29
+ --fp16 \
30
+ --num-workers=0 \
31
+ --model-overrides="{\"data\":\"${data}\",\"bpe_dir\":\"${bpe_dir}\",\"eval_cider\":False,\"selected_cols\":\"${selected_cols}\"}"
32
+
33
+ python coco_eval.py ../../results/caption/test_predict.json ../../dataset/caption_data/test_caption_coco_format.json
run_scripts/caption/train_caption_stage1.sh ADDED
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1051
6
+
7
+ log_dir=./stage1_logs
8
+ save_dir=./stage1_checkpoints
9
+ mkdir -p $log_dir $save_dir
10
+
11
+ bpe_dir=../../utils/BPE
12
+ user_dir=../../ofa_module
13
+
14
+ data_dir=../../dataset/caption_data
15
+ data=${data_dir}/caption_stage1_train.tsv,${data_dir}/caption_val.tsv
16
+ restore_file=../../checkpoints/ofa_large.pt
17
+ selected_cols=0,4,2
18
+
19
+ task=caption
20
+ arch=ofa_large
21
+ criterion=adjust_label_smoothed_cross_entropy
22
+ label_smoothing=0.1
23
+ lr=1e-5
24
+ max_epoch=5
25
+ warmup_ratio=0.06
26
+ batch_size=8
27
+ update_freq=4
28
+ resnet_drop_path_rate=0.0
29
+ encoder_drop_path_rate=0.1
30
+ decoder_drop_path_rate=0.1
31
+ dropout=0.1
32
+ attention_dropout=0.0
33
+ max_src_length=80
34
+ max_tgt_length=20
35
+ num_bins=1000
36
+ patch_image_size=480
37
+ eval_cider_cached=${data_dir}/cider_cached_tokens/coco-valid-words.p
38
+ drop_worst_ratio=0.2
39
+
40
+ for max_epoch in {2,}; do
41
+ echo "max_epoch "${max_epoch}
42
+ for warmup_ratio in {0.06,}; do
43
+ echo "warmup_ratio "${warmup_ratio}
44
+ for drop_worst_after in {2500,}; do
45
+ echo "drop_worst_after "${drop_worst_after}
46
+
47
+ log_file=${log_dir}/${max_epoch}"_"${warmup_ratio}"_"${drop_worst_after}".log"
48
+ save_path=${save_dir}/${max_epoch}"_"${warmup_ratio}"_"${drop_worst_after}
49
+ mkdir -p $save_path
50
+
51
+ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --master_port=${MASTER_PORT} ../../train.py \
52
+ $data \
53
+ --selected-cols=${selected_cols} \
54
+ --bpe-dir=${bpe_dir} \
55
+ --user-dir=${user_dir} \
56
+ --restore-file=${restore_file} \
57
+ --reset-optimizer --reset-dataloader --reset-meters \
58
+ --save-dir=${save_path} \
59
+ --task=${task} \
60
+ --arch=${arch} \
61
+ --criterion=${criterion} \
62
+ --label-smoothing=${label_smoothing} \
63
+ --batch-size=${batch_size} \
64
+ --update-freq=${update_freq} \
65
+ --encoder-normalize-before \
66
+ --decoder-normalize-before \
67
+ --share-decoder-input-output-embed \
68
+ --share-all-embeddings \
69
+ --layernorm-embedding \
70
+ --patch-layernorm-embedding \
71
+ --code-layernorm-embedding \
72
+ --resnet-drop-path-rate=${resnet_drop_path_rate} \
73
+ --encoder-drop-path-rate=${encoder_drop_path_rate} \
74
+ --decoder-drop-path-rate=${decoder_drop_path_rate} \
75
+ --dropout=${dropout} \
76
+ --attention-dropout=${attention_dropout} \
77
+ --weight-decay=0.01 --optimizer=adam --adam-betas="(0.9,0.999)" --adam-eps=1e-08 --clip-norm=1.0 \
78
+ --lr-scheduler=polynomial_decay --lr=${lr} \
79
+ --max-epoch=${max_epoch} --warmup-ratio=${warmup_ratio} \
80
+ --log-format=simple --log-interval=10 \
81
+ --fixed-validation-seed=7 \
82
+ --no-epoch-checkpoints --keep-best-checkpoints=1 \
83
+ --save-interval=1 --validate-interval=1 \
84
+ --save-interval-updates=500 --validate-interval-updates=500 \
85
+ --eval-cider \
86
+ --eval-cider-cached-tokens=${eval_cider_cached} \
87
+ --eval-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
88
+ --best-checkpoint-metric=cider --maximize-best-checkpoint-metric \
89
+ --max-src-length=${max_src_length} \
90
+ --max-tgt-length=${max_tgt_length} \
91
+ --find-unused-parameters \
92
+ --freeze-encoder-embedding \
93
+ --freeze-decoder-embedding \
94
+ --add-type-embedding \
95
+ --scale-attn \
96
+ --scale-fc \
97
+ --scale-heads \
98
+ --disable-entangle \
99
+ --num-bins=${num_bins} \
100
+ --patch-image-size=${patch_image_size} \
101
+ --drop-worst-ratio=${drop_worst_ratio} \
102
+ --drop-worst-after=${drop_worst_after} \
103
+ --fp16 \
104
+ --fp16-scale-window=512 \
105
+ --num-workers=0 > ${log_file} 2>&1
106
+ done
107
+ done
108
+ done
run_scripts/caption/train_caption_stage1_base.sh ADDED
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1061
6
+
7
+ log_dir=./stage1_logs
8
+ save_dir=./stage1_checkpoints
9
+ mkdir -p $log_dir $save_dir
10
+
11
+ bpe_dir=../../utils/BPE
12
+ user_dir=../../ofa_module
13
+
14
+ data_dir=../../dataset/caption_data
15
+ data=${data_dir}/caption_stage1_train.tsv,${data_dir}/caption_val.tsv
16
+ restore_file=../../checkpoints/ofa_base.pt
17
+ selected_cols=0,4,2
18
+
19
+ task=caption
20
+ arch=ofa_base
21
+ criterion=adjust_label_smoothed_cross_entropy
22
+ label_smoothing=0.1
23
+ lr=1e-5
24
+ max_epoch=5
25
+ warmup_ratio=0.06
26
+ batch_size=8
27
+ update_freq=4
28
+ resnet_drop_path_rate=0.0
29
+ encoder_drop_path_rate=0.1
30
+ decoder_drop_path_rate=0.1
31
+ dropout=0.1
32
+ attention_dropout=0.0
33
+ max_src_length=80
34
+ max_tgt_length=20
35
+ num_bins=1000
36
+ patch_image_size=480
37
+ eval_cider_cached=${data_dir}/cider_cached_tokens/coco-valid-words.p
38
+ drop_worst_ratio=0.2
39
+
40
+ for max_epoch in {5,}; do
41
+ echo "max_epoch "${max_epoch}
42
+ for warmup_ratio in {0.06,}; do
43
+ echo "warmup_ratio "${warmup_ratio}
44
+ for drop_worst_after in {6000,}; do
45
+ echo "drop_worst_after "${drop_worst_after}
46
+
47
+ log_file=${log_dir}/${max_epoch}"_"${warmup_ratio}"_"${drop_worst_after}".log"
48
+ save_path=${save_dir}/${max_epoch}"_"${warmup_ratio}"_"${drop_worst_after}
49
+ mkdir -p $save_path
50
+
51
+ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --master_port=${MASTER_PORT} ../../train.py \
52
+ $data \
53
+ --selected-cols=${selected_cols} \
54
+ --bpe-dir=${bpe_dir} \
55
+ --user-dir=${user_dir} \
56
+ --restore-file=${restore_file} \
57
+ --reset-optimizer --reset-dataloader --reset-meters \
58
+ --save-dir=${save_path} \
59
+ --task=${task} \
60
+ --arch=${arch} \
61
+ --criterion=${criterion} \
62
+ --label-smoothing=${label_smoothing} \
63
+ --batch-size=${batch_size} \
64
+ --update-freq=${update_freq} \
65
+ --encoder-normalize-before \
66
+ --decoder-normalize-before \
67
+ --share-decoder-input-output-embed \
68
+ --share-all-embeddings \
69
+ --layernorm-embedding \
70
+ --patch-layernorm-embedding \
71
+ --code-layernorm-embedding \
72
+ --resnet-drop-path-rate=${resnet_drop_path_rate} \
73
+ --encoder-drop-path-rate=${encoder_drop_path_rate} \
74
+ --decoder-drop-path-rate=${decoder_drop_path_rate} \
75
+ --dropout=${dropout} \
76
+ --attention-dropout=${attention_dropout} \
77
+ --weight-decay=0.01 --optimizer=adam --adam-betas="(0.9,0.999)" --adam-eps=1e-08 --clip-norm=1.0 \
78
+ --lr-scheduler=polynomial_decay --lr=${lr} \
79
+ --max-epoch=${max_epoch} --warmup-ratio=${warmup_ratio} \
80
+ --log-format=simple --log-interval=10 \
81
+ --fixed-validation-seed=7 \
82
+ --no-epoch-checkpoints --keep-best-checkpoints=1 \
83
+ --save-interval=1 --validate-interval=1 \
84
+ --save-interval-updates=500 --validate-interval-updates=500 \
85
+ --eval-cider \
86
+ --eval-cider-cached-tokens=${eval_cider_cached} \
87
+ --eval-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
88
+ --best-checkpoint-metric=cider --maximize-best-checkpoint-metric \
89
+ --max-src-length=${max_src_length} \
90
+ --max-tgt-length=${max_tgt_length} \
91
+ --find-unused-parameters \
92
+ --freeze-encoder-embedding \
93
+ --freeze-decoder-embedding \
94
+ --add-type-embedding \
95
+ --scale-attn \
96
+ --scale-fc \
97
+ --scale-heads \
98
+ --disable-entangle \
99
+ --num-bins=${num_bins} \
100
+ --patch-image-size=${patch_image_size} \
101
+ --drop-worst-ratio=${drop_worst_ratio} \
102
+ --drop-worst-after=${drop_worst_after} \
103
+ --fp16 \
104
+ --fp16-scale-window=512 \
105
+ --num-workers=0 > ${log_file} 2>&1
106
+ done
107
+ done
108
+ done
run_scripts/caption/train_caption_stage1_el.sh ADDED
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1051
6
+
7
+ log_dir=./stage1_logs
8
+ save_dir=./stage1_checkpoints
9
+ mkdir -p $log_dir $save_dir
10
+
11
+ bpe_dir=../../utils/BPE
12
+ user_dir=../../ofa_module
13
+
14
+ data_dir=../../dataset/caption_data
15
+ data=${data_dir}/caption_stage1_train.tsv,${data_dir}/caption_val.tsv
16
+ restore_file=../../checkpoints/ofa_large.pt
17
+ selected_cols=0,4,2
18
+
19
+ task=caption
20
+ arch=ofa_large
21
+ criterion=adjust_label_smoothed_encouraging_loss # for el
22
+ label_smoothing=0.1
23
+ lr=1e-5
24
+ max_epoch=5
25
+ warmup_ratio=0.06
26
+ batch_size=8
27
+ update_freq=4
28
+ resnet_drop_path_rate=0.0
29
+ encoder_drop_path_rate=0.1
30
+ decoder_drop_path_rate=0.1
31
+ dropout=0.1
32
+ attention_dropout=0.0
33
+ max_src_length=80
34
+ max_tgt_length=20
35
+ num_bins=1000
36
+ patch_image_size=480
37
+ eval_cider_cached=${data_dir}/cider_cached_tokens/coco-valid-words.p
38
+ drop_worst_ratio=0.05 # modified from 0.2 for el
39
+ log_end=0.75 # for el
40
+ for max_epoch in {2,}; do
41
+ echo "max_epoch "${max_epoch}
42
+ for warmup_ratio in {0.06,}; do
43
+ echo "warmup_ratio "${warmup_ratio}
44
+ for drop_worst_after in {2500,}; do
45
+ echo "drop_worst_after "${drop_worst_after}
46
+
47
+ log_file=${log_dir}/${max_epoch}"_"${warmup_ratio}"_"${drop_worst_after}_el${log_end}_".log"
48
+ save_path=${save_dir}/${max_epoch}"_"${warmup_ratio}"_"${drop_worst_after}_el${log_end}_
49
+ mkdir -p $save_path
50
+
51
+ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --master_port=${MASTER_PORT} ../../train.py \
52
+ $data \
53
+ --selected-cols=${selected_cols} \
54
+ --bpe-dir=${bpe_dir} \
55
+ --user-dir=${user_dir} \
56
+ --restore-file=${restore_file} \
57
+ --reset-optimizer --reset-dataloader --reset-meters \
58
+ --save-dir=${save_path} \
59
+ --task=${task} \
60
+ --arch=${arch} \
61
+ --criterion=${criterion} \
62
+ --label-smoothing=${label_smoothing} \
63
+ --batch-size=${batch_size} \
64
+ --update-freq=${update_freq} \
65
+ --encoder-normalize-before \
66
+ --decoder-normalize-before \
67
+ --share-decoder-input-output-embed \
68
+ --share-all-embeddings \
69
+ --layernorm-embedding \
70
+ --patch-layernorm-embedding \
71
+ --code-layernorm-embedding \
72
+ --resnet-drop-path-rate=${resnet_drop_path_rate} \
73
+ --encoder-drop-path-rate=${encoder_drop_path_rate} \
74
+ --decoder-drop-path-rate=${decoder_drop_path_rate} \
75
+ --dropout=${dropout} \
76
+ --attention-dropout=${attention_dropout} \
77
+ --weight-decay=0.01 --optimizer=adam --adam-betas="(0.9,0.999)" --adam-eps=1e-08 --clip-norm=1.0 \
78
+ --lr-scheduler=polynomial_decay --lr=${lr} \
79
+ --max-epoch=${max_epoch} --warmup-ratio=${warmup_ratio} \
80
+ --log-format=simple --log-interval=10 \
81
+ --fixed-validation-seed=7 \
82
+ --no-epoch-checkpoints --keep-best-checkpoints=1 \
83
+ --save-interval=1 --validate-interval=1 \
84
+ --save-interval-updates=500 --validate-interval-updates=500 \
85
+ --eval-cider \
86
+ --eval-cider-cached-tokens=${eval_cider_cached} \
87
+ --eval-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
88
+ --best-checkpoint-metric=cider --maximize-best-checkpoint-metric \
89
+ --max-src-length=${max_src_length} \
90
+ --max-tgt-length=${max_tgt_length} \
91
+ --find-unused-parameters \
92
+ --freeze-encoder-embedding \
93
+ --freeze-decoder-embedding \
94
+ --add-type-embedding \
95
+ --scale-attn \
96
+ --scale-fc \
97
+ --scale-heads \
98
+ --disable-entangle \
99
+ --num-bins=${num_bins} \
100
+ --patch-image-size=${patch_image_size} \
101
+ --drop-worst-ratio=${drop_worst_ratio} \
102
+ --drop-worst-after=${drop_worst_after} \
103
+ --log-end ${log_end} \
104
+ --fp16 \
105
+ --fp16-scale-window=512 \
106
+ --num-workers=0 > ${log_file} 2>&1
107
+ done
108
+ done
109
+ done
run_scripts/caption/train_caption_stage1_el_db.sh ADDED
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1051
6
+
7
+ log_dir=./stage1_logs
8
+ save_dir=./stage1_checkpoints
9
+ mkdir -p $log_dir $save_dir
10
+
11
+ bpe_dir=../../utils/BPE
12
+ user_dir=../../ofa_module
13
+
14
+ data_dir=../../dataset/caption_data
15
+ data=${data_dir}/caption_stage1_train.tsv,${data_dir}/caption_val.tsv
16
+ restore_file=../../checkpoints/ofa_large.pt
17
+ selected_cols=0,4,2
18
+
19
+ task=caption
20
+ arch=ofa_large
21
+ criterion=adjust_label_smoothed_encouraging_loss # for el
22
+ label_smoothing=0.1
23
+ lr=1e-5
24
+ max_epoch=5
25
+ warmup_ratio=0.06
26
+ batch_size=8
27
+ update_freq=4
28
+ resnet_drop_path_rate=0.0
29
+ encoder_drop_path_rate=0.1
30
+ decoder_drop_path_rate=0.1
31
+ dropout=0.1
32
+ attention_dropout=0.0
33
+ max_src_length=80
34
+ max_tgt_length=20
35
+ num_bins=1000
36
+ patch_image_size=480
37
+ eval_cider_cached=${data_dir}/cider_cached_tokens/coco-valid-words.p
38
+ drop_worst_ratio=0.05 # modified from 0.2 for el
39
+ drop_best_ratio=0.05
40
+ drop_best_after=2500
41
+ log_end=0.75 # for el
42
+ for max_epoch in {2,}; do
43
+ echo "max_epoch "${max_epoch}
44
+ for warmup_ratio in {0.06,}; do
45
+ echo "warmup_ratio "${warmup_ratio}
46
+ for drop_worst_after in {2500,}; do
47
+ echo "drop_worst_after "${drop_worst_after}
48
+
49
+ log_file=${log_dir}/${max_epoch}"_"${warmup_ratio}"_dwdb"${drop_worst_after}_el${log_end}_".log"
50
+ save_path=${save_dir}/${max_epoch}"_"${warmup_ratio}"_dwdb"${drop_worst_after}_el${log_end}_
51
+ mkdir -p $save_path
52
+
53
+ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --master_port=${MASTER_PORT} ../../train.py \
54
+ $data \
55
+ --selected-cols=${selected_cols} \
56
+ --bpe-dir=${bpe_dir} \
57
+ --user-dir=${user_dir} \
58
+ --restore-file=${restore_file} \
59
+ --reset-optimizer --reset-dataloader --reset-meters \
60
+ --save-dir=${save_path} \
61
+ --task=${task} \
62
+ --arch=${arch} \
63
+ --criterion=${criterion} \
64
+ --label-smoothing=${label_smoothing} \
65
+ --batch-size=${batch_size} \
66
+ --update-freq=${update_freq} \
67
+ --encoder-normalize-before \
68
+ --decoder-normalize-before \
69
+ --share-decoder-input-output-embed \
70
+ --share-all-embeddings \
71
+ --layernorm-embedding \
72
+ --patch-layernorm-embedding \
73
+ --code-layernorm-embedding \
74
+ --resnet-drop-path-rate=${resnet_drop_path_rate} \
75
+ --encoder-drop-path-rate=${encoder_drop_path_rate} \
76
+ --decoder-drop-path-rate=${decoder_drop_path_rate} \
77
+ --dropout=${dropout} \
78
+ --attention-dropout=${attention_dropout} \
79
+ --weight-decay=0.01 --optimizer=adam --adam-betas="(0.9,0.999)" --adam-eps=1e-08 --clip-norm=1.0 \
80
+ --lr-scheduler=polynomial_decay --lr=${lr} \
81
+ --max-epoch=${max_epoch} --warmup-ratio=${warmup_ratio} \
82
+ --log-format=simple --log-interval=10 \
83
+ --fixed-validation-seed=7 \
84
+ --no-epoch-checkpoints --keep-best-checkpoints=1 \
85
+ --save-interval=1 --validate-interval=1 \
86
+ --save-interval-updates=500 --validate-interval-updates=500 \
87
+ --eval-cider \
88
+ --eval-cider-cached-tokens=${eval_cider_cached} \
89
+ --eval-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
90
+ --best-checkpoint-metric=cider --maximize-best-checkpoint-metric \
91
+ --max-src-length=${max_src_length} \
92
+ --max-tgt-length=${max_tgt_length} \
93
+ --find-unused-parameters \
94
+ --freeze-encoder-embedding \
95
+ --freeze-decoder-embedding \
96
+ --add-type-embedding \
97
+ --scale-attn \
98
+ --scale-fc \
99
+ --scale-heads \
100
+ --disable-entangle \
101
+ --num-bins=${num_bins} \
102
+ --patch-image-size=${patch_image_size} \
103
+ --drop-worst-ratio=${drop_worst_ratio} \
104
+ --drop-worst-after=${drop_worst_after} \
105
+ --log-end ${log_end} --drop-best-ratio ${drop_best_ratio} --drop-best-after ${drop_best_after} \
106
+ --fp16 \
107
+ --fp16-scale-window=512 \
108
+ --num-workers=0 > ${log_file} 2>&1
109
+ done
110
+ done
111
+ done
run_scripts/caption/train_caption_stage2.sh ADDED
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1052
6
+
7
+ log_dir=./stage2_logs
8
+ save_dir=./stage2_checkpoints
9
+ mkdir -p $log_dir $save_dir
10
+
11
+ bpe_dir=../../utils/BPE
12
+ user_dir=../../ofa_module
13
+
14
+ data_dir=../../dataset/caption_data
15
+ data=${data_dir}/caption_stage2_train.tsv,${data_dir}/caption_val.tsv
16
+ restore_file=../../checkpoints/caption_stage1_best.pt
17
+ selected_cols=1,4,2
18
+
19
+ task=caption
20
+ arch=ofa_large
21
+ criterion=scst_reward_criterion
22
+ label_smoothing=0.1
23
+ lr=1e-5
24
+ max_epoch=5
25
+ warmup_ratio=0.06
26
+ batch_size=2
27
+ update_freq=4
28
+ resnet_drop_path_rate=0.0
29
+ encoder_drop_path_rate=0.0
30
+ decoder_drop_path_rate=0.0
31
+ dropout=0.0
32
+ attention_dropout=0.0
33
+ max_src_length=80
34
+ max_tgt_length=20
35
+ num_bins=1000
36
+ patch_image_size=480
37
+ eval_cider_cached=${data_dir}/cider_cached_tokens/coco-valid-words.p
38
+ scst_cider_cached=${data_dir}/cider_cached_tokens/coco-train-words.p
39
+
40
+ for lr in {1e-5,}; do
41
+ echo "lr "${lr}
42
+ for max_epoch in {3,}; do
43
+ echo "max_epoch "${max_epoch}
44
+
45
+ log_file=${log_dir}/${lr}"_"${max_epoch}".log"
46
+ save_path=${save_dir}/${lr}"_"${max_epoch}
47
+ mkdir -p $save_path
48
+
49
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch --nproc_per_node=8 --master_port=${MASTER_PORT} ../../train.py \
50
+ $data \
51
+ --selected-cols=${selected_cols} \
52
+ --bpe-dir=${bpe_dir} \
53
+ --user-dir=${user_dir} \
54
+ --restore-file=${restore_file} \
55
+ --reset-optimizer --reset-dataloader --reset-meters \
56
+ --save-dir=${save_path} \
57
+ --task=${task} \
58
+ --arch=${arch} \
59
+ --criterion=${criterion} \
60
+ --batch-size=${batch_size} \
61
+ --update-freq=${update_freq} \
62
+ --encoder-normalize-before \
63
+ --decoder-normalize-before \
64
+ --share-decoder-input-output-embed \
65
+ --share-all-embeddings \
66
+ --layernorm-embedding \
67
+ --patch-layernorm-embedding \
68
+ --code-layernorm-embedding \
69
+ --resnet-drop-path-rate=${resnet_drop_path_rate} \
70
+ --encoder-drop-path-rate=${encoder_drop_path_rate} \
71
+ --decoder-drop-path-rate=${decoder_drop_path_rate} \
72
+ --dropout=${dropout} \
73
+ --attention-dropout=${attention_dropout} \
74
+ --weight-decay=0.01 --optimizer=adam --adam-betas="(0.9,0.999)" --adam-eps=1e-08 --clip-norm=1.0 \
75
+ --lr-scheduler=polynomial_decay --lr=${lr} --end-learning-rate=2e-7 \
76
+ --max-epoch=${max_epoch} --warmup-ratio=${warmup_ratio} \
77
+ --log-format=simple --log-interval=10 \
78
+ --fixed-validation-seed=7 \
79
+ --no-epoch-checkpoints --keep-best-checkpoints=1 \
80
+ --save-interval=1 --validate-interval=1 \
81
+ --save-interval-updates=500 --validate-interval-updates=500 \
82
+ --eval-cider \
83
+ --eval-cider-cached-tokens=${eval_cider_cached} \
84
+ --eval-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
85
+ --best-checkpoint-metric=cider --maximize-best-checkpoint-metric \
86
+ --max-src-length=${max_src_length} \
87
+ --max-tgt-length=${max_tgt_length} \
88
+ --find-unused-parameters \
89
+ --freeze-encoder-embedding \
90
+ --freeze-decoder-embedding \
91
+ --add-type-embedding \
92
+ --scale-attn \
93
+ --scale-fc \
94
+ --scale-heads \
95
+ --disable-entangle \
96
+ --num-bins=${num_bins} \
97
+ --patch-image-size=${patch_image_size} \
98
+ --scst \
99
+ --scst-cider-cached-tokens=${scst_cider_cached} \
100
+ --scst-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
101
+ --memory-efficient-fp16 \
102
+ --fp16-scale-window=512 \
103
+ --num-workers=0 > ${log_file} 2>&1
104
+ done
105
+ done
run_scripts/caption/train_caption_stage2_base.sh ADDED
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # The port for communication. Note that if you want to run multiple tasks on the same machine,
4
+ # you need to specify different port numbers.
5
+ export MASTER_PORT=1062
6
+
7
+ log_dir=./stage2_logs
8
+ save_dir=./stage2_checkpoints
9
+ mkdir -p $log_dir $save_dir
10
+
11
+ bpe_dir=../../utils/BPE
12
+ user_dir=../../ofa_module
13
+
14
+ data_dir=../../dataset/caption_data
15
+ data=${data_dir}/caption_stage2_train.tsv,${data_dir}/caption_val.tsv
16
+ restore_file=../../checkpoints/caption_stage1_base_best.pt
17
+ selected_cols=1,4,2
18
+
19
+ task=caption
20
+ arch=ofa_base
21
+ criterion=scst_reward_criterion
22
+ label_smoothing=0.1
23
+ lr=1e-5
24
+ max_epoch=5
25
+ warmup_ratio=0.06
26
+ batch_size=2
27
+ update_freq=4
28
+ resnet_drop_path_rate=0.0
29
+ encoder_drop_path_rate=0.0
30
+ decoder_drop_path_rate=0.0
31
+ dropout=0.0
32
+ attention_dropout=0.0
33
+ max_src_length=80
34
+ max_tgt_length=20
35
+ num_bins=1000
36
+ patch_image_size=480
37
+ eval_cider_cached=${data_dir}/cider_cached_tokens/coco-valid-words.p
38
+ scst_cider_cached=${data_dir}/cider_cached_tokens/coco-train-words.p
39
+
40
+ for lr in {1e-5,}; do
41
+ echo "lr "${lr}
42
+ for max_epoch in {3,}; do
43
+ echo "max_epoch "${max_epoch}
44
+
45
+ log_file=${log_dir}/${lr}"_"${max_epoch}".log"
46
+ save_path=${save_dir}/${lr}"_"${max_epoch}
47
+ mkdir -p $save_path
48
+
49
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch --nproc_per_node=8 --master_port=${MASTER_PORT} ../../train.py \
50
+ $data \
51
+ --selected-cols=${selected_cols} \
52
+ --bpe-dir=${bpe_dir} \
53
+ --user-dir=${user_dir} \
54
+ --restore-file=${restore_file} \
55
+ --reset-optimizer --reset-dataloader --reset-meters \
56
+ --save-dir=${save_path} \
57
+ --task=${task} \
58
+ --arch=${arch} \
59
+ --criterion=${criterion} \
60
+ --batch-size=${batch_size} \
61
+ --update-freq=${update_freq} \
62
+ --encoder-normalize-before \
63
+ --decoder-normalize-before \
64
+ --share-decoder-input-output-embed \
65
+ --share-all-embeddings \
66
+ --layernorm-embedding \
67
+ --patch-layernorm-embedding \
68
+ --code-layernorm-embedding \
69
+ --resnet-drop-path-rate=${resnet_drop_path_rate} \
70
+ --encoder-drop-path-rate=${encoder_drop_path_rate} \
71
+ --decoder-drop-path-rate=${decoder_drop_path_rate} \
72
+ --dropout=${dropout} \
73
+ --attention-dropout=${attention_dropout} \
74
+ --weight-decay=0.01 --optimizer=adam --adam-betas="(0.9,0.999)" --adam-eps=1e-08 --clip-norm=1.0 \
75
+ --lr-scheduler=polynomial_decay --lr=${lr} \
76
+ --max-epoch=${max_epoch} --warmup-ratio=${warmup_ratio} \
77
+ --log-format=simple --log-interval=10 \
78
+ --fixed-validation-seed=7 \
79
+ --no-epoch-checkpoints --keep-best-checkpoints=1 \
80
+ --save-interval=1 --validate-interval=1 \
81
+ --save-interval-updates=500 --validate-interval-updates=500 \
82
+ --eval-cider \
83
+ --eval-cider-cached-tokens=${eval_cider_cached} \
84
+ --eval-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
85
+ --best-checkpoint-metric=cider --maximize-best-checkpoint-metric \
86
+ --max-src-length=${max_src_length} \
87
+ --max-tgt-length=${max_tgt_length} \
88
+ --find-unused-parameters \
89
+ --freeze-encoder-embedding \
90
+ --freeze-decoder-embedding \
91
+ --add-type-embedding \
92
+ --scale-attn \
93
+ --scale-fc \
94
+ --scale-heads \
95
+ --disable-entangle \
96
+ --num-bins=${num_bins} \
97
+ --patch-image-size=${patch_image_size} \
98
+ --scst \
99
+ --scst-cider-cached-tokens=${scst_cider_cached} \
100
+ --scst-args='{"beam":5,"max_len_b":16,"no_repeat_ngram_size":3}' \
101
+ --memory-efficient-fp16 \
102
+ --fp16-scale-window=512 \
103
+ --num-workers=0 > ${log_file} 2>&1
104
+ done
105
+ done
tasks/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .cv_tasks import *
+ from .mm_tasks import *
+ from .nlg_tasks import *
+ from .nlu_tasks import *
+ from .pretrain_tasks import *
+ from .ofa_task import OFATask
tasks/mm_tasks/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .caption import CaptionTask
+ from .image_gen import ImageGenTask
+ from .refcoco import RefcocoTask
+ from .snli_ve import SnliVeTask
+ from .vqa_gen import VqaGenTask
tasks/mm_tasks/caption.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from dataclasses import dataclass, field
7
+ import json
8
+ import logging
9
+ from typing import Optional
10
+ from argparse import Namespace
11
+ from itertools import zip_longest
12
+ from collections import OrderedDict
13
+
14
+ import numpy as np
15
+ import sacrebleu
16
+ import string
17
+ from fairseq import metrics, utils
18
+ from fairseq.tasks import register_task
19
+
20
+ from tasks.ofa_task import OFATask, OFAConfig
21
+ from data.mm_data.caption_dataset import CaptionDataset
22
+ from data.file_dataset import FileDataset
23
+ from utils.cider.pyciderevalcap.ciderD.ciderD import CiderD
24
+
25
+ EVAL_BLEU_ORDER = 4
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ @dataclass
31
+ class CaptionConfig(OFAConfig):
32
+ eval_bleu: bool = field(
33
+ default=False, metadata={"help": "evaluation with BLEU scores"}
34
+ )
35
+ eval_cider: bool = field(
36
+ default=False, metadata={"help": "evaluation with CIDEr scores"}
37
+ )
38
+ eval_args: Optional[str] = field(
39
+ default='{}',
40
+ metadata={
41
+ "help": 'generation args for BLEU or CIDEr scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
42
+ },
43
+ )
44
+ eval_print_samples: bool = field(
45
+ default=False, metadata={"help": "print sample generations during validation"}
46
+ )
47
+ eval_cider_cached_tokens: Optional[str] = field(
48
+ default=None,
49
+ metadata={"help": "path to cached cPickle file used to calculate CIDEr scores"},
50
+ )
51
+
52
+ scst: bool = field(
53
+ default=False, metadata={"help": "Self-critical sequence training"}
54
+ )
55
+ scst_args: str = field(
56
+ default='{}',
57
+ metadata={
58
+ "help": 'generation args for Self-critical sequence training, as JSON string'
59
+ },
60
+ )
61
+
62
+
63
+ @register_task("caption", dataclass=CaptionConfig)
64
+ class CaptionTask(OFATask):
65
+ def __init__(self, cfg: CaptionConfig, src_dict, tgt_dict):
66
+ super().__init__(cfg, src_dict, tgt_dict)
67
+
68
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
69
+ paths = self.cfg.data.split(',')
70
+ assert len(paths) > 0
71
+
72
+ if split == 'train':
73
+ file_path = paths[(epoch - 1) % (len(paths) - 1)]
74
+ else:
75
+ file_path = paths[-1]
76
+ dataset = FileDataset(file_path, self.cfg.selected_cols)
77
+
78
+ self.datasets[split] = CaptionDataset(
79
+ split,
80
+ dataset,
81
+ self.bpe,
82
+ self.src_dict,
83
+ self.tgt_dict,
84
+ max_src_length=self.cfg.max_src_length,
85
+ max_tgt_length=self.cfg.max_tgt_length,
86
+ patch_image_size=self.cfg.patch_image_size,
87
+ imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
88
+ scst=getattr(self.cfg, 'scst', False)
89
+ )
90
+
91
+ def build_model(self, cfg):
92
+ model = super().build_model(cfg)
93
+ if self.cfg.eval_bleu or self.cfg.eval_cider:
94
+ gen_args = json.loads(self.cfg.eval_args)
95
+ self.sequence_generator = self.build_generator(
96
+ [model], Namespace(**gen_args)
97
+ )
98
+ if self.cfg.eval_cider:
99
+ self.CiderD_scorer = CiderD(df=self.cfg.eval_cider_cached_tokens)
100
+ if self.cfg.scst:
101
+ scst_args = json.loads(self.cfg.scst_args)
102
+ self.scst_generator = self.build_generator(
103
+ [model], Namespace(**scst_args)
104
+ )
105
+
106
+ return model
107
+
108
+ def _calculate_cider_scores(self, gen_res, gt_res):
109
+ '''
110
+ gen_res: generated captions, list of str
111
+ gt_res: ground truth captions, list of list of str,
112
+ of the same length as gen_res.
113
+ gen_res[i] corresponds to gt_res[i].
114
+ Each image can have multiple ground truth captions
115
+ '''
116
+ gen_res_size = len(gen_res)
117
+
118
+ res = OrderedDict()
119
+ for i in range(gen_res_size):
120
+ res[i] = [gen_res[i].strip()]
121
+
122
+ gts = OrderedDict()
123
+ gt_res_ = [
124
+ [gt_res[i][j].strip() for j in range(len(gt_res[i]))]
125
+ for i in range(len(gt_res))
126
+ ]
127
+ for i in range(gen_res_size):
128
+ gts[i] = gt_res_[i]
129
+
130
+ res_ = [{'image_id': i, 'caption': res[i]} for i in range(len(res))]
131
+ _, scores = self.CiderD_scorer.compute_score(gts, res_)
132
+ return scores
133
+
134
+ def valid_step(self, sample, model, criterion):
135
+ loss, sample_size, logging_output = criterion(model, sample)
136
+
137
+ model.eval()
138
+ if self.cfg.eval_bleu or self.cfg.eval_cider:
139
+ hyps, refs = self._inference(self.sequence_generator, sample, model)
140
+ if self.cfg.eval_bleu:
141
+ if getattr(self.cfg, "eval_tokenized_bleu", False):  # CaptionConfig does not define this flag
142
+ bleu = sacrebleu.corpus_bleu(hyps, list(zip_longest(*refs)), tokenize="none")
143
+ else:
144
+ bleu = sacrebleu.corpus_bleu(hyps, list(zip_longest(*refs)))
145
+ logging_output["_bleu_sys_len"] = bleu.sys_len
146
+ logging_output["_bleu_ref_len"] = bleu.ref_len
147
+ # we split counts into separate entries so that they can be
148
+ # summed efficiently across workers using fast-stat-sync
149
+ assert len(bleu.counts) == EVAL_BLEU_ORDER
150
+ for i in range(EVAL_BLEU_ORDER):
151
+ logging_output["_bleu_counts_" + str(i)] = bleu.counts[i]
152
+ logging_output["_bleu_totals_" + str(i)] = bleu.totals[i]
153
+ if self.cfg.eval_cider:
154
+ scores = self._calculate_cider_scores(hyps, refs)
155
+ logging_output["_cider_score_sum"] = scores.sum()
156
+ logging_output["_cider_cnt"] = scores.size
157
+
158
+ return loss, sample_size, logging_output
159
+
160
+ def reduce_metrics(self, logging_outputs, criterion):
161
+ super().reduce_metrics(logging_outputs, criterion)
162
+
163
+ def sum_logs(key):
164
+ import torch
165
+ result = sum(log.get(key, 0) for log in logging_outputs)
166
+ if torch.is_tensor(result):
167
+ result = result.cpu()
168
+ return result
169
+
170
+ if self.cfg.eval_bleu:
171
+ counts, totals = [], []
172
+ for i in range(EVAL_BLEU_ORDER):
173
+ counts.append(sum_logs("_bleu_counts_" + str(i)))
174
+ totals.append(sum_logs("_bleu_totals_" + str(i)))
175
+
176
+ if max(totals) > 0:
177
+ # log counts as numpy arrays -- log_scalar will sum them correctly
178
+ metrics.log_scalar("_bleu_counts", np.array(counts))
179
+ metrics.log_scalar("_bleu_totals", np.array(totals))
180
+ metrics.log_scalar("_bleu_sys_len", sum_logs("_bleu_sys_len"))
181
+ metrics.log_scalar("_bleu_ref_len", sum_logs("_bleu_ref_len"))
182
+
183
+ def compute_bleu(meters):
184
+ import inspect
185
+ import sacrebleu
186
+
187
+ fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0]
188
+ if "smooth_method" in fn_sig:
189
+ smooth = {"smooth_method": "exp"}
190
+ else:
191
+ smooth = {"smooth": "exp"}
192
+ bleu = sacrebleu.compute_bleu(
193
+ correct=meters["_bleu_counts"].sum,
194
+ total=meters["_bleu_totals"].sum,
195
+ sys_len=meters["_bleu_sys_len"].sum,
196
+ ref_len=meters["_bleu_ref_len"].sum,
197
+ **smooth
198
+ )
199
+ return round(bleu.score, 2)
200
+
201
+ metrics.log_derived("bleu", compute_bleu)
202
+
203
+ if self.cfg.eval_cider:
204
+ def compute_cider(meters):
205
+ cider = meters["_cider_score_sum"].sum / meters["_cider_cnt"].sum
206
+ cider = cider if isinstance(cider, float) else cider.item()
207
+ return round(cider, 3)
208
+
209
+ if sum_logs("_cider_cnt") > 0:
210
+ metrics.log_scalar("_cider_score_sum", sum_logs("_cider_score_sum"))
211
+ metrics.log_scalar("_cider_cnt", sum_logs("_cider_cnt"))
212
+ metrics.log_derived("cider", compute_cider)
213
+
214
+ def _inference(self, generator, sample, model):
215
+
216
+ def decode(toks, escape_unk=False):
217
+ s = self.tgt_dict.string(
218
+ toks.int().cpu(),
219
+ # The default unknown string in fairseq is `<unk>`, but
220
+ # this is tokenized by sacrebleu as `< unk >`, inflating
221
+ # BLEU scores. Instead, we use a somewhat more verbose
222
+ # alternative that is unlikely to appear in the real
223
+ # reference, but doesn't get split into multiple tokens.
224
+ unk_string=("UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"),
225
+ )
226
+ if self.bpe:
227
+ s = self.bpe.decode(s)
228
+ return s
229
+
230
+ gen_out = self.inference_step(generator, [model], sample)
231
+ hyps, refs = [], []
232
+ transtab = str.maketrans({key: None for key in string.punctuation})
233
+ for i in range(len(gen_out)):
234
+ decode_tokens = decode(gen_out[i][0]["tokens"])
235
+ hyps.append(decode_tokens.translate(transtab).strip())
236
+ refs.append(
237
+ [
238
+ sent.translate(transtab).strip()
239
+ for sent in decode(
240
+ utils.strip_pad(sample["target"][i], self.tgt_dict.pad()),
241
+ escape_unk=True, # don't count <unk> as matches to the hypo
242
+ ).split('&&')
243
+ ]
244
+ )
245
+ if self.cfg.eval_print_samples:
246
+ logger.info("example hypothesis: " + hyps[0])
247
+ logger.info("example reference: " + ' && '.join(refs[0]))
248
+
249
+ return hyps, refs
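Note: _calculate_cider_scores above packs one generated caption per image and a list of reference captions per image into the OrderedDict / list-of-dict layout that CiderD.compute_score expects. A minimal sketch with made-up captions; the scorer construction is commented out because it needs the cached n-gram statistics file referenced by --eval-cider-cached-tokens:

    from collections import OrderedDict

    gen_res = ["a dog runs on the grass"]                                # one hypothesis per image
    gt_res = [["a dog running on grass", "a brown dog plays outside"]]   # references per image

    res_ = [{"image_id": 0, "caption": [gen_res[0].strip()]}]
    gts = OrderedDict({0: [ref.strip() for ref in gt_res[0]]})
    # scorer = CiderD(df="coco-valid-words.p")                # path is an assumption
    # corpus_score, per_image_scores = scorer.compute_score(gts, res_)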
tasks/mm_tasks/image_gen.py ADDED
@@ -0,0 +1,329 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from dataclasses import dataclass, field
7
+ import json
8
+ import logging
9
+ import os
10
+ import math
11
+ import base64
12
+ from typing import Optional
13
+ from argparse import Namespace
14
+ from omegaconf import DictConfig, OmegaConf
15
+ from torchvision import transforms
16
+ from PIL import Image
17
+ from io import BytesIO
18
+
19
+ import torch
20
+ import numpy as np
21
+ from fairseq import metrics
22
+ from fairseq.tasks import register_task
23
+ from fairseq.dataclass import ChoiceEnum
24
+
25
+ from models import search, clip
26
+ from models.taming.models.vqgan import GumbelVQ
27
+ from data.mm_data.image_gen_dataset import ImageGenDataset
28
+ from data.file_dataset import FileDataset
29
+
30
+ from tasks.ofa_task import OFATask, OFAConfig
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def custom_to_pil(x):
36
+ x = x.detach().cpu()
37
+ x = torch.clamp(x, -1., 1.)
38
+ x = (x + 1.) / 2.
39
+ x = x.permute(1, 2, 0).numpy()
40
+ x = (255 * x).astype(np.uint8)
41
+ x = Image.fromarray(x)
42
+ if not x.mode == "RGB":
43
+ x = x.convert("RGB")
44
+ return x
45
+
46
+
47
+ EVAL_CLIP_METHOD = ChoiceEnum(["ii_sim", "ti_sim"])
48
+
49
+ @dataclass
50
+ class ImageGenConfig(OFAConfig):
51
+ sampling_times: int = field(
52
+ default=1, metadata={"help": "sample times"}
53
+ )
54
+
55
+ code_image_size: int = field(
56
+ default=256, metadata={"help": "code image size"}
57
+ )
58
+
59
+ # options for reporting CLIP score during validation
60
+ eval_clip_method: EVAL_CLIP_METHOD = field(
61
+ default='ti_sim',
62
+ metadata={
63
+ "help": "evaluation with CLIP scores. ii_sim means Similarity between generated Images and ref Images, ti_sim means Similarity between generated Images and input Text"}
64
+ )
65
+
66
+ eval_args: Optional[str] = field(
67
+ default='{}',
68
+ metadata={
69
+ "help": 'generation args for clip scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
70
+ },
71
+ )
72
+
73
+ scst: bool = field(
74
+ default=False, metadata={"help": "Self-critical sequence training"}
75
+ )
76
+ scst_args: str = field(
77
+ default='{}',
78
+ metadata={
79
+ "help": 'generation args for Self-critical sequence training, as JSON string'
80
+ },
81
+ )
82
+
83
+ vqgan_model_path: Optional[str] = field(
84
+ default=None,
85
+ metadata={"help": "path of vqgan model"}
86
+ )
87
+ vqgan_config_path: Optional[str] = field(
88
+ default=None,
89
+ metadata={"help": "path of vqgan config"}
90
+ )
91
+ clip_model_path: Optional[str] = field(
92
+ default=None,
93
+ metadata={"help": "clip model path"}
94
+ )
95
+ gen_images_path: str = field(
96
+ default='', metadata={"help": "where to store generated images during evaluation. Don't dump images if empty."}
97
+ )
98
+
99
+
100
+ @register_task("image_gen", dataclass=ImageGenConfig)
101
+ class ImageGenTask(OFATask):
102
+ def __init__(self, cfg: ImageGenConfig, src_dict, tgt_dict):
103
+ super().__init__(cfg, src_dict, tgt_dict)
104
+
105
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
106
+ paths = self.cfg.data.split(',')
107
+ assert len(paths) > 0
108
+
109
+ if split == 'train':
110
+ file_path = paths[(epoch - 1) % (len(paths) - 1)]
111
+ else:
112
+ file_path = paths[-1]
113
+ dataset = FileDataset(file_path, self.cfg.selected_cols)
114
+
115
+ self.datasets[split] = ImageGenDataset(
116
+ split,
117
+ dataset,
118
+ self.bpe,
119
+ self.src_dict,
120
+ self.tgt_dict,
121
+ max_src_length=self.cfg.max_src_length,
122
+ code_dict_size=self.cfg.code_dict_size,
123
+ code_image_size=self.cfg.code_image_size
124
+ )
125
+
126
+ def build_model(self, cfg):
127
+ model = super().build_model(cfg)
128
+
129
+ device = torch.cuda.current_device()
130
+ clip_model, clip_preprocess = clip.load(self.cfg.clip_model_path, device=device)
131
+ self.clip_model = clip_model
132
+ self.clip_preprocess = clip_preprocess
133
+ self.clip_model.to(device)
134
+ self.clip_model.eval()
135
+
136
+ vqgan_config = OmegaConf.load(self.cfg.vqgan_config_path)
137
+ vqgan = GumbelVQ(**vqgan_config.model.params)
138
+ sd = torch.load(self.cfg.vqgan_model_path, map_location="cpu")["state_dict"]
139
+ missing, unexpected = vqgan.load_state_dict(sd, strict=False)
140
+ for k, v in vqgan.named_parameters():
141
+ v.requires_grad = False
142
+ self.image_tokenizer = vqgan
143
+ self.image_tokenizer.to(device)
144
+ self.image_tokenizer.eval()
145
+
146
+ gen_args = json.loads(self.cfg.eval_args)
147
+ self.sequence_generator = self.build_generator(
148
+ [model], Namespace(**gen_args)
149
+ )
150
+ if self.cfg.scst:
151
+ scst_args = json.loads(self.cfg.scst_args)
152
+ self.scst_generator = self.build_generator(
153
+ [model], Namespace(**scst_args)
154
+ )
155
+
156
+ return model
157
+
158
+ def build_generator(
159
+ self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
160
+ ):
161
+ """
162
+ Build a :class:`~fairseq.SequenceGenerator` instance for this
163
+ task.
164
+
165
+ Args:
166
+ models (List[~fairseq.models.FairseqModel]): ensemble of models
167
+ args (fairseq.dataclass.configs.GenerationConfig):
168
+ configuration object (dataclass) for generation
169
+ extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass
170
+ through to SequenceGenerator
171
+ prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]):
172
+ If provided, this function constrains the beam search to
173
+ allowed tokens only at each step. The provided function
174
+ should take 2 arguments: the batch ID (`batch_id: int`)
175
+ and a unidimensional tensor of token ids (`inputs_ids:
176
+ torch.Tensor`). It has to return a `List[int]` with the
177
+ allowed tokens for the next generation step conditioned
178
+ on the previously generated tokens (`inputs_ids`) and
179
+ the batch ID (`batch_id`). This argument is useful for
180
+ constrained generation conditioned on the prefix, as
181
+ described in "Autoregressive Entity Retrieval"
182
+ (https://arxiv.org/abs/2010.00904) and
183
+ https://github.com/facebookresearch/GENRE.
184
+ """
185
+ from models.sequence_generator import SequenceGenerator
186
+
187
+ # Choose search strategy. Defaults to Sampling.
188
+ self.sampling_times = self.cfg.sampling_times
189
+ sampling = True # we have to use sampling instead of beam search in image generation task
190
+ sampling_topk = getattr(args, "sampling_topk", -1)
191
+ sampling_topp = getattr(args, "sampling_topp", -1.0)
192
+
193
+ assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
194
+ assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"
195
+
196
+ search_strategy = search.Sampling(
197
+ self.target_dictionary, sampling_topk, sampling_topp
198
+ )
199
+ extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}
200
+
201
+ return SequenceGenerator(
202
+ models,
203
+ self.target_dictionary,
204
+ beam_size=getattr(args, "beam", 5),
205
+ max_len_a=getattr(args, "max_len_a", 0),
206
+ max_len_b=getattr(args, "max_len_b", 200),
207
+ min_len=getattr(args, "min_len", 1),
208
+ normalize_scores=(not getattr(args, "unnormalized", False)),
209
+ len_penalty=getattr(args, "lenpen", 1),
210
+ unk_penalty=getattr(args, "unkpen", 0),
211
+ temperature=getattr(args, "temperature", 1.0),
212
+ match_source_len=getattr(args, "match_source_len", False),
213
+ no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
214
+ search_strategy=search_strategy,
215
+ constraint_range=self.cfg.constraint_range,
216
+ gen_code=True,
217
+ **extra_gen_cls_kwargs,
218
+ )
219
+
220
+ def compute_ref_image_similarity(self, hyps, ref, device):
221
+ hyp_images = torch.stack(
222
+ [self.clip_preprocess(hyp_image) for hyp_image in hyps], dim=0
223
+ ).to(device)
224
+
225
+ ref_images = self.clip_preprocess(ref).unsqueeze(0).to(device)
226
+ with torch.no_grad():
227
+ hyp_image_features = self.clip_model.encode_image(hyp_images)
228
+ ref_image_features = self.clip_model.encode_image(ref_images)
229
+ hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True)
230
+ ref_image_features /= ref_image_features.norm(dim=-1, keepdim=True)
231
+ similarity = hyp_image_features @ ref_image_features.T
232
+ # scores.append(similarity.max().item())
233
+ sorted_score, indices = torch.sort(similarity.view(-1), descending=True)
234
+ return sorted_score, indices
235
+
236
+ def compute_text_similarity(self, hyps, text, device):
237
+ hyp_images = torch.stack(
238
+ [self.clip_preprocess(hyp_image) for hyp_image in hyps], dim=0
239
+ ).to(device)
240
+
241
+ clip_input = clip.tokenize([text]).to(device)
242
+ with torch.no_grad():
243
+ hyp_image_features = self.clip_model.encode_image(hyp_images)
244
+ hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True)
245
+ text_features = self.clip_model.encode_text(clip_input)
246
+ text_features /= text_features.norm(dim=-1, keepdim=True)
247
+ ti_similarity = hyp_image_features @ text_features.T
248
+ sorted_score, indices = torch.sort(ti_similarity.view(-1), descending=True)
249
+ return sorted_score, indices
250
+
251
+ def valid_step(self, sample, model, criterion):
252
+ loss, sample_size, logging_output = criterion(model, sample)
253
+
254
+ model.eval()
255
+ device = sample['target'].device
256
+
257
+ hyps, ref = self.inference_image(self.sequence_generator, sample, [model])
258
+ scores = []
259
+
260
+ tokens = sample['net_input']['src_tokens'][0].view(-1).tolist()
261
+ caption = self.bpe.decode(self.tgt_dict.string([token for token in tokens if token >= 4]))[
262
+ 38:].replace('/', '')
263
+ if self.cfg.eval_clip_method == 'ii_sim':
264
+ similarity_score, indices = self.compute_ref_image_similarity(hyps, ref, device)
265
+ elif self.cfg.eval_clip_method == 'ti_sim':
266
+ similarity_score, indices = self.compute_text_similarity(hyps, caption, device)
267
+ else:
268
+ raise ValueError("unsupported eval method.")
269
+
270
+ scores.append(similarity_score.max().item())
271
+ sorted_hyps = [hyps[indice] for indice in indices]
272
+
273
+ if self.cfg.gen_images_path:
274
+ caption_tokens = sample['net_input']['src_tokens'][0].view(-1).tolist()
275
+ caption = self.bpe.decode(self.tgt_dict.string([token for token in caption_tokens if token >= 4]))[
276
+ 38:].replace('/', '')
277
+ self.dump_images(sorted_hyps, text=caption, path=os.path.join(self.cfg.gen_images_path, 'all_results'))
278
+ self.dump_images(sorted_hyps, text=caption, path=os.path.join(self.cfg.gen_images_path, 'top1'), topk=1)
279
+
280
+ logging_output["_score_sum"] = sum(scores)
281
+ logging_output["_score_cnt"] = len(scores)
282
+
283
+ return loss, sample_size, logging_output
284
+
285
+ def reduce_metrics(self, logging_outputs, criterion):
286
+ super().reduce_metrics(logging_outputs, criterion)
287
+
288
+ def sum_logs(key):
289
+ import torch
290
+ result = sum(log.get(key, 0) for log in logging_outputs)
291
+ if torch.is_tensor(result):
292
+ result = result.cpu()
293
+ return result
294
+
295
+ def compute_score(meters):
296
+ score = meters["_score_sum"].sum / meters["_score_cnt"].sum
297
+ score = score if isinstance(score, float) else score.item()
298
+ return round(score, 3)
299
+
300
+ if sum_logs("_score_cnt") > 0:
301
+ metrics.log_scalar("_score_sum", sum_logs("_score_sum"))
302
+ metrics.log_scalar("_score_cnt", sum_logs("_score_cnt"))
303
+ metrics.log_derived("score", compute_score)
304
+
305
+ def inference_image(self, generator, sample, models):
306
+ hyps, ref = [], None
307
+ for j in range(self.sampling_times):
308
+ gen_out = self.inference_step(generator, models, sample)
309
+ for i in range(len(gen_out)):
310
+ with torch.no_grad():
311
+ tokens = torch.stack([item['tokens'][:-1] for item in gen_out[i]], dim=0)
312
+ tokens += -len(self.src_dict) + self.cfg.code_dict_size + self.cfg.num_bins
313
+ images = self.image_tokenizer.decode_code(
314
+ tokens.view(-1, self.cfg.code_image_size // 8, self.cfg.code_image_size // 8)
315
+ )
316
+ images = [custom_to_pil(image) for image in images]
317
+ hyps += images
318
+ if 'code_images' in sample:
319
+ ref = Image.open(BytesIO(base64.urlsafe_b64decode(sample['code_images'][0]))).convert('RGB')
320
+
321
+ return hyps, ref
322
+
323
+ def dump_images(self, images, text, path, topk=None):
324
+ os.makedirs(path, exist_ok=True)
325
+ if topk:
326
+ images = images[:topk]
327
+ for j, image in enumerate(images):
328
+ save_path = os.path.join(path, f'{text}_{j}.png')
329
+ image.save(save_path)
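Note: with the default ti_sim setting, compute_text_similarity above ranks the sampled images by the similarity between CLIP image embeddings and the CLIP embedding of the input text; since both feature sets are L2-normalized, the matrix product gives cosine similarities. A standalone sketch of that ranking step, with random tensors standing in for CLIP features:

    import torch

    hyp_image_features = torch.randn(8, 512)   # stand-in for CLIP features of 8 sampled images
    text_features = torch.randn(1, 512)        # stand-in for the CLIP text feature of the caption

    hyp_image_features = hyp_image_features / hyp_image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    ti_similarity = hyp_image_features @ text_features.T                  # (8, 1) cosine similarities
    sorted_score, indices = torch.sort(ti_similarity.view(-1), descending=True)
    # indices[0] is the best candidate; the task saves it under gen_images_path/top1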
tasks/mm_tasks/refcoco.py ADDED
@@ -0,0 +1,160 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from dataclasses import dataclass, field
7
+ import json
8
+ import logging
9
+ from typing import Optional
10
+ from argparse import Namespace
11
+
12
+ import torch
13
+ from fairseq import metrics
14
+ from fairseq.tasks import register_task
15
+
16
+ from tasks.ofa_task import OFATask, OFAConfig
17
+ from data.mm_data.refcoco_dataset import RefcocoDataset
18
+ from data.file_dataset import FileDataset
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @dataclass
24
+ class RefcocoConfig(OFAConfig):
25
+ eval_acc: bool = field(
26
+ default=False, metadata={"help": "evaluation with accuracy"}
27
+ )
28
+ eval_args: Optional[str] = field(
29
+ default='{}',
30
+ metadata={
31
+ "help": 'generation args, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
32
+ },
33
+ )
34
+ eval_print_samples: bool = field(
35
+ default=False, metadata={"help": "print sample generations during validation"}
36
+ )
37
+
38
+ max_image_size: int = field(
39
+ default=512, metadata={"help": "max image size for normalization"}
40
+ )
41
+ scst: bool = field(
42
+ default=False, metadata={"help": "Self-critical sequence training"}
43
+ )
44
+ scst_args: str = field(
45
+ default='{}',
46
+ metadata={
47
+ "help": 'generation args for Self-critical sequence training, as JSON string'
48
+ },
49
+ )
50
+
51
+
52
+ @register_task("refcoco", dataclass=RefcocoConfig)
53
+ class RefcocoTask(OFATask):
54
+ def __init__(self, cfg: RefcocoConfig, src_dict, tgt_dict):
55
+ super().__init__(cfg, src_dict, tgt_dict)
56
+
57
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
58
+ paths = self.cfg.data.split(',')
59
+ assert len(paths) > 0
60
+
61
+ if split == 'train':
62
+ file_path = paths[(epoch - 1) % (len(paths) - 1)]
63
+ else:
64
+ file_path = paths[-1]
65
+ dataset = FileDataset(file_path, self.cfg.selected_cols)
66
+
67
+ self.datasets[split] = RefcocoDataset(
68
+ split,
69
+ dataset,
70
+ self.bpe,
71
+ self.src_dict,
72
+ self.tgt_dict,
73
+ max_src_length=self.cfg.max_src_length,
74
+ max_tgt_length=self.cfg.max_tgt_length,
75
+ patch_image_size=self.cfg.patch_image_size,
76
+ imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
77
+ num_bins=self.cfg.num_bins,
78
+ max_image_size=self.cfg.max_image_size
79
+ )
80
+
81
+ def build_model(self, cfg):
82
+ model = super().build_model(cfg)
83
+ if self.cfg.eval_acc:
84
+ gen_args = json.loads(self.cfg.eval_args)
85
+ self.sequence_generator = self.build_generator(
86
+ [model], Namespace(**gen_args)
87
+ )
88
+ if self.cfg.scst:
89
+ scst_args = json.loads(self.cfg.scst_args)
90
+ self.scst_generator = self.build_generator(
91
+ [model], Namespace(**scst_args)
92
+ )
93
+
94
+ return model
95
+
96
+ def _calculate_ap_score(self, hyps, refs, thresh=0.5):
97
+ interacts = torch.cat(
98
+ [torch.where(hyps[:, :2] < refs[:, :2], refs[:, :2], hyps[:, :2]),
99
+ torch.where(hyps[:, 2:] < refs[:, 2:], hyps[:, 2:], refs[:, 2:])],
100
+ dim=1
101
+ )
102
+ area_predictions = (hyps[:, 2] - hyps[:, 0]) * (hyps[:, 3] - hyps[:, 1])
103
+ area_targets = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1])
104
+ interacts_w = interacts[:, 2] - interacts[:, 0]
105
+ interacts_h = interacts[:, 3] - interacts[:, 1]
106
+ area_interacts = interacts_w * interacts_h
107
+ ious = area_interacts / (area_predictions + area_targets - area_interacts + 1e-6)
108
+ return ((ious >= thresh) & (interacts_w > 0) & (interacts_h > 0)).float()
109
+
110
+ def valid_step(self, sample, model, criterion):
111
+ loss, sample_size, logging_output = criterion(model, sample)
112
+
113
+ model.eval()
114
+ if self.cfg.eval_acc:
115
+ hyps, refs = self._inference(self.sequence_generator, sample, model)
116
+ hyps = hyps / (self.cfg.num_bins - 1) * self.cfg.max_image_size
117
+ refs = refs / (self.cfg.num_bins - 1) * self.cfg.max_image_size
118
+ hyps[:, ::2] /= sample['w_resize_ratios'].unsqueeze(1)
119
+ hyps[:, 1::2] /= sample['h_resize_ratios'].unsqueeze(1)
120
+ refs[:, ::2] /= sample['w_resize_ratios'].unsqueeze(1)
121
+ refs[:, 1::2] /= sample['h_resize_ratios'].unsqueeze(1)
122
+
123
+ # scores = self._calculate_ap_score(hyps, refs)
124
+ scores = self._calculate_ap_score(hyps, sample['region_coords'].float())
125
+ logging_output["_score_sum"] = scores.sum().item()
126
+ logging_output["_score_cnt"] = scores.size(0)
127
+
128
+ return loss, sample_size, logging_output
129
+
130
+ def reduce_metrics(self, logging_outputs, criterion):
131
+ super().reduce_metrics(logging_outputs, criterion)
132
+
133
+ def sum_logs(key):
134
+ import torch
135
+ result = sum(log.get(key, 0) for log in logging_outputs)
136
+ if torch.is_tensor(result):
137
+ result = result.cpu()
138
+ return result
139
+
140
+ def compute_score(meters):
141
+ score = meters["_score_sum"].sum / meters["_score_cnt"].sum
142
+ score = score if isinstance(score, float) else score.item()
143
+ return round(score, 4)
144
+
145
+ if sum_logs("_score_cnt") > 0:
146
+ metrics.log_scalar("_score_sum", sum_logs("_score_sum"))
147
+ metrics.log_scalar("_score_cnt", sum_logs("_score_cnt"))
148
+ metrics.log_derived("score", compute_score)
149
+
150
+ def _inference(self, generator, sample, model):
151
+ gen_out = self.inference_step(generator, [model], sample)
152
+ hyps, refs = [], []
153
+ for i in range(len(gen_out)):
154
+ hyps.append(gen_out[i][0]["tokens"][:-1] - len(self.src_dict) + self.cfg.num_bins)
155
+ refs.append(sample["target"][i][:-1] - len(self.src_dict) + self.cfg.num_bins)
156
+ if self.cfg.eval_print_samples:
157
+ logger.info("example hypothesis: " + str(hyps[0]))
158
+ logger.info("example reference: " + str(refs[0]))
159
+
160
+ return torch.stack(hyps, dim=0), torch.stack(refs, dim=0)
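Note: _calculate_ap_score above intersects boxes coordinate-wise (element-wise max of the top-left corners, element-wise min of the bottom-right corners), computes IoU, and counts a hit when IoU >= 0.5 with a non-empty intersection. A minimal sketch on plain xyxy tensors with hypothetical boxes:

    import torch

    hyps = torch.tensor([[10., 10., 60., 60.]])   # predicted box, xyxy
    refs = torch.tensor([[15., 15., 65., 65.]])   # ground-truth box, xyxy

    tl = torch.max(hyps[:, :2], refs[:, :2])      # intersection top-left
    br = torch.min(hyps[:, 2:], refs[:, 2:])      # intersection bottom-right
    wh = (br - tl).clamp(min=0)                   # zero width/height if the boxes don't overlap
    inter = wh[:, 0] * wh[:, 1]                   # 45 * 45 = 2025
    area_h = (hyps[:, 2] - hyps[:, 0]) * (hyps[:, 3] - hyps[:, 1])   # 2500
    area_r = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1])   # 2500
    iou = inter / (area_h + area_r - inter + 1e-6)                   # ~0.68
    hit = (iou >= 0.5).float()                                       # tensor([1.])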
tasks/mm_tasks/snli_ve.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import json
7
+ import logging
8
+ import math
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional
11
+
12
+ import torch
13
+ from fairseq import metrics
14
+ from fairseq.tasks import register_task
15
+
16
+ from tasks.ofa_task import OFAConfig, OFATask
17
+ from data.mm_data.snli_ve_dataset import SnliVeDataset
18
+ from data.file_dataset import FileDataset
19
+ from data import data_utils
20
+ from utils.trie import Trie
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ @dataclass
26
+ class SnliVeConfig(OFAConfig):
27
+ ans2label_dict: Optional[str] = field(
28
+ default='{"no": 0, "yes":1, "maybe": 2}',
29
+ metadata={"help": 'answer to label dict'},
30
+ )
31
+ add_caption: bool = field(
32
+ default=False,
33
+ metadata={"help": "add caption to encoder"},
34
+ )
35
+ valid_batch_size: int = field(
36
+ default=20,
37
+ metadata={"help": "valid batch size per step"},
38
+ )
39
+ prompt_type: Optional[str] = field(
40
+ default=None,
41
+ metadata={"help": "prompt_type"},
42
+ )
43
+
44
+
45
+ @register_task("snli_ve", dataclass=SnliVeConfig)
46
+ class SnliVeTask(OFATask):
47
+ def __init__(self, cfg: SnliVeConfig, src_dict, tgt_dict):
48
+ super().__init__(cfg, src_dict, tgt_dict)
49
+ self.ans2label_dict = json.loads(self.cfg.ans2label_dict)
50
+
51
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
52
+ paths = self.cfg.data.split(',')
53
+ assert len(paths) > 0
54
+
55
+ if split == 'train':
56
+ file_path = paths[(epoch - 1) % (len(paths) - 1)]
57
+ else:
58
+ file_path = paths[-1]
59
+ dataset = FileDataset(file_path, self.cfg.selected_cols)
60
+
61
+ self.datasets[split] = SnliVeDataset(
62
+ split,
63
+ dataset,
64
+ self.bpe,
65
+ self.src_dict,
66
+ self.tgt_dict,
67
+ max_src_length=self.cfg.max_src_length,
68
+ max_tgt_length=self.cfg.max_tgt_length,
69
+ patch_image_size=self.cfg.patch_image_size,
70
+ add_caption=self.cfg.add_caption,
71
+ constraint_trie=self.constraint_trie,
72
+ imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
73
+ prompt_type=self.cfg.prompt_type
74
+ )
75
+
76
+ def build_model(self, cfg):
77
+ model = super().build_model(cfg)
78
+ answer_item_list = []
79
+ self.index2ans = {}
80
+ self.constraint_trie = Trie(self.tgt_dict.eos())
81
+ for i, answer in enumerate(self.ans2label_dict.keys()):
82
+ answer_item = self.tgt_dict.encode_line(
83
+ line=self.bpe.encode(' ' + answer),
84
+ add_if_not_exist=False,
85
+ append_eos=False
86
+ ).long()
87
+ answer_item_list.append(answer_item)
88
+ self.index2ans[i] = answer
89
+ self.constraint_trie.insert([self.tgt_dict.bos()] + answer_item.tolist() + [self.tgt_dict.eos()])
90
+
91
+ constraint_mask_list = []
92
+ for answer_item in answer_item_list:
93
+ constraint_mask = torch.zeros((len(answer_item)+1, len(self.tgt_dict))).bool()
94
+ for i in range(len(answer_item)+1):
95
+ constraint_prefix_token = [self.src_dict.bos()] + answer_item[:i].tolist()
96
+ constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
97
+ constraint_mask[i][constraint_nodes] = True
98
+ constraint_mask_list.append(constraint_mask)
99
+
100
+ self.valid_answers_list = []
101
+ self.valid_constraint_masks_list = []
102
+ for i in range(0, len(answer_item_list), self.cfg.valid_batch_size):
103
+ self.valid_answers_list += [answer_item_list[i:i+self.cfg.valid_batch_size]]
104
+ self.valid_constraint_masks_list += [constraint_mask_list[i:i+self.cfg.valid_batch_size]]
105
+
106
+ return model
107
+
108
+ def build_generator(
109
+ self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
110
+ ):
111
+ seq_generator = super().build_generator(models, args, seq_gen_cls, extra_gen_cls_kwargs, prefix_allowed_tokens_fn)
112
+ seq_generator.constraint_trie = self.constraint_trie
113
+
114
+ return seq_generator
115
+
116
+ def valid_step(self, sample, model, criterion, **extra_kwargs):
117
+ loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
118
+
119
+ model.eval()
120
+ with torch.no_grad():
121
+ encoder_out = model.encoder(
122
+ sample["net_input"]["src_tokens"],
123
+ src_lengths=sample["net_input"]["src_lengths"],
124
+ patch_images=sample["net_input"]["patch_images"],
125
+ patch_masks=sample["net_input"]["patch_masks"]
126
+ )
127
+ device = sample["net_input"]["src_tokens"].device
128
+ eos_item = torch.tensor([self.src_dict.eos()])
129
+ pad = self.src_dict.pad()
130
+ valid_result = []
131
+ for valid_answers, valid_constraint_masks in zip(self.valid_answers_list, self.valid_constraint_masks_list):
132
+ valid_size = len(valid_answers)
133
+ valid_tgt_items = [
134
+ torch.cat([torch.tensor(decoder_prompt[1:]), valid_answer, eos_item])
135
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
136
+ ]
137
+ valid_prev_items = [
138
+ torch.cat([torch.tensor(decoder_prompt), valid_answer])
139
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
140
+ ]
141
+ valid_constraint_mask_items = [
142
+ torch.cat([torch.zeros(len(decoder_prompt)-1, valid_constraint_mask.size(1)).bool(), valid_constraint_mask], dim=0)
143
+ for decoder_prompt in sample["decoder_prompts"] for valid_constraint_mask in valid_constraint_masks
144
+ ]
145
+ valid_tgt = data_utils.collate_tokens(valid_tgt_items, pad_idx=pad, left_pad=False).to(device)
146
+ valid_prev_output = data_utils.collate_tokens(valid_prev_items, pad_idx=pad, left_pad=False).to(device)
147
+ valid_constraint_masks = data_utils.collate_tokens(valid_constraint_mask_items, pad_idx=pad, left_pad=False).to(device)
148
+
149
+ new_encoder_out = {}
150
+ new_encoder_out["encoder_out"] = [
151
+ encoder_out["encoder_out"][0].repeat_interleave(valid_size, dim=1)
152
+ ]
153
+ new_encoder_out["encoder_padding_mask"] = [
154
+ encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_size, dim=0)
155
+ ]
156
+ new_encoder_out["position_embeddings"] = [
157
+ encoder_out["position_embeddings"][0].repeat_interleave(valid_size, dim=0)
158
+ ]
159
+
160
+ decoder_out = model.decoder(valid_prev_output, encoder_out=new_encoder_out)
161
+ decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
162
+ lprobs = model.get_normalized_probs(decoder_out, log_probs=True)
163
+ scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
164
+ scores = scores.masked_fill(valid_tgt.eq(self.tgt_dict.pad()), 0)
165
+ scores = scores.masked_fill((~valid_constraint_masks).all(2), 0)
166
+ scores = scores.sum(1)
167
+ scores = scores.view(-1, valid_size)
168
+ valid_result.append(scores)
169
+
170
+ valid_result = torch.cat(valid_result, dim=-1)
171
+ predicts = valid_result.argmax(1).tolist()
172
+ hyps = [self.index2ans[predict_index] for predict_index in predicts]
173
+ scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
174
+ logging_output["_snli_score_sum"] = sum(scores)
175
+ logging_output["_snli_cnt"] = len(scores)
176
+
177
+ return loss, sample_size, logging_output
178
+
179
+ def reduce_metrics(self, logging_outputs, criterion):
180
+ super().reduce_metrics(logging_outputs, criterion)
181
+
182
+ def sum_logs(key):
183
+ import torch
184
+ result = sum(log.get(key, 0) for log in logging_outputs)
185
+ if torch.is_tensor(result):
186
+ result = result.cpu()
187
+ return result
188
+
189
+ def compute_score(meters):
190
+ score = meters["_snli_score_sum"].sum / meters["_snli_cnt"].sum
191
+ score = score if isinstance(score, float) else score.item()
192
+ return round(score, 4)
193
+
194
+ if sum_logs("_snli_cnt") > 0:
195
+ metrics.log_scalar("_snli_score_sum", sum_logs("_snli_score_sum"))
196
+ metrics.log_scalar("_snli_cnt", sum_logs("_snli_cnt"))
197
+ metrics.log_derived("snli_score", compute_score)
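Note: the SNLI-VE task above scores every candidate answer ("no"/"yes"/"maybe") under a Trie built from the BPE-encoded answers, so that at each decoder position only tokens that continue some valid answer are kept; get_next_layer returns the allowed next tokens for a given prefix, and the task turns those lists into boolean constraint masks that it applies to the decoder logits with masked_fill_. A minimal sketch of how such a trie is queried (token ids below are made up):

    from utils.trie import Trie

    eos = 2                        # hypothetical eos id
    trie = Trie(eos)
    trie.insert([0, 117, eos])     # bos + token of " no" + eos
    trie.insert([0, 945, eos])     # bos + token of " yes" + eos

    first_tokens = trie.get_next_layer([0])        # tokens allowed right after bos
    after_no = trie.get_next_layer([0, 117])       # only eos can follow a completed answer
    # a constraint mask row is True at these ids; all other logits are
    # masked to -inf before the candidate scores are summed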
tasks/mm_tasks/vqa_gen.py ADDED
@@ -0,0 +1,278 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from dataclasses import dataclass, field
7
+ import json
8
+ import logging
9
+ import os
10
+ import math
11
+ import pickle
12
+ from typing import Optional
13
+ from argparse import Namespace
14
+ from data.file_dataset import FileDataset
15
+
16
+ import torch
17
+ from fairseq import metrics
18
+ from fairseq.tasks import register_task
19
+
20
+ from models import search
21
+ from data.mm_data.vqa_gen_dataset import VqaGenDataset
22
+ from data import data_utils
23
+ from tasks.ofa_task import OFAConfig, OFATask
24
+ from utils.trie import Trie
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ def get_symbols_to_strip_from_output(generator):
30
+ if hasattr(generator, "symbols_to_strip_from_output"):
31
+ return generator.symbols_to_strip_from_output
32
+ else:
33
+ return {generator.bos, generator.eos}
34
+
35
+
36
+ def decode_fn(x, tgt_dict, bpe, generator, tokenizer=None):
37
+ x = tgt_dict.string(x.int().cpu(), extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator))
38
+ if bpe is not None:
39
+ x = bpe.decode(x)
40
+ if tokenizer is not None:
41
+ x = tokenizer.decode(x)
42
+ return x
43
+
44
+
45
+ @dataclass
46
+ class VqaGenConfig(OFAConfig):
47
+ max_object_length: int = field(
48
+ default=30, metadata={"help": "the maximum object sequence length"}
49
+ )
50
+ ans2label_dict: Optional[str] = field(
51
+ default='{"no": 0, "yes":1}',
52
+ metadata={"help": 'answer to label dict'},
53
+ )
54
+ ans2label_file: Optional[str] = field(
55
+ default=None,
56
+ metadata={"help": "path to load ans2label file"},
57
+ )
58
+
59
+ add_object: bool = field(
60
+ default=False,
61
+ metadata={"help": "add object to encoder"},
62
+ )
63
+ valid_batch_size: int = field(
64
+ default=20,
65
+ metadata={"help": "valid batch size per step"},
66
+ )
67
+ prompt_type: Optional[str] = field(
68
+ default=None,
69
+ metadata={"help": "prompt_type"},
70
+ )
71
+ uses_ema: Optional[bool] = field(
72
+ default=False,
73
+ metadata={"help": "whether to use ema"},
74
+ )
75
+ val_inference_type: Optional[str] = field(
76
+ default='allcand',
77
+ metadata={"help": "inference type in validation (allcand or beamsearch), default to allcand"},
78
+ )
79
+ eval_args: Optional[str] = field(
80
+ default='{"beam":5,"unnormalized":true,"temperature":1.0}',
81
+ metadata={
82
+ "help": 'generation args as JSON string for inference, only activated when --val-inference-type=beamsearch'
83
+ },
84
+ )
85
+
86
+
87
+ @register_task("vqa_gen", dataclass=VqaGenConfig)
88
+ class VqaGenTask(OFATask):
89
+ def __init__(self, cfg: VqaGenConfig, src_dict, tgt_dict):
90
+ super().__init__(cfg, src_dict, tgt_dict)
91
+
92
+ self.ans2label_dict = None
93
+ if self.cfg.ans2label_file is not None:
94
+ self.ans2label_dict = pickle.load(open(self.cfg.ans2label_file, "rb"))
95
+ else:
96
+ self.ans2label_dict = json.loads(self.cfg.ans2label_dict)
97
+
98
+ self.uses_ema = self.cfg.uses_ema
99
+
100
+ assert self.cfg.val_inference_type in ["allcand", "beamsearch"], \
101
+ "Unknown inference type encountered: {}, should be allcand or beamsearch.".format(self.cfg.val_inference_type)
102
+
103
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
104
+ paths = self.cfg.data.split(',')
105
+ assert len(paths) > 0
106
+
107
+ if split == 'train':
108
+ table_path = paths[(epoch - 1) % (len(paths) - 1)]
109
+ else:
110
+ table_path = paths[-1]
111
+ dataset = FileDataset(table_path, self.cfg.selected_cols)
112
+
113
+ self.datasets[split] = VqaGenDataset(
114
+ split,
115
+ dataset,
116
+ self.bpe,
117
+ self.src_dict,
118
+ self.tgt_dict,
119
+ max_src_length=self.cfg.max_src_length,
120
+ max_object_length=self.cfg.max_object_length,
121
+ max_tgt_length=self.cfg.max_tgt_length,
122
+ patch_image_size=self.cfg.patch_image_size,
123
+ add_object=self.cfg.add_object,
124
+ constraint_trie=self.constraint_trie,
125
+ imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
126
+ prompt_type=self.cfg.prompt_type
127
+ )
128
+
129
+ def build_model(self, cfg):
130
+ model = super().build_model(cfg)
131
+ answer_item_list = []
132
+ self.index2ans = {}
133
+ self.constraint_trie = Trie(self.tgt_dict.eos())
134
+ for i, answer in enumerate(self.ans2label_dict.keys()):
135
+ answer_item = self.tgt_dict.encode_line(
136
+ line=self.bpe.encode(' ' + answer),
137
+ add_if_not_exist=False,
138
+ append_eos=False
139
+ ).long()
140
+ answer_item_list.append(answer_item)
141
+ self.index2ans[i] = answer
142
+ self.constraint_trie.insert([self.tgt_dict.bos()] + answer_item.tolist() + [self.tgt_dict.eos()])
143
+
144
+ constraint_mask_list = []
145
+ for answer_item in answer_item_list:
146
+ constraint_mask = torch.zeros((len(answer_item)+1, len(self.tgt_dict))).bool()
147
+ for i in range(len(answer_item)+1):
148
+ constraint_prefix_token = [self.src_dict.bos()] + answer_item[:i].tolist()
149
+ constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
150
+ constraint_mask[i][constraint_nodes] = True
151
+ constraint_mask_list.append(constraint_mask)
152
+
153
+ if self.cfg.val_inference_type == "allcand":
154
+ self.valid_answers_list = []
155
+ self.valid_constraint_masks_list = []
156
+ for i in range(0, len(answer_item_list), self.cfg.valid_batch_size):
157
+ self.valid_answers_list += [answer_item_list[i:i+self.cfg.valid_batch_size]]
158
+ self.valid_constraint_masks_list += [constraint_mask_list[i:i+self.cfg.valid_batch_size]]
159
+ elif self.cfg.val_inference_type == "beamsearch":
160
+ gen_args = json.loads(self.cfg.eval_args)
161
+ self.generator = self.build_generator(
162
+ [model], Namespace(**gen_args)
163
+ )
164
+ else:
165
+ raise NotImplementedError("Error: Unknown inference type encountered.")
166
+
167
+ return model
168
+
169
+ def build_generator(
170
+ self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
171
+ ):
172
+ seq_generator = super().build_generator(models, args, seq_gen_cls, extra_gen_cls_kwargs, prefix_allowed_tokens_fn)
173
+ seq_generator.constraint_trie = self.constraint_trie
174
+
175
+ return seq_generator
176
+
177
+ def valid_step(self, sample, model, criterion, **extra_kwargs):
178
+ loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
179
+
180
+ if self.uses_ema:
181
+ assert 'ema_model' in extra_kwargs and extra_kwargs['ema_model'] is not None
182
+ if self.uses_ema:
183
+ eval_model = extra_kwargs['ema_model']
184
+ else:
185
+ eval_model = model
186
+
187
+ eval_model.eval()
188
+ with torch.no_grad():
189
+ if self.cfg.val_inference_type == "allcand":
190
+ encoder_out = eval_model.encoder(
191
+ sample["net_input"]["src_tokens"],
192
+ src_lengths=sample["net_input"]["src_lengths"],
193
+ patch_images=sample["net_input"]["patch_images"],
194
+ patch_masks=sample["net_input"]["patch_masks"]
195
+ )
196
+ device = sample["net_input"]["src_tokens"].device
197
+ eos_item = torch.tensor([self.src_dict.eos()])
198
+ pad = self.src_dict.pad()
199
+ valid_result = []
200
+ for valid_answers, valid_constraint_masks in zip(self.valid_answers_list, self.valid_constraint_masks_list):
201
+ valid_size = len(valid_answers)
202
+ valid_tgt_items = [
203
+ torch.cat([torch.tensor(decoder_prompt[1:]), valid_answer, eos_item])
204
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
205
+ ]
206
+ valid_prev_items = [
207
+ torch.cat([torch.tensor(decoder_prompt), valid_answer])
208
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
209
+ ]
210
+ valid_constraint_mask_items = [
211
+ torch.cat([torch.zeros(len(decoder_prompt)-1, valid_constraint_mask.size(1)).bool(), valid_constraint_mask], dim=0)
212
+ for decoder_prompt in sample["decoder_prompts"] for valid_constraint_mask in valid_constraint_masks
213
+ ]
214
+ valid_tgt = data_utils.collate_tokens(valid_tgt_items, pad_idx=pad, left_pad=False).to(device)
215
+ valid_prev_output = data_utils.collate_tokens(valid_prev_items, pad_idx=pad, left_pad=False).to(device)
216
+ valid_constraint_masks = data_utils.collate_tokens(valid_constraint_mask_items, pad_idx=pad, left_pad=False).to(device)
217
+
218
+ new_encoder_out = {}
219
+ new_encoder_out["encoder_out"] = [
220
+ encoder_out["encoder_out"][0].repeat_interleave(valid_size, dim=1)
221
+ ]
222
+ new_encoder_out["encoder_padding_mask"] = [
223
+ encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_size, dim=0)
224
+ ]
225
+ new_encoder_out["position_embeddings"] = [
226
+ encoder_out["position_embeddings"][0].repeat_interleave(valid_size, dim=0)
227
+ ]
228
+
229
+ decoder_out = eval_model.decoder(valid_prev_output, encoder_out=new_encoder_out)
230
+ decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
231
+ lprobs = eval_model.get_normalized_probs(decoder_out, log_probs=True)
232
+ scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
233
+ scores = scores.masked_fill(valid_tgt.eq(self.tgt_dict.pad()), 0)
234
+ scores = scores.masked_fill((~valid_constraint_masks).all(2), 0)
235
+ scores = scores.sum(1)
236
+ scores = scores.view(-1, valid_size)
237
+ valid_result.append(scores)
238
+
239
+ valid_result = torch.cat(valid_result, dim=-1)
240
+ predicts = valid_result.argmax(1).tolist()
241
+ hyps = [self.index2ans[predict_index] for predict_index in predicts]
242
+
243
+ elif self.cfg.val_inference_type == "beamsearch":
244
+ raw_hyps = self.inference_step(self.generator, [eval_model], sample, prefix_tokens=sample['prefix_tokens'])
245
+ hyps = []
246
+ for i, sample_id in enumerate(sample["id"].tolist()):
247
+ prefix_len = sample['prefix_tokens'][i].ne(1).sum().item()
248
+ detok_hypo_str = decode_fn(raw_hyps[i][0]["tokens"][prefix_len:], self.tgt_dict, self.bpe, self.generator)
249
+ hyps.append(detok_hypo_str.strip())
250
+
251
+ else:
252
+ raise NotImplementedError("Error: Unknown inference type encountered.")
253
+
254
+ scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
255
+ logging_output["_vqa_score_sum"] = sum(scores)
256
+ logging_output["_vqa_cnt"] = len(scores)
257
+
258
+ return loss, sample_size, logging_output
259
+
260
+ def reduce_metrics(self, logging_outputs, criterion):
261
+ super().reduce_metrics(logging_outputs, criterion)
262
+
263
+ def sum_logs(key):
264
+ import torch
265
+ result = sum(log.get(key, 0) for log in logging_outputs)
266
+ if torch.is_tensor(result):
267
+ result = result.cpu()
268
+ return result
269
+
270
+ def compute_score(meters):
271
+ score = meters["_vqa_score_sum"].sum / meters["_vqa_cnt"].sum
272
+ score = score if isinstance(score, float) else score.item()
273
+ return round(score, 4)
274
+
275
+ if sum_logs("_vqa_cnt") > 0:
276
+ metrics.log_scalar("_vqa_score_sum", sum_logs("_vqa_score_sum"))
277
+ metrics.log_scalar("_vqa_cnt", sum_logs("_vqa_cnt"))
278
+ metrics.log_derived("vqa_score", compute_score)
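For the allcand validation path in vqa_gen.py, every candidate answer is inserted into the token-level Trie added under utils/trie.py, and the trie is then queried with each decoded prefix to build the boolean constraint masks. The sketch below illustrates that mechanism under the assumption that Trie exposes insert() and get_next_layer() exactly as they are called above; the token ids (0 for bos, 2 for eos, 100-102 for answer pieces) are placeholders rather than real dictionary indices:

from utils.trie import Trie

BOS, EOS = 0, 2
# Two hypothetical answers, already BPE-encoded into token ids.
answers = {"yes": [100], "no entry": [101, 102]}

constraint_trie = Trie(EOS)
for token_ids in answers.values():
    # Same insertion pattern as VqaGenTask.build_model: bos + answer tokens + eos.
    constraint_trie.insert([BOS] + token_ids + [EOS])

# Allowed first tokens after bos: the first token of every candidate answer.
print(constraint_trie.get_next_layer([BOS]))       # e.g. [100, 101]
# Once 101 ("no") has been generated, only 102 ("entry") may follow.
print(constraint_trie.get_next_layer([BOS, 101]))  # e.g. [102]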
tasks/ofa_task.py ADDED
@@ -0,0 +1,337 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from dataclasses import dataclass, field
7
+ import logging
8
+ import os
9
+ import math
10
+ import torch
11
+ from typing import Dict, Optional
12
+
13
+ from fairseq import search
14
+ from fairseq.data import FairseqDataset, iterators
15
+ from fairseq.optim.amp_optimizer import AMPOptimizer
16
+ from fairseq.dataclass import FairseqDataclass
17
+ from fairseq.tasks import FairseqTask, register_task
18
+ from omegaconf import DictConfig
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ @dataclass
25
+ class OFAConfig(FairseqDataclass):
26
+ data: Optional[str] = field(
27
+ default=None,
28
+ metadata={
29
+ "help": "comma separated path to data list, will be iterated upon during epochs "
30
+ "in round-robin manner; validation data is always the last path"
31
+ },
32
+ )
33
+ selected_cols: Optional[str] = field(
34
+ default=None,
35
+ metadata={"help": "selected cols"},
36
+ )
37
+ bpe_dir: Optional[str] = field(
38
+ default=None,
39
+ metadata={"help": "bpe dir"},
40
+ )
41
+ max_source_positions: int = field(
42
+ default=1024, metadata={"help": "max number of tokens in the source sequence"}
43
+ )
44
+ max_target_positions: int = field(
45
+ default=1024, metadata={"help": "max number of tokens in the target sequence"}
46
+ )
47
+ max_src_length: int = field(
48
+ default=128, metadata={"help": "the maximum src sequence length"}
49
+ )
50
+ max_tgt_length: int = field(
51
+ default=30, metadata={"help": "the maximum target sequence length"}
52
+ )
53
+
54
+ code_dict_size: int = field(
55
+ default=8192, metadata={"help": "code dict size"}
56
+ )
57
+ patch_image_size: int = field(
58
+ default=480, metadata={"help": "patch image size"}
59
+ )
60
+ num_bins: int = field(
61
+ default=1000, metadata={"help": "number of quantization bins"}
62
+ )
63
+
64
+ imagenet_default_mean_and_std: bool = field(
65
+ default=False,
66
+ metadata={"help": "imagenet normalize"},
67
+ )
68
+ constraint_range: Optional[str] = field(
69
+ default=None,
70
+ metadata={"help": "constraint range"}
71
+ )
72
+
73
+
74
+ @register_task("ofa", dataclass=OFAConfig)
75
+ class OFATask(FairseqTask):
76
+ def __init__(self, cfg: OFAConfig, src_dict, tgt_dict):
77
+ super().__init__(cfg)
78
+ self.src_dict = src_dict
79
+ self.tgt_dict = tgt_dict
80
+
81
+ @classmethod
82
+ def setup_task(cls, cfg: DictConfig, **kwargs):
83
+ """Setup the task."""
84
+
85
+ # load dictionaries
86
+ src_dict = cls.load_dictionary(
87
+ os.path.join(cfg.bpe_dir, "dict.txt")
88
+ )
89
+ tgt_dict = cls.load_dictionary(
90
+ os.path.join(cfg.bpe_dir, "dict.txt")
91
+ )
92
+ src_dict.add_symbol("<mask>")
93
+ tgt_dict.add_symbol("<mask>")
94
+ for i in range(cfg.code_dict_size):
95
+ src_dict.add_symbol("<code_{}>".format(i))
96
+ tgt_dict.add_symbol("<code_{}>".format(i))
97
+ # quantization
98
+ for i in range(cfg.num_bins):
99
+ src_dict.add_symbol("<bin_{}>".format(i))
100
+ tgt_dict.add_symbol("<bin_{}>".format(i))
101
+
102
+ logger.info("source dictionary: {} types".format(len(src_dict)))
103
+ logger.info("target dictionary: {} types".format(len(tgt_dict)))
104
+ return cls(cfg, src_dict, tgt_dict)
105
+
106
+ def get_batch_iterator(
107
+ self,
108
+ dataset,
109
+ max_tokens=None,
110
+ max_sentences=None,
111
+ max_positions=None,
112
+ ignore_invalid_inputs=False,
113
+ required_batch_size_multiple=1,
114
+ seed=1,
115
+ num_shards=1,
116
+ shard_id=0,
117
+ num_workers=0,
118
+ epoch=1,
119
+ data_buffer_size=0,
120
+ disable_iterator_cache=False,
121
+ ):
122
+ assert isinstance(dataset, FairseqDataset)
123
+
124
+ # initialize the dataset with the correct starting epoch
125
+ dataset.set_epoch(epoch)
126
+
127
+ # create mini-batches with given size constraints
128
+ batch_sampler = [
129
+ [j for j in range(i, min(i + max_sentences, len(dataset)))]
130
+ for i in range(0, len(dataset), max_sentences)
131
+ ]
132
+ total_row_count = dataset.dataset.get_total_row_count()
133
+ num_batches = math.ceil(math.ceil(total_row_count / num_shards) / max_sentences)
134
+ if len(batch_sampler) < num_batches:
135
+ batch_sampler.append([])
136
+
137
+ # return a reusable, sharded iterator
138
+ epoch_iter = iterators.EpochBatchIterator(
139
+ dataset=dataset,
140
+ collate_fn=dataset.collater,
141
+ batch_sampler=batch_sampler,
142
+ seed=seed,
143
+ num_shards=1,
144
+ shard_id=0,
145
+ num_workers=num_workers,
146
+ epoch=epoch,
147
+ buffer_size=data_buffer_size
148
+ )
149
+
150
+ return epoch_iter
151
+
152
+ def build_model(self, cfg: FairseqDataclass):
153
+ model = super().build_model(cfg)
154
+ bpe_dict = {
155
+ "_name": "gpt2",
156
+ "gpt2_encoder_json": os.path.join(self.cfg.bpe_dir, "encoder.json"),
157
+ "gpt2_vocab_bpe": os.path.join(self.cfg.bpe_dir, "vocab.bpe")
158
+ }
159
+ bpe_dict = DictConfig(bpe_dict)
160
+ self.bpe = self.build_bpe(bpe_dict)
161
+ return model
162
+
163
+ def build_generator(
164
+ self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
165
+ ):
166
+ """
167
+ Build a :class:`~fairseq.SequenceGenerator` instance for this
168
+ task.
169
+
170
+ Args:
171
+ models (List[~fairseq.models.FairseqModel]): ensemble of models
172
+ args (fairseq.dataclass.configs.GenerationConfig):
173
+ configuration object (dataclass) for generation
174
+ extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass
175
+ through to SequenceGenerator
176
+ prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]):
177
+ If provided, this function constrains the beam search to
178
+ allowed tokens only at each step. The provided function
179
+ should take 2 arguments: the batch ID (`batch_id: int`)
180
+ and a unidimensional tensor of token ids (`inputs_ids:
181
+ torch.Tensor`). It has to return a `List[int]` with the
182
+ allowed tokens for the next generation step conditioned
183
+ on the previously generated tokens (`inputs_ids`) and
184
+ the batch ID (`batch_id`). This argument is useful for
185
+ constrained generation conditioned on the prefix, as
186
+ described in "Autoregressive Entity Retrieval"
187
+ (https://arxiv.org/abs/2010.00904) and
188
+ https://github.com/facebookresearch/GENRE.
189
+ """
190
+ if getattr(args, "score_reference", False):
191
+ from fairseq.sequence_scorer import SequenceScorer
192
+
193
+ return SequenceScorer(
194
+ self.target_dictionary,
195
+ compute_alignment=getattr(args, "print_alignment", False),
196
+ )
197
+
198
+ from fairseq.sequence_generator import (
199
+ # SequenceGenerator,
200
+ SequenceGeneratorWithAlignment,
201
+ )
202
+ from models.sequence_generator import SequenceGenerator
203
+
204
+ # Choose search strategy. Defaults to Beam Search.
205
+ sampling = getattr(args, "sampling", False)
206
+ sampling_topk = getattr(args, "sampling_topk", -1)
207
+ sampling_topp = getattr(args, "sampling_topp", -1.0)
208
+ diverse_beam_groups = getattr(args, "diverse_beam_groups", -1)
209
+ diverse_beam_strength = getattr(args, "diverse_beam_strength", 0.5)
210
+ match_source_len = getattr(args, "match_source_len", False)
211
+ diversity_rate = getattr(args, "diversity_rate", -1)
212
+ constrained = getattr(args, "constraints", False)
213
+ if prefix_allowed_tokens_fn is None:
214
+ prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None)
215
+ if (
216
+ sum(
217
+ int(cond)
218
+ for cond in [
219
+ sampling,
220
+ diverse_beam_groups > 0,
221
+ match_source_len,
222
+ diversity_rate > 0,
223
+ ]
224
+ )
225
+ > 1
226
+ ):
227
+ raise ValueError("Provided Search parameters are mutually exclusive.")
228
+ assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
229
+ assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"
230
+
231
+ if sampling:
232
+ search_strategy = search.Sampling(
233
+ self.target_dictionary, sampling_topk, sampling_topp
234
+ )
235
+ elif diverse_beam_groups > 0:
236
+ search_strategy = search.DiverseBeamSearch(
237
+ self.target_dictionary, diverse_beam_groups, diverse_beam_strength
238
+ )
239
+ elif match_source_len:
240
+ # this is useful for tagging applications where the output
241
+ # length should match the input length, so we hardcode the
242
+ # length constraints for simplicity
243
+ search_strategy = search.LengthConstrainedBeamSearch(
244
+ self.target_dictionary,
245
+ min_len_a=1,
246
+ min_len_b=0,
247
+ max_len_a=1,
248
+ max_len_b=0,
249
+ )
250
+ elif diversity_rate > -1:
251
+ search_strategy = search.DiverseSiblingsSearch(
252
+ self.target_dictionary, diversity_rate
253
+ )
254
+ elif constrained:
255
+ search_strategy = search.LexicallyConstrainedBeamSearch(
256
+ self.target_dictionary, args.constraints
257
+ )
258
+ elif prefix_allowed_tokens_fn:
259
+ search_strategy = search.PrefixConstrainedBeamSearch(
260
+ self.target_dictionary, prefix_allowed_tokens_fn
261
+ )
262
+ else:
263
+ search_strategy = search.BeamSearch(self.target_dictionary)
264
+
265
+ extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}
266
+ if seq_gen_cls is None:
267
+ if getattr(args, "print_alignment", False):
268
+ seq_gen_cls = SequenceGeneratorWithAlignment
269
+ extra_gen_cls_kwargs["print_alignment"] = args.print_alignment
270
+ else:
271
+ seq_gen_cls = SequenceGenerator
272
+
273
+ return seq_gen_cls(
274
+ models,
275
+ self.target_dictionary,
276
+ beam_size=getattr(args, "beam", 5),
277
+ max_len_a=getattr(args, "max_len_a", 0),
278
+ max_len_b=getattr(args, "max_len_b", 200),
279
+ min_len=getattr(args, "min_len", 1),
280
+ normalize_scores=(not getattr(args, "unnormalized", False)),
281
+ len_penalty=getattr(args, "lenpen", 1),
282
+ unk_penalty=getattr(args, "unkpen", 0),
283
+ temperature=getattr(args, "temperature", 1.0),
284
+ match_source_len=getattr(args, "match_source_len", False),
285
+ no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
286
+ search_strategy=search_strategy,
287
+ constraint_range=self.cfg.constraint_range,
288
+ **extra_gen_cls_kwargs,
289
+ )
290
+
291
+ def train_step(
292
+ self, sample, model, criterion, optimizer, update_num, ignore_grad=False, **extra_kwargs
293
+ ):
294
+ """
295
+ Do forward and backward, and return the loss as computed by *criterion*
296
+ for the given *model* and *sample*.
297
+
298
+ Args:
299
+ sample (dict): the mini-batch. The format is defined by the
300
+ :class:`~fairseq.data.FairseqDataset`.
301
+ model (~fairseq.models.BaseFairseqModel): the model
302
+ criterion (~fairseq.criterions.FairseqCriterion): the criterion
303
+ optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
304
+ update_num (int): the current update
305
+ ignore_grad (bool): multiply loss by 0 if this is set to True
306
+
307
+ Returns:
308
+ tuple:
309
+ - the loss
310
+ - the sample size, which is used as the denominator for the
311
+ gradient
312
+ - logging outputs to display while training
313
+ """
314
+ model.train()
315
+ model.set_num_updates(update_num)
316
+ with torch.autograd.profiler.record_function("forward"):
317
+ with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))):
318
+ loss, sample_size, logging_output = criterion(model, sample, update_num=update_num)
319
+ if ignore_grad:
320
+ loss *= 0
321
+ with torch.autograd.profiler.record_function("backward"):
322
+ optimizer.backward(loss)
323
+ return loss, sample_size, logging_output
324
+
325
+ def max_positions(self):
326
+ """Return the max sentence length allowed by the task."""
327
+ return (self.cfg.max_source_positions, self.cfg.max_target_positions)
328
+
329
+ @property
330
+ def source_dictionary(self):
331
+ """Return the source :class:`~fairseq.data.Dictionary`."""
332
+ return self.src_dict
333
+
334
+ @property
335
+ def target_dictionary(self):
336
+ """Return the target :class:`~fairseq.data.Dictionary`."""
337
+ return self.tgt_dict
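OFATask.get_batch_iterator above bypasses fairseq's token-budget batching and simply chunks the dataset into consecutive fixed-size index lists, appending one empty batch when needed so that every shard steps through the same number of batches. A minimal sketch of that batch-sampler construction, with hypothetical sizes standing in for the dataset length and the configured batch size, is:

# Hypothetical sizes; in the task they come from len(dataset) and max_sentences.
dataset_length = 10
max_sentences = 4

# Same comprehension as OFATask.get_batch_iterator: consecutive fixed-size index chunks.
batch_sampler = [
    [j for j in range(i, min(i + max_sentences, dataset_length))]
    for i in range(0, dataset_length, max_sentences)
]
print(batch_sampler)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]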
utils/BPE/__init__.py ADDED
File without changes
utils/BPE/dict.txt ADDED
The diff for this file is too large to render. See raw diff
utils/BPE/encoder.json ADDED
The diff for this file is too large to render. See raw diff
utils/BPE/vocab.bpe ADDED
The diff for this file is too large to render. See raw diff
utils/__init__.py ADDED
File without changes
utils/checkpoint_utils.py ADDED
@@ -0,0 +1,875 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import ast
7
+ import collections
8
+ import contextlib
9
+ import logging
10
+ import numpy as np
11
+ import os
12
+ import re
13
+ import time
14
+ import traceback
15
+ import math
16
+ from collections import OrderedDict
17
+ from typing import Any, Dict, Optional, Union
18
+
19
+ import torch
20
+ from fairseq.dataclass.configs import CheckpointConfig
21
+ from fairseq.dataclass.utils import (
22
+ convert_namespace_to_omegaconf,
23
+ overwrite_args_by_name,
24
+ )
25
+ from fairseq.distributed.fully_sharded_data_parallel import FSDP, has_FSDP
26
+ from fairseq.file_io import PathManager
27
+ from fairseq.models import FairseqDecoder, FairseqEncoder
28
+ from omegaconf import DictConfig, open_dict, OmegaConf
29
+
30
+ from data import data_utils
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
36
+ from fairseq import meters
37
+
38
+ # only one worker should attempt to create the required dir
39
+ if trainer.data_parallel_rank == 0:
40
+ os.makedirs(cfg.save_dir, exist_ok=True)
41
+
42
+ prev_best = getattr(save_checkpoint, "best", val_loss)
43
+ if val_loss is not None:
44
+ best_function = max if cfg.maximize_best_checkpoint_metric else min
45
+ save_checkpoint.best = best_function(val_loss, prev_best)
46
+
47
+ if cfg.no_save:
48
+ return
49
+
50
+ trainer.consolidate_optimizer() # TODO(SS): do we need this if no_save_optimizer_state
51
+
52
+ if not trainer.should_save_checkpoint_on_current_rank:
53
+ if trainer.always_call_state_dict_during_save_checkpoint:
54
+ trainer.state_dict()
55
+ return
56
+
57
+ write_timer = meters.StopwatchMeter()
58
+ write_timer.start()
59
+
60
+ epoch = epoch_itr.epoch
61
+ end_of_epoch = epoch_itr.end_of_epoch()
62
+ updates = trainer.get_num_updates()
63
+
64
+ logger.info(f"Preparing to save checkpoint for epoch {epoch} @ {updates} updates")
65
+
66
+ def is_better(a, b):
67
+ return a >= b if cfg.maximize_best_checkpoint_metric else a <= b
68
+
69
+ suffix = trainer.checkpoint_suffix
70
+ checkpoint_conds = collections.OrderedDict()
71
+ checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
72
+ end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
73
+ )
74
+ checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
75
+ not end_of_epoch
76
+ and cfg.save_interval_updates > 0
77
+ and updates % cfg.save_interval_updates == 0
78
+ )
79
+ checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
80
+ not hasattr(save_checkpoint, "best")
81
+ or is_better(val_loss, save_checkpoint.best)
82
+ )
83
+ if val_loss is not None and cfg.keep_best_checkpoints > 0:
84
+ worst_best = getattr(save_checkpoint, "best", None)
85
+ chkpts = checkpoint_paths(
86
+ cfg.save_dir,
87
+ pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
88
+ cfg.best_checkpoint_metric, suffix
89
+ ),
90
+ )
91
+ if len(chkpts) > 0:
92
+ p = chkpts[-1] if cfg.maximize_best_checkpoint_metric else chkpts[0]
93
+ worst_best = float(p.rsplit("_")[-1].replace("{}.pt".format(suffix), ""))
94
+ # add random digits to resolve ties
95
+ with data_utils.numpy_seed(epoch, updates, val_loss):
96
+ rand_sfx = np.random.randint(0, cfg.keep_best_checkpoints)
97
+
98
+ checkpoint_conds[
99
+ "checkpoint.best_{}_{:.3f}{}{}.pt".format(
100
+ cfg.best_checkpoint_metric,
101
+ val_loss,
102
+ rand_sfx,
103
+ suffix
104
+ )
105
+ ] = worst_best is None or is_better(val_loss, worst_best)
106
+ checkpoint_conds[
107
+ "checkpoint_last{}.pt".format(suffix)
108
+ ] = not cfg.no_last_checkpoints
109
+
110
+ extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss}
111
+ if hasattr(save_checkpoint, "best"):
112
+ extra_state.update({"best": save_checkpoint.best})
113
+
114
+ checkpoints = [
115
+ os.path.join(cfg.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond
116
+ ]
117
+ if len(checkpoints) > 0:
118
+ trainer.save_checkpoint(checkpoints[0], extra_state)
119
+ for cp in checkpoints[1:]:
120
+ if cfg.write_checkpoints_asynchronously:
121
+ # TODO[ioPath]: Need to implement a delayed asynchronous
122
+ # file copying/moving feature.
123
+ logger.warning(
124
+ f"ioPath is not copying {checkpoints[0]} to {cp} "
125
+ "since async write mode is on."
126
+ )
127
+ else:
128
+ assert PathManager.copy(
129
+ checkpoints[0], cp, overwrite=True
130
+ ), f"Failed to copy {checkpoints[0]} to {cp}"
131
+
132
+ write_timer.stop()
133
+ logger.info(
134
+ "Saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format(
135
+ checkpoints[0], epoch, updates, val_loss, write_timer.sum
136
+ )
137
+ )
138
+
139
+ if not end_of_epoch and cfg.keep_interval_updates > 0:
140
+ # remove old checkpoints; checkpoints are sorted in descending order
141
+ if cfg.keep_interval_updates_pattern == -1:
142
+ checkpoints = checkpoint_paths(
143
+ cfg.save_dir, pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix)
144
+ )
145
+ else:
146
+ checkpoints = checkpoint_paths(
147
+ cfg.save_dir,
148
+ pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix),
149
+ keep_match=True,
150
+ )
151
+ checkpoints = [
152
+ x[0]
153
+ for x in checkpoints
154
+ if x[1] % cfg.keep_interval_updates_pattern != 0
155
+ ]
156
+
157
+ for old_chk in checkpoints[cfg.keep_interval_updates :]:
158
+ if os.path.lexists(old_chk):
159
+ os.remove(old_chk)
160
+ elif PathManager.exists(old_chk):
161
+ PathManager.rm(old_chk)
162
+
163
+ if cfg.keep_last_epochs > 0:
164
+ # remove old epoch checkpoints; checkpoints are sorted in descending order
165
+ checkpoints = checkpoint_paths(
166
+ cfg.save_dir, pattern=r"checkpoint(\d+){}\.pt".format(suffix)
167
+ )
168
+ for old_chk in checkpoints[cfg.keep_last_epochs :]:
169
+ if os.path.lexists(old_chk):
170
+ os.remove(old_chk)
171
+ elif PathManager.exists(old_chk):
172
+ PathManager.rm(old_chk)
173
+
174
+ if cfg.keep_best_checkpoints > 0:
175
+ # only keep the best N checkpoints according to validation metric
176
+ checkpoints = checkpoint_paths(
177
+ cfg.save_dir,
178
+ pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
179
+ cfg.best_checkpoint_metric, suffix
180
+ ),
181
+ )
182
+ if not cfg.maximize_best_checkpoint_metric:
183
+ checkpoints = checkpoints[::-1]
184
+ for old_chk in checkpoints[cfg.keep_best_checkpoints :]:
185
+ if os.path.lexists(old_chk):
186
+ os.remove(old_chk)
187
+ elif PathManager.exists(old_chk):
188
+ PathManager.rm(old_chk)
189
+
190
+
191
+ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
192
+ """
193
+ Load a checkpoint and restore the training iterator.
194
+
195
+ *passthrough_args* will be passed through to
196
+ ``trainer.get_train_iterator``.
197
+ """
198
+
199
+ reset_optimizer = cfg.reset_optimizer
200
+ reset_lr_scheduler = cfg.reset_lr_scheduler
201
+ optimizer_overrides = ast.literal_eval(cfg.optimizer_overrides)
202
+ reset_meters = cfg.reset_meters
203
+ reset_dataloader = cfg.reset_dataloader
204
+
205
+ if cfg.finetune_from_model is not None and (
206
+ reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
207
+ ):
208
+ raise ValueError(
209
+ "--finetune-from-model can not be set together with either --reset-optimizer"
210
+ " or reset_lr_scheduler or reset_meters or reset_dataloader"
211
+ )
212
+
213
+ suffix = trainer.checkpoint_suffix
214
+ if (
215
+ cfg.restore_file == "checkpoint_last.pt"
216
+ ): # default value of restore_file is 'checkpoint_last.pt'
217
+ checkpoint_path = os.path.join(
218
+ cfg.save_dir, "checkpoint_last{}.pt".format(suffix)
219
+ )
220
+ first_launch = not PathManager.exists(checkpoint_path)
221
+ if cfg.finetune_from_model is not None and first_launch:
222
+ # if there is no last checkpoint to restore, start the finetune from pretrained model
223
+ # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc.
224
+ if PathManager.exists(cfg.finetune_from_model):
225
+ checkpoint_path = cfg.finetune_from_model
226
+ reset_optimizer = True
227
+ reset_lr_scheduler = True
228
+ reset_meters = True
229
+ reset_dataloader = True
230
+ logger.info(
231
+ f"loading pretrained model from {checkpoint_path}: "
232
+ "optimizer, lr scheduler, meters, dataloader will be reset"
233
+ )
234
+ else:
235
+ raise ValueError(
236
+ f"--finetune-from-model {cfg.finetune_from_model} does not exist"
237
+ )
238
+ elif suffix is not None:
239
+ checkpoint_path = cfg.restore_file.replace(".pt", suffix + ".pt")
240
+ else:
241
+ checkpoint_path = cfg.restore_file
242
+
243
+ if cfg.restore_file != "checkpoint_last.pt" and cfg.finetune_from_model:
244
+ raise ValueError(
245
+ "--finetune-from-model and --restore-file (non-default value) "
246
+ "can not be specified together: " + str(cfg)
247
+ )
248
+
249
+ extra_state = trainer.load_checkpoint(
250
+ checkpoint_path,
251
+ reset_optimizer,
252
+ reset_lr_scheduler,
253
+ optimizer_overrides,
254
+ reset_meters=reset_meters,
255
+ )
256
+
257
+ if (
258
+ extra_state is not None
259
+ and "best" in extra_state
260
+ and not reset_optimizer
261
+ and not reset_meters
262
+ ):
263
+ save_checkpoint.best = extra_state["best"]
264
+
265
+ if extra_state is not None and not reset_dataloader:
266
+ # restore iterator from checkpoint
267
+ itr_state = extra_state["train_iterator"]
268
+ epoch_itr = trainer.get_train_iterator(
269
+ epoch=itr_state["epoch"], load_dataset=True, **passthrough_args
270
+ )
271
+ epoch_itr.load_state_dict(itr_state)
272
+ _n = itr_state['iterations_in_epoch']
273
+ offset = sum(len(_) for _ in epoch_itr.batch_sampler[:_n])
274
+ epoch_itr.dataset.dataset._seek(offset=offset)
275
+ true_num = int(math.ceil(len(epoch_itr.dataset) / 8)) * 8
276
+ another_offset = ((epoch_itr.epoch - 1) * true_num + offset) // 8
277
+ if hasattr(epoch_itr.dataset, 'pure_text_dataset'):
278
+ text_offset = (2 * another_offset) % len(epoch_itr.dataset.pure_text_dataset)
279
+ epoch_itr.dataset.pure_text_dataset._seek(offset=text_offset)
280
+ if hasattr(epoch_itr.dataset, 'pure_image_dataset'):
281
+ image_offset = another_offset % len(epoch_itr.dataset.pure_image_dataset)
282
+ epoch_itr.dataset.pure_image_dataset._seek(offset=image_offset)
283
+ if hasattr(epoch_itr.dataset, 'detection_dataset'):
284
+ detection_offset = another_offset % len(epoch_itr.dataset.detection_dataset)
285
+ epoch_itr.dataset.detection_dataset._seek(offset=detection_offset)
286
+ else:
287
+ epoch_itr = trainer.get_train_iterator(
288
+ epoch=1, load_dataset=True, **passthrough_args
289
+ )
290
+
291
+ trainer.lr_step(epoch_itr.epoch)
292
+
293
+ return extra_state, epoch_itr
294
+
295
+
296
+ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):
297
+ """Loads a checkpoint to CPU (with upgrading for backward compatibility).
298
+
299
+ If doing single-GPU training or if the checkpoint is only being loaded by at
300
+ most one process on each node (current default behavior is for only rank 0
301
+ to read the checkpoint from disk), load_on_all_ranks should be False to
302
+ avoid errors from torch.distributed not having been initialized or
303
+ torch.distributed.barrier() hanging.
304
+
305
+ If all processes on each node may be loading the checkpoint
306
+ simultaneously, load_on_all_ranks should be set to True to avoid I/O
307
+ conflicts.
308
+
309
+ There's currently no support for > 1 but < all processes loading the
310
+ checkpoint on each node.
311
+ """
312
+ local_path = PathManager.get_local_path(path)
313
+ # The locally cached file returned by get_local_path() may be stale for
314
+ # remote files that are periodically updated/overwritten (ex:
315
+ # checkpoint_last.pt) - so we remove the local copy, sync across processes
316
+ # (if needed), and then download a fresh copy.
317
+ if local_path != path and PathManager.path_requires_pathmanager(path):
318
+ try:
319
+ os.remove(local_path)
320
+ except FileNotFoundError:
321
+ # With potentially multiple processes removing the same file, the
322
+ # file being missing is benign (missing_ok isn't available until
323
+ # Python 3.8).
324
+ pass
325
+ if load_on_all_ranks:
326
+ torch.distributed.barrier()
327
+ local_path = PathManager.get_local_path(path)
328
+
329
+ with open(local_path, "rb") as f:
330
+ state = torch.load(f, map_location=torch.device("cpu"))
331
+
332
+ if "args" in state and state["args"] is not None and arg_overrides is not None:
333
+ args = state["args"]
334
+ for arg_name, arg_val in arg_overrides.items():
335
+ setattr(args, arg_name, arg_val)
336
+
337
+ if "cfg" in state and state["cfg"] is not None:
338
+
339
+ # hack to be able to set Namespace in dict config. this should be removed when we update to newer
340
+ # omegaconf version that supports object flags, or when we migrate all existing models
341
+ from omegaconf import _utils
342
+
343
+ old_primitive = _utils.is_primitive_type
344
+ _utils.is_primitive_type = lambda _: True
345
+
346
+ state["cfg"] = OmegaConf.create(state["cfg"])
347
+
348
+ _utils.is_primitive_type = old_primitive
349
+ OmegaConf.set_struct(state["cfg"], True)
350
+
351
+ if arg_overrides is not None:
352
+ overwrite_args_by_name(state["cfg"], arg_overrides)
353
+
354
+ state = _upgrade_state_dict(state)
355
+ return state
356
+
357
+
358
+ def load_model_ensemble(
359
+ filenames,
360
+ arg_overrides: Optional[Dict[str, Any]] = None,
361
+ task=None,
362
+ strict=True,
363
+ suffix="",
364
+ num_shards=1,
365
+ state=None,
366
+ ):
367
+ """Loads an ensemble of models.
368
+
369
+ Args:
370
+ filenames (List[str]): checkpoint files to load
371
+ arg_overrides (Dict[str,Any], optional): override model args that
372
+ were used during model training
373
+ task (fairseq.tasks.FairseqTask, optional): task to use for loading
374
+ """
375
+ assert not (
376
+ strict and num_shards > 1
377
+ ), "Cannot load state dict with strict=True and checkpoint shards > 1"
378
+ ensemble, args, _task = load_model_ensemble_and_task(
379
+ filenames,
380
+ arg_overrides,
381
+ task,
382
+ strict,
383
+ suffix,
384
+ num_shards,
385
+ state,
386
+ )
387
+ return ensemble, args
388
+
389
+
390
+ def get_maybe_sharded_checkpoint_filename(
391
+ filename: str, suffix: str, shard_idx: int, num_shards: int
392
+ ) -> str:
393
+ orig_filename = filename
394
+ filename = filename.replace(".pt", suffix + ".pt")
395
+ fsdp_filename = filename[:-3] + f"-shard{shard_idx}.pt"
396
+ model_parallel_filename = orig_filename[:-3] + f"_part{shard_idx}.pt"
397
+ if PathManager.exists(fsdp_filename):
398
+ return fsdp_filename
399
+ elif num_shards > 1:
400
+ return model_parallel_filename
401
+ else:
402
+ return filename
403
+
404
+
405
+ def load_model_ensemble_and_task(
406
+ filenames,
407
+ arg_overrides: Optional[Dict[str, Any]] = None,
408
+ task=None,
409
+ strict=True,
410
+ suffix="",
411
+ num_shards=1,
412
+ state=None,
413
+ ):
414
+ assert state is None or len(filenames) == 1
415
+
416
+ from fairseq import tasks
417
+
418
+ assert not (
419
+ strict and num_shards > 1
420
+ ), "Cannot load state dict with strict=True and checkpoint shards > 1"
421
+ ensemble = []
422
+ cfg = None
423
+ for filename in filenames:
424
+ orig_filename = filename
425
+ model_shard_state = {"shard_weights": [], "shard_metadata": []}
426
+ assert num_shards > 0
427
+ st = time.time()
428
+ for shard_idx in range(num_shards):
429
+ filename = get_maybe_sharded_checkpoint_filename(
430
+ orig_filename, suffix, shard_idx, num_shards
431
+ )
432
+
433
+ if not PathManager.exists(filename):
434
+ raise IOError("Model file not found: {}".format(filename))
435
+ if state is None:
436
+ state = load_checkpoint_to_cpu(filename, arg_overrides)
437
+ if "args" in state and state["args"] is not None:
438
+ cfg = convert_namespace_to_omegaconf(state["args"])
439
+ elif "cfg" in state and state["cfg"] is not None:
440
+ cfg = state["cfg"]
441
+ else:
442
+ raise RuntimeError(
443
+ f"Neither args nor cfg exist in state keys = {state.keys()}"
444
+ )
445
+
446
+ if task is None:
447
+ task = tasks.setup_task(cfg.task)
448
+
449
+ if "task_state" in state:
450
+ task.load_state_dict(state["task_state"])
451
+
452
+ if "fsdp_metadata" in state and num_shards > 1:
453
+ model_shard_state["shard_weights"].append(state["model"])
454
+ model_shard_state["shard_metadata"].append(state["fsdp_metadata"])
455
+ # check FSDP import before the code goes too far
456
+ if not has_FSDP:
457
+ raise ImportError(
458
+ "Cannot find FullyShardedDataParallel. "
459
+ "Please install fairscale with: pip install fairscale"
460
+ )
461
+ if shard_idx == num_shards - 1:
462
+ consolidated_model_state = FSDP.consolidate_shard_weights(
463
+ shard_weights=model_shard_state["shard_weights"],
464
+ shard_metadata=model_shard_state["shard_metadata"],
465
+ )
466
+ model = task.build_model(cfg.model)
467
+ model.load_state_dict(
468
+ consolidated_model_state, strict=strict, model_cfg=cfg.model
469
+ )
470
+ else:
471
+ # model parallel checkpoint or unsharded checkpoint
472
+ model = task.build_model(cfg.model)
473
+ model.load_state_dict(
474
+ state["model"], strict=strict, model_cfg=cfg.model
475
+ )
476
+
477
+ # reset state so it gets loaded for the next model in ensemble
478
+ state = None
479
+ if shard_idx % 10 == 0 and shard_idx > 0:
480
+ elapsed = time.time() - st
481
+ logger.info(
482
+ f"Loaded {shard_idx} shards in {elapsed:.2f}s, {elapsed / (shard_idx+1):.2f}s/shard"
483
+ )
484
+
485
+ # build model for ensemble
486
+ ensemble.append(model)
487
+ return ensemble, cfg, task
488
+
489
+
490
+ def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt", keep_match=False):
491
+ """Retrieves all checkpoints found in `path` directory.
492
+
493
+ Checkpoints are identified by matching filename to the specified pattern. If
494
+ the pattern contains groups, the result will be sorted by the first group in
495
+ descending order.
496
+ """
497
+ pt_regexp = re.compile(pattern)
498
+ files = PathManager.ls(path)
499
+
500
+ entries = []
501
+ for i, f in enumerate(files):
502
+ m = pt_regexp.fullmatch(f)
503
+ if m is not None:
504
+ idx = float(m.group(1)) if len(m.groups()) > 0 else i
505
+ entries.append((idx, m.group(0)))
506
+ if keep_match:
507
+ return [(os.path.join(path, x[1]), x[0]) for x in sorted(entries, reverse=True)]
508
+ else:
509
+ return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)]
510
+
511
+
512
+ def torch_persistent_save(obj, filename, async_write: bool = False):
513
+ if async_write:
514
+ with PathManager.opena(filename, "wb") as f:
515
+ _torch_persistent_save(obj, f)
516
+ else:
517
+ with PathManager.open(filename, "wb") as f:
518
+ _torch_persistent_save(obj, f)
519
+ # if PathManager.supports_rename(filename):
520
+ # # do atomic save
521
+ # with PathManager.open(filename + ".tmp", "wb") as f:
522
+ # _torch_persistent_save(obj, f)
523
+ # PathManager.rename(filename + ".tmp", filename)
524
+ # else:
525
+ # # fallback to non-atomic save
526
+ # with PathManager.open(filename, "wb") as f:
527
+ # _torch_persistent_save(obj, f)
528
+
529
+
530
+ def _torch_persistent_save(obj, f):
531
+ if isinstance(f, str):
532
+ with PathManager.open(f, "wb") as h:
533
+ torch_persistent_save(obj, h)
534
+ return
535
+ for i in range(3):
536
+ try:
537
+ return torch.save(obj, f)
538
+ except Exception:
539
+ if i == 2:
540
+ logger.error(traceback.format_exc())
541
+ raise
542
+
543
+
544
+ def _upgrade_state_dict(state):
545
+ """Helper for upgrading old model checkpoints."""
546
+
547
+ # add optimizer_history
548
+ if "optimizer_history" not in state:
549
+ state["optimizer_history"] = [
550
+ {"criterion_name": "CrossEntropyCriterion", "best_loss": state["best_loss"]}
551
+ ]
552
+ state["last_optimizer_state"] = state["optimizer"]
553
+ del state["optimizer"]
554
+ del state["best_loss"]
555
+ # move extra_state into sub-dictionary
556
+ if "epoch" in state and "extra_state" not in state:
557
+ state["extra_state"] = {
558
+ "epoch": state["epoch"],
559
+ "batch_offset": state["batch_offset"],
560
+ "val_loss": state["val_loss"],
561
+ }
562
+ del state["epoch"]
563
+ del state["batch_offset"]
564
+ del state["val_loss"]
565
+ # reduce optimizer history's memory usage (only keep the last state)
566
+ if "optimizer" in state["optimizer_history"][-1]:
567
+ state["last_optimizer_state"] = state["optimizer_history"][-1]["optimizer"]
568
+ for optim_hist in state["optimizer_history"]:
569
+ del optim_hist["optimizer"]
570
+ # record the optimizer class name
571
+ if "optimizer_name" not in state["optimizer_history"][-1]:
572
+ state["optimizer_history"][-1]["optimizer_name"] = "FairseqNAG"
573
+ # move best_loss into lr_scheduler_state
574
+ if "lr_scheduler_state" not in state["optimizer_history"][-1]:
575
+ state["optimizer_history"][-1]["lr_scheduler_state"] = {
576
+ "best": state["optimizer_history"][-1]["best_loss"]
577
+ }
578
+ del state["optimizer_history"][-1]["best_loss"]
579
+ # keep track of number of updates
580
+ if "num_updates" not in state["optimizer_history"][-1]:
581
+ state["optimizer_history"][-1]["num_updates"] = 0
582
+ # old model checkpoints may not have separate source/target positions
583
+ if (
584
+ "args" in state
585
+ and hasattr(state["args"], "max_positions")
586
+ and not hasattr(state["args"], "max_source_positions")
587
+ ):
588
+ state["args"].max_source_positions = state["args"].max_positions
589
+ state["args"].max_target_positions = state["args"].max_positions
590
+ # use stateful training data iterator
591
+ if "train_iterator" not in state["extra_state"]:
592
+ state["extra_state"]["train_iterator"] = {
593
+ "epoch": state["extra_state"]["epoch"],
594
+ "iterations_in_epoch": state["extra_state"].get("batch_offset", 0),
595
+ }
596
+
597
+ # backward compatibility, cfg updates
598
+ if "args" in state and state["args"] is not None:
599
+ # default to translation task
600
+ if not hasattr(state["args"], "task"):
601
+ state["args"].task = "translation"
602
+ # --raw-text and --lazy-load are deprecated
603
+ if getattr(state["args"], "raw_text", False):
604
+ state["args"].dataset_impl = "raw"
605
+ elif getattr(state["args"], "lazy_load", False):
606
+ state["args"].dataset_impl = "lazy"
607
+ # epochs start at 1
608
+ if state["extra_state"]["train_iterator"] is not None:
609
+ state["extra_state"]["train_iterator"]["epoch"] = max(
610
+ state["extra_state"]["train_iterator"].get("epoch", 1), 1
611
+ )
612
+ # --remove-bpe ==> --postprocess
613
+ if hasattr(state["args"], "remove_bpe"):
614
+ state["args"].post_process = state["args"].remove_bpe
615
+ # --min-lr ==> --stop-min-lr
616
+ if hasattr(state["args"], "min_lr"):
617
+ state["args"].stop_min_lr = state["args"].min_lr
618
+ del state["args"].min_lr
619
+ # binary_cross_entropy / kd_binary_cross_entropy => wav2vec criterion
620
+ if (
621
+ hasattr(state["args"], "criterion")
622
+ and state["args"].criterion in [
623
+ "binary_cross_entropy",
624
+ "kd_binary_cross_entropy",
625
+ ]
626
+ ):
627
+ state["args"].criterion = "wav2vec"
628
+ # remove log_keys if it's None (criteria will supply a default value of [])
629
+ if hasattr(state["args"], "log_keys") and state["args"].log_keys is None:
630
+ delattr(state["args"], "log_keys")
631
+ # speech_pretraining => audio pretraining
632
+ if (
633
+ hasattr(state["args"], "task")
634
+ and state["args"].task == "speech_pretraining"
635
+ ):
636
+ state["args"].task = "audio_pretraining"
637
+ # audio_cpc => wav2vec
638
+ if hasattr(state["args"], "arch") and state["args"].arch == "audio_cpc":
639
+ state["args"].arch = "wav2vec"
640
+ # convert legacy float learning rate to List[float]
641
+ if hasattr(state["args"], "lr") and isinstance(state["args"].lr, float):
642
+ state["args"].lr = [state["args"].lr]
643
+ # convert task data arg to a string instead of List[string]
644
+ if (
645
+ hasattr(state["args"], "data")
646
+ and isinstance(state["args"].data, list)
647
+ and len(state["args"].data) > 0
648
+ ):
649
+ state["args"].data = state["args"].data[0]
650
+ # remove keys in state["args"] related to teacher-student learning
651
+ for key in [
652
+ "static_teachers",
653
+ "static_teacher_weights",
654
+ "dynamic_teachers",
655
+ "dynamic_teacher_weights",
656
+ ]:
657
+ if key in state["args"]:
658
+ delattr(state["args"], key)
659
+
660
+ state["cfg"] = convert_namespace_to_omegaconf(state["args"])
661
+
662
+ if "cfg" in state and state["cfg"] is not None:
663
+ cfg = state["cfg"]
664
+ with open_dict(cfg):
665
+ # any upgrades for Hydra-based configs
666
+ if (
667
+ "task" in cfg
668
+ and "eval_wer_config" in cfg.task
669
+ and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
670
+ ):
671
+ cfg.task.eval_wer_config.print_alignment = "hard"
672
+ if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool):
673
+ cfg.generation.print_alignment = "hard" if cfg.generation.print_alignment else None
674
+ if (
675
+ "model" in cfg
676
+ and "w2v_args" in cfg.model
677
+ and cfg.model.w2v_args is not None
678
+ and (
679
+ hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args
680
+ )
681
+ and hasattr(cfg.model.w2v_args.task, "eval_wer_config")
682
+ and cfg.model.w2v_args.task.eval_wer_config is not None
683
+ and isinstance(
684
+ cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
685
+ )
686
+ ):
687
+ cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard"
688
+
689
+ return state
690
+
691
+
692
+ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
693
+ """Prune the given state_dict if desired for LayerDrop
694
+ (https://arxiv.org/abs/1909.11556).
695
+
696
+ Training with LayerDrop allows models to be robust to pruning at inference
697
+ time. This function prunes state_dict to allow smaller models to be loaded
698
+ from a larger model and re-maps the existing state_dict for this to occur.
699
+
700
+ It's called by functions that load models from checkpoints and does not
701
+ need to be called directly.
702
+ """
703
+ arch = None
704
+ if model_cfg is not None:
705
+ arch = (
706
+ model_cfg._name
707
+ if isinstance(model_cfg, DictConfig)
708
+ else getattr(model_cfg, "arch", None)
709
+ )
710
+
711
+ if not model_cfg or arch is None or arch == "ptt_transformer":
712
+ # args should not be none, but don't crash if it is.
713
+ return state_dict
714
+
715
+ encoder_layers_to_keep = getattr(model_cfg, "encoder_layers_to_keep", None)
716
+ decoder_layers_to_keep = getattr(model_cfg, "decoder_layers_to_keep", None)
717
+
718
+ if not encoder_layers_to_keep and not decoder_layers_to_keep:
719
+ return state_dict
720
+
721
+ # apply pruning
722
+ logger.info(
723
+ "Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop"
724
+ )
725
+
726
+ def create_pruning_pass(layers_to_keep, layer_name):
727
+ keep_layers = sorted(
728
+ int(layer_string) for layer_string in layers_to_keep.split(",")
729
+ )
730
+ mapping_dict = {}
731
+ for i in range(len(keep_layers)):
732
+ mapping_dict[str(keep_layers[i])] = str(i)
733
+
734
+ regex = re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name))
735
+ return {"substitution_regex": regex, "mapping_dict": mapping_dict}
736
+
737
+ pruning_passes = []
738
+ if encoder_layers_to_keep:
739
+ pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder"))
740
+ if decoder_layers_to_keep:
741
+ pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder"))
742
+
743
+ new_state_dict = {}
744
+ for layer_name in state_dict.keys():
745
+ match = re.search(r"\.layers\.(\d+)\.", layer_name)
746
+ # if layer has no number in it, it is a supporting layer, such as an
747
+ # embedding
748
+ if not match:
749
+ new_state_dict[layer_name] = state_dict[layer_name]
750
+ continue
751
+
752
+ # otherwise, layer should be pruned.
753
+ original_layer_number = match.group(1)
754
+ # figure out which mapping dict to replace from
755
+ for pruning_pass in pruning_passes:
756
+ if original_layer_number in pruning_pass["mapping_dict"] and pruning_pass[
757
+ "substitution_regex"
758
+ ].search(layer_name):
759
+ new_layer_number = pruning_pass["mapping_dict"][original_layer_number]
760
+ substitution_match = pruning_pass["substitution_regex"].search(
761
+ layer_name
762
+ )
763
+ new_state_key = (
764
+ layer_name[: substitution_match.start(1)]
765
+ + new_layer_number
766
+ + layer_name[substitution_match.end(1) :]
767
+ )
768
+ new_state_dict[new_state_key] = state_dict[layer_name]
769
+
770
+ # Since layers are now pruned, *_layers_to_keep are no longer needed.
771
+ # This is more of a "make it work" fix than a proper fix.
772
+ if isinstance(model_cfg, DictConfig):
773
+ context = open_dict(model_cfg)
774
+ else:
775
+ context = contextlib.ExitStack()
776
+ with context:
777
+ if hasattr(model_cfg, "encoder_layers_to_keep"):
778
+ model_cfg.encoder_layers_to_keep = None
779
+ if hasattr(model_cfg, "decoder_layers_to_keep"):
780
+ model_cfg.decoder_layers_to_keep = None
781
+
782
+ return new_state_dict
783
+
784
+
785
+ def load_pretrained_component_from_model(
786
+ component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str
787
+ ):
788
+ """
789
+ Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the
790
+ provided `component` object. If state_dict fails to load, there may be a
791
+ mismatch in the architecture of the corresponding `component` found in the
792
+ `checkpoint` file.
793
+ """
794
+ if not PathManager.exists(checkpoint):
795
+ raise IOError("Model file not found: {}".format(checkpoint))
796
+ state = load_checkpoint_to_cpu(checkpoint)
797
+ if isinstance(component, FairseqEncoder):
798
+ component_type = "encoder"
799
+ elif isinstance(component, FairseqDecoder):
800
+ component_type = "decoder"
801
+ else:
802
+ raise ValueError(
803
+ "component to load must be either a FairseqEncoder or "
804
+ "FairseqDecoder. Loading other component types is not supported."
805
+ )
806
+ component_state_dict = OrderedDict()
807
+ for key in state["model"].keys():
808
+ if key.startswith(component_type):
809
+ # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
810
+ component_subkey = key[len(component_type) + 1 :]
811
+ component_state_dict[component_subkey] = state["model"][key]
812
+ component.load_state_dict(component_state_dict, strict=True)
813
+ return component
814
+
815
+
816
+ def verify_checkpoint_directory(save_dir: str) -> None:
817
+ if not os.path.exists(save_dir):
818
+ os.makedirs(save_dir, exist_ok=True)
819
+ temp_file_path = os.path.join(save_dir, "dummy")
820
+ try:
821
+ with open(temp_file_path, "w"):
822
+ pass
823
+ except OSError as e:
824
+ logger.warning(
825
+ "Unable to access checkpoint save directory: {}".format(save_dir)
826
+ )
827
+ raise e
828
+ else:
829
+ os.remove(temp_file_path)
830
+
831
+
832
+ def load_ema_from_checkpoint(fpath):
833
+ """Loads exponential moving averaged (EMA) checkpoint from input and
834
+ returns a model with ema weights.
835
+
836
+ Args:
837
+ fpath: A string path of checkpoint to load from.
838
+
839
+ Returns:
840
+ A dict of string keys mapping to various values. The 'model' key
841
+ from the returned dict should correspond to an OrderedDict mapping
842
+ string parameter names to torch Tensors.
843
+ """
844
+ params_dict = collections.OrderedDict()
845
+ new_state = None
846
+
847
+ with PathManager.open(fpath, 'rb') as f:
848
+ new_state = torch.load(
849
+ f,
850
+ map_location=(
851
+ lambda s, _: torch.serialization.default_restore_location(s, 'cpu')
852
+ ),
853
+ )
854
+
855
+ # EMA model is stored in a separate "extra state"
856
+ model_params = new_state['extra_state']['ema']
857
+
858
+ for key in list(model_params.keys()):
859
+ p = model_params[key]
860
+ if isinstance(p, torch.HalfTensor):
861
+ p = p.float()
862
+ if key not in params_dict:
863
+ params_dict[key] = p.clone()
864
+ # NOTE: clone() is needed in case p is a shared parameter
865
+ else:
866
+ raise ValueError("Key {} is repeated in EMA model params.".format(key))
867
+
868
+ if len(params_dict) == 0:
869
+ raise ValueError(
870
+ f"Input checkpoint path '{fpath}' does not contain "
871
+ "ema model weights, is this model trained with EMA?"
872
+ )
873
+
874
+ new_state['model'] = params_dict
875
+ return new_state
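A typical way to consume the checkpoint helpers above is to restore a trained checkpoint (or its EMA weights) outside the training loop. The snippet below is a hedged usage sketch rather than part of the commit: the checkpoint path is a placeholder, the task named in the stored config must already be registered before loading, and EMA weights only exist if the model was trained with EMA enabled:

from utils.checkpoint_utils import load_model_ensemble, load_ema_from_checkpoint

ckpt = "checkpoints/checkpoint_best.pt"  # placeholder path

# Rebuilds the task/model from the stored cfg and loads the (unsharded) weights.
models, cfg = load_model_ensemble([ckpt])
model = models[0].eval()

# Optional: swap in the exponential-moving-average weights stored in extra_state["ema"].
ema_state = load_ema_from_checkpoint(ckpt)
model.load_state_dict(ema_state["model"], strict=False)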
utils/cider/pyciderevalcap/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
utils/cider/pyciderevalcap/cider/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
utils/cider/pyciderevalcap/cider/cider.py ADDED
@@ -0,0 +1,65 @@
1
+ # Filename: cider.py
2
+ #
3
+ #
4
+ # Description: Describes the class to compute the CIDEr
5
+ # (Consensus-Based Image Description Evaluation) Metric
6
+ # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
7
+ #
8
+ # Creation Date: Sun Feb 8 14:16:54 2015
9
+ #
10
+ # Authors: Ramakrishna Vedantam <vrama91@vt.edu> and
11
+ # Tsung-Yi Lin <tl483@cornell.edu>
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+ from .cider_scorer import CiderScorer
17
+
18
+
19
+ class Cider:
20
+ """
21
+ Main Class to compute the CIDEr metric
22
+
23
+ """
24
+ def __init__(self, n=4, df="corpus"):
25
+ """
26
+ Initialize the CIDEr scoring function
27
+ : param n (int): n-gram size
28
+ : param df (string): specifies where to get the IDF values from
29
+ takes values 'corpus', 'coco-train'
30
+ : return: None
31
+ """
32
+ # set cider to sum over 1 to 4-grams
33
+ self._n = n
34
+ self._df = df
35
+ self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)
36
+
37
+ def compute_score(self, gts, res):
38
+ """
39
+ Main function to compute CIDEr score
40
+ : param gts (dict) : {image:tokenized reference sentence}
41
+ : param res (dict) : {image:tokenized candidate sentence}
42
+ : return: cider (float) : computed CIDEr score for the corpus
43
+ """
44
+
45
+ # clear all the previous hypos and refs
46
+ self.cider_scorer.clear()
47
+
48
+ for res_id in res:
49
+
50
+ hypo = res_id['caption']
51
+ ref = gts[res_id['image_id']]
52
+
53
+ # Sanity check.
54
+ assert(type(hypo) is list)
55
+ assert(len(hypo) == 1)
56
+ assert(type(ref) is list)
57
+ assert(len(ref) > 0)
58
+ self.cider_scorer += (hypo[0], ref)
59
+
60
+ (score, scores) = self.cider_scorer.compute_score()
61
+
62
+ return score, scores
63
+
64
+ def method(self):
65
+ return "CIDEr"
utils/cider/pyciderevalcap/cider/cider_scorer.py ADDED
@@ -0,0 +1,207 @@
1
+ #!/usr/bin/env python
2
+ # Tsung-Yi Lin <tl483@cornell.edu>
3
+ # Ramakrishna Vedantam <vrama91@vt.edu>
4
+ from __future__ import absolute_import
5
+ from __future__ import division
6
+ from __future__ import print_function
7
+
8
+ import copy
9
+ import six
10
+ from six.moves import cPickle
11
+ from collections import defaultdict
12
+ import numpy as np
13
+ import math
14
+ import os
15
+
16
+ def precook(s, n=4, out=False):
17
+ """
18
+ Takes a string as input and returns an object that can be given to
19
+ either cook_refs or cook_test. This is optional: cook_refs and cook_test
20
+ can take string arguments as well.
21
+ :param s: string : sentence to be converted into ngrams
22
+ :param n: int : number of ngrams for which representation is calculated
23
+ :return: term frequency vector for occurring ngrams
24
+ """
25
+ words = s.split()
26
+ counts = defaultdict(int)
27
+ for k in range(1,n+1):
28
+ for i in range(len(words)-k+1):
29
+ ngram = tuple(words[i:i+k])
30
+ counts[ngram] += 1
31
+ return counts
32
+
33
+ def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
34
+ '''Takes a list of reference sentences for a single segment
35
+ and returns an object that encapsulates everything that BLEU
36
+ needs to know about them.
37
+ :param refs: list of string : reference sentences for some image
38
+ :param n: int : number of ngrams for which (ngram) representation is calculated
39
+ :return: result (list of dict)
40
+ '''
41
+ return [precook(ref, n) for ref in refs]
42
+
43
+ def cook_test(test, n=4):
44
+ '''Takes a test sentence and returns an object that
45
+ encapsulates everything that BLEU needs to know about it.
46
+ :param test: list of string : hypothesis sentence for some image
47
+ :param n: int : number of ngrams for which (ngram) representation is calculated
48
+ :return: result (dict)
49
+ '''
50
+ return precook(test, n, True)
51
+
52
+ class CiderScorer(object):
53
+ """CIDEr scorer.
54
+ """
55
+
56
+ def copy(self):
57
+ ''' copy the refs.'''
58
+ new = CiderScorer(n=self.n)
59
+ new.ctest = copy.copy(self.ctest)
60
+ new.crefs = copy.copy(self.crefs)
61
+ return new
62
+
63
+ def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0):
64
+ ''' singular instance '''
65
+ self.n = n
66
+ self.sigma = sigma
67
+ self.crefs = []
68
+ self.ctest = []
69
+ self.df_mode = df_mode
70
+ self.ref_len = None
71
+ if self.df_mode != "corpus":
72
+ pkl_file = cPickle.load(open(os.path.join('data', df_mode + '.p'),'rb'), **(dict(encoding='latin1') if six.PY3 else {}))
73
+ self.ref_len = np.log(float(pkl_file['ref_len']))
74
+ self.document_frequency = pkl_file['document_frequency']
75
+ self.cook_append(test, refs)
76
+
77
+ def clear(self):
78
+ self.crefs = []
79
+ self.ctest = []
80
+
81
+ def cook_append(self, test, refs):
82
+ '''called by constructor and __iadd__ to avoid creating new instances.'''
83
+
84
+ if refs is not None:
85
+ self.crefs.append(cook_refs(refs))
86
+ if test is not None:
87
+ self.ctest.append(cook_test(test)) ## N.B.: -1
88
+ else:
89
+ self.ctest.append(None) # lens of crefs and ctest have to match
90
+
91
+ def size(self):
92
+ assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
93
+ return len(self.crefs)
94
+
95
+ def __iadd__(self, other):
96
+ '''add an instance (e.g., from another sentence).'''
97
+
98
+ if type(other) is tuple:
99
+ ## avoid creating new CiderScorer instances
100
+ self.cook_append(other[0], other[1])
101
+ else:
102
+ self.ctest.extend(other.ctest)
103
+ self.crefs.extend(other.crefs)
104
+
105
+ return self
106
+ def compute_doc_freq(self):
107
+ '''
108
+ Compute the document frequency over the reference data.
+ This will be used to compute IDF (inverse document frequency) later.
+ The document frequency is stored on the object.
111
+ :return: None
112
+ '''
113
+ for refs in self.crefs:
114
+ # refs, k ref captions of one image
115
+ for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
116
+ self.document_frequency[ngram] += 1
117
+ # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
118
+
119
+ def compute_cider(self):
120
+ def counts2vec(cnts):
121
+ """
122
+ Maps n-gram counts to a vector of tf-idf weights.
+ Returns vec, an array of dictionaries mapping each n-gram to its tf-idf weight;
+ the n-th entry of the array holds the n-grams of length n+1.
125
+ :param cnts:
126
+ :return: vec (array of dict), norm (array of float), length (int)
127
+ """
128
+ vec = [defaultdict(float) for _ in range(self.n)]
129
+ length = 0
130
+ norm = [0.0 for _ in range(self.n)]
131
+ for (ngram,term_freq) in cnts.items():
132
+ # give word count 1 if it doesn't appear in reference corpus
133
+ df = np.log(max(1.0, self.document_frequency[ngram]))
134
+ # ngram index
135
+ n = len(ngram)-1
136
+ # tf (term_freq) * idf (precomputed idf) for n-grams
137
+ vec[n][ngram] = float(term_freq)*(self.ref_len - df)
138
+ # compute norm for the vector. the norm will be used for
139
+ # computing similarity
140
+ norm[n] += pow(vec[n][ngram], 2)
141
+
142
+ if n == 1:
143
+ length += term_freq
144
+ norm = [np.sqrt(n) for n in norm]
145
+ return vec, norm, length
146
+
147
+ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
148
+ '''
149
+ Compute the cosine similarity of two vectors.
150
+ :param vec_hyp: array of dictionary for vector corresponding to hypothesis
151
+ :param vec_ref: array of dictionary for vector corresponding to reference
152
+ :param norm_hyp: array of float for vector corresponding to hypothesis
153
+ :param norm_ref: array of float for vector corresponding to reference
154
+ :param length_hyp: int containing length of hypothesis
155
+ :param length_ref: int containing length of reference
156
+ :return: array of score for each n-grams cosine similarity
157
+ '''
158
+ delta = float(length_hyp - length_ref)
159
+ # measure cosine similarity
160
+ val = np.array([0.0 for _ in range(self.n)])
161
+ for n in range(self.n):
162
+ # ngram
163
+ for (ngram,count) in vec_hyp[n].items():
164
+ val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram]
165
+
166
+ if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
167
+ val[n] /= (norm_hyp[n]*norm_ref[n])
168
+
169
+ assert(not math.isnan(val[n]))
170
+ return val
171
+
172
+ # compute log reference length
173
+ if self.df_mode == "corpus":
174
+ self.ref_len = np.log(float(len(self.crefs)))
175
+
176
+ scores = []
177
+ for test, refs in zip(self.ctest, self.crefs):
178
+ # compute vector for test captions
179
+ vec, norm, length = counts2vec(test)
180
+ # compute vector for ref captions
181
+ score = np.array([0.0 for _ in range(self.n)])
182
+ for ref in refs:
183
+ vec_ref, norm_ref, length_ref = counts2vec(ref)
184
+ score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
185
+ # change by vrama91 - mean of ngram scores, instead of sum
186
+ score_avg = np.mean(score)
187
+ # divide by number of references
188
+ score_avg /= len(refs)
189
+ # multiply score by 10
190
+ score_avg *= 10.0
191
+ # append score of an image to the score list
192
+ scores.append(score_avg)
193
+ return scores
194
+
195
+ def compute_score(self, option=None, verbose=0):
196
+ # compute idf
197
+ if self.df_mode == "corpus":
198
+ self.document_frequency = defaultdict(float)
199
+ self.compute_doc_freq()
200
+ # assert to check document frequency
201
+ assert(len(self.ctest) >= max(self.document_frequency.values()))
202
+ # import json for now and write the corresponding files
203
+ # compute cider score
204
+ score = self.compute_cider()
205
+ # debug
206
+ # print score
207
+ return np.mean(np.array(score)), np.array(score)
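To make the n-gram bookkeeping above concrete, a small illustration of `precook` (assumes the repository root is on PYTHONPATH and that `six` and `numpy` are installed, since the module imports them):

```python
from utils.cider.pyciderevalcap.cider.cider_scorer import precook

# precook() turns a tokenized sentence into counts of its 1..4-grams;
# counts2vec() later converts these counts into tf-idf vectors.
counts = precook("a man rides a horse", n=4)
print(counts[("a",)])                      # unigram "a" occurs twice
print(counts[("a", "man")])                # bigram ("a", "man") occurs once
print(counts[("a", "man", "rides", "a")])  # one of the two 4-grams
```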
utils/cider/pyciderevalcap/ciderD/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
utils/cider/pyciderevalcap/ciderD/ciderD.py ADDED
@@ -0,0 +1,58 @@
1
+ # Filename: ciderD.py
2
+ #
3
+ # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric
4
+ # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
5
+ #
6
+ # Creation Date: Sun Feb 8 14:16:54 2015
7
+ #
8
+ # Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>
9
+ from __future__ import absolute_import
10
+ from __future__ import division
11
+ from __future__ import print_function
12
+
13
+ from .ciderD_scorer import CiderScorer
14
+ import pdb
15
+
16
+ class CiderD:
17
+ """
18
+ Main Class to compute the CIDEr metric
19
+
20
+ """
21
+ def __init__(self, n=4, sigma=6.0, df="corpus"):
22
+ # set cider to sum over 1 to 4-grams
23
+ self._n = n
24
+ # set the standard deviation parameter for gaussian penalty
25
+ self._sigma = sigma
26
+ # set where to compute document frequencies from
27
+ self._df = df
28
+ self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)
29
+
30
+ def compute_score(self, gts, res):
31
+ """
32
+ Main function to compute CIDEr score
33
+ :param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
34
+ ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence>
35
+ :return: cider (float) : computed CIDEr score for the corpus
36
+ """
37
+
38
+ # clear all the previous hypos and refs
39
+ tmp_cider_scorer = self.cider_scorer.copy_empty()
40
+ tmp_cider_scorer.clear()
41
+ for res_id in res:
42
+
43
+ hypo = res_id['caption']
44
+ ref = gts[res_id['image_id']]
45
+
46
+ # Sanity check.
47
+ assert(type(hypo) is list)
48
+ assert(len(hypo) == 1)
49
+ assert(type(ref) is list)
50
+ assert(len(ref) > 0)
51
+ tmp_cider_scorer += (hypo[0], ref)
52
+
53
+ (score, scores) = tmp_cider_scorer.compute_score()
54
+
55
+ return score, scores
56
+
57
+ def method(self):
58
+ return "CIDEr-D"
utils/cider/pyciderevalcap/ciderD/ciderD_scorer.py ADDED
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python
2
+ # Tsung-Yi Lin <tl483@cornell.edu>
3
+ # Ramakrishna Vedantam <vrama91@vt.edu>
4
+ from __future__ import absolute_import
5
+ from __future__ import division
6
+ from __future__ import print_function
7
+
8
+ import copy
9
+ from collections import defaultdict
10
+ import numpy as np
11
+ import pdb
12
+ import math
13
+ import six
14
+ from six.moves import cPickle
15
+ import os
16
+
17
+ def precook(s, n=4, out=False):
18
+ """
19
+ Takes a string as input and returns an object that can be given to
20
+ either cook_refs or cook_test. This is optional: cook_refs and cook_test
21
+ can take string arguments as well.
22
+ :param s: string : sentence to be converted into ngrams
23
+ :param n: int : number of ngrams for which representation is calculated
24
+ :return: term frequency vector for occurring ngrams
25
+ """
26
+ words = s.split()
27
+ counts = defaultdict(int)
28
+ for k in range(1,n+1):
29
+ for i in range(len(words)-k+1):
30
+ ngram = tuple(words[i:i+k])
31
+ counts[ngram] += 1
32
+ return counts
33
+
34
+ def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
35
+ '''Takes a list of reference sentences for a single segment
36
+ and returns an object that encapsulates everything that BLEU
37
+ needs to know about them.
38
+ :param refs: list of string : reference sentences for some image
39
+ :param n: int : number of ngrams for which (ngram) representation is calculated
40
+ :return: result (list of dict)
41
+ '''
42
+ return [precook(ref, n) for ref in refs]
43
+
44
+ def cook_test(test, n=4):
45
+ '''Takes a test sentence and returns an object that
46
+ encapsulates everything that BLEU needs to know about it.
47
+ :param test: list of string : hypothesis sentence for some image
48
+ :param n: int : number of ngrams for which (ngram) representation is calculated
49
+ :return: result (dict)
50
+ '''
51
+ return precook(test, n, True)
52
+
53
+ class CiderScorer(object):
54
+ """CIDEr scorer.
55
+ """
56
+
57
+ def copy(self):
58
+ ''' copy the refs.'''
59
+ new = CiderScorer(n=self.n)
60
+ new.ctest = copy.copy(self.ctest)
61
+ new.crefs = copy.copy(self.crefs)
62
+ return new
63
+
64
+ def copy_empty(self):
65
+ new = CiderScorer(df_mode="corpus", n=self.n, sigma=self.sigma)
66
+ new.df_mode = self.df_mode
67
+ new.ref_len = self.ref_len
68
+ new.document_frequency = self.document_frequency
69
+ return new
70
+
71
+ def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0):
72
+ ''' singular instance '''
73
+ self.n = n
74
+ self.sigma = sigma
75
+ self.crefs = []
76
+ self.ctest = []
77
+ self.df_mode = df_mode
78
+ self.ref_len = None
79
+ if self.df_mode != "corpus":
80
+ pkl_file = cPickle.load(open(df_mode,'rb'), **(dict(encoding='latin1') if six.PY3 else {}))
81
+ self.ref_len = np.log(float(pkl_file['ref_len']))
82
+ self.document_frequency = pkl_file['document_frequency']
83
+ else:
84
+ self.document_frequency = None
85
+ self.cook_append(test, refs)
86
+
87
+ def clear(self):
88
+ self.crefs = []
89
+ self.ctest = []
90
+
91
+ def cook_append(self, test, refs):
92
+ '''called by constructor and __iadd__ to avoid creating new instances.'''
93
+
94
+ if refs is not None:
95
+ self.crefs.append(cook_refs(refs))
96
+ if test is not None:
97
+ self.ctest.append(cook_test(test)) ## N.B.: -1
98
+ else:
99
+ self.ctest.append(None) # lens of crefs and ctest have to match
100
+
101
+ def size(self):
102
+ assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
103
+ return len(self.crefs)
104
+
105
+ def __iadd__(self, other):
106
+ '''add an instance (e.g., from another sentence).'''
107
+
108
+ if type(other) is tuple:
109
+ ## avoid creating new CiderScorer instances
110
+ self.cook_append(other[0], other[1])
111
+ else:
112
+ self.ctest.extend(other.ctest)
113
+ self.crefs.extend(other.crefs)
114
+
115
+ return self
116
+ def compute_doc_freq(self):
117
+ '''
118
+ Compute the document frequency over the reference data.
+ This will be used to compute IDF (inverse document frequency) later.
+ The document frequency is stored on the object.
121
+ :return: None
122
+ '''
123
+ for refs in self.crefs:
124
+ # refs, k ref captions of one image
125
+ for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
126
+ self.document_frequency[ngram] += 1
127
+ # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
128
+
129
+ def compute_cider(self):
130
+ def counts2vec(cnts):
131
+ """
132
+ Maps n-gram counts to a vector of tf-idf weights.
+ Returns vec, an array of dictionaries mapping each n-gram to its tf-idf weight;
+ the n-th entry of the array holds the n-grams of length n+1.
135
+ :param cnts:
136
+ :return: vec (array of dict), norm (array of float), length (int)
137
+ """
138
+ vec = [defaultdict(float) for _ in range(self.n)]
139
+ length = 0
140
+ norm = [0.0 for _ in range(self.n)]
141
+ for (ngram,term_freq) in cnts.items():
142
+ # give word count 1 if it doesn't appear in reference corpus
143
+ df = np.log(max(1.0, self.document_frequency[ngram]))
144
+ # ngram index
145
+ n = len(ngram)-1
146
+ # tf (term_freq) * idf (precomputed idf) for n-grams
147
+ vec[n][ngram] = float(term_freq)*(self.ref_len - df)
148
+ # compute norm for the vector. the norm will be used for computing similarity
149
+ norm[n] += pow(vec[n][ngram], 2)
150
+
151
+ if n == 1:
152
+ length += term_freq
153
+ norm = [np.sqrt(n) for n in norm]
154
+ return vec, norm, length
155
+
156
+ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
157
+ '''
158
+ Compute the cosine similarity of two vectors.
159
+ :param vec_hyp: array of dictionary for vector corresponding to hypothesis
160
+ :param vec_ref: array of dictionary for vector corresponding to reference
161
+ :param norm_hyp: array of float for vector corresponding to hypothesis
162
+ :param norm_ref: array of float for vector corresponding to reference
163
+ :param length_hyp: int containing length of hypothesis
164
+ :param length_ref: int containing length of reference
165
+ :return: array of score for each n-grams cosine similarity
166
+ '''
167
+ delta = float(length_hyp - length_ref)
168
+ # measure cosine similarity
169
+ val = np.array([0.0 for _ in range(self.n)])
170
+ for n in range(self.n):
171
+ # ngram
172
+ for (ngram,count) in vec_hyp[n].items():
173
+ # vrama91 : added clipping
174
+ val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
175
+
176
+ if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
177
+ val[n] /= (norm_hyp[n]*norm_ref[n])
178
+
179
+ assert(not math.isnan(val[n]))
180
+ # vrama91: added a length based gaussian penalty
181
+ val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
182
+ return val
183
+
184
+ # compute log reference length
185
+ if self.df_mode == "corpus":
186
+ self.ref_len = np.log(float(len(self.crefs)))
187
+ #elif self.df_mode == "coco-val-df":
188
+ # if coco option selected, use length of coco-val set
189
+ # self.ref_len = np.log(float(40504))
190
+
191
+ scores = []
192
+ for test, refs in zip(self.ctest, self.crefs):
193
+ # compute vector for test captions
194
+ vec, norm, length = counts2vec(test)
195
+ # compute vector for ref captions
196
+ score = np.array([0.0 for _ in range(self.n)])
197
+ for ref in refs:
198
+ vec_ref, norm_ref, length_ref = counts2vec(ref)
199
+ score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
200
+ # change by vrama91 - mean of ngram scores, instead of sum
201
+ score_avg = np.mean(score)
202
+ # divide by number of references
203
+ score_avg /= len(refs)
204
+ # multiply score by 10
205
+ score_avg *= 10.0
206
+ # append score of an image to the score list
207
+ scores.append(score_avg)
208
+ return scores
209
+
210
+ def compute_score(self, option=None, verbose=0):
211
+ # compute idf
212
+ if self.df_mode == "corpus":
213
+ self.document_frequency = defaultdict(float)
214
+ self.compute_doc_freq()
215
+ # assert to check document frequency
216
+ assert(len(self.ctest) >= max(self.document_frequency.values()))
217
+ # import json for now and write the corresponding files
218
+ # compute cider score
219
+ score = self.compute_cider()
220
+ # debug
221
+ # print score
222
+ return np.mean(np.array(score)), np.array(score)
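The two changes relative to the plain CIDEr scorer above are the clipped term counts (`min(vec_hyp, vec_ref)`) and the Gaussian penalty on the hypothesis/reference length gap. A small numeric illustration of that penalty factor:

```python
import numpy as np

# val[n] *= exp(-delta**2 / (2 * sigma**2)), with sigma defaulting to 6.0,
# so the similarity decays smoothly as the length difference |delta| grows.
sigma = 6.0
for delta in (0, 3, 6, 12):
    print(delta, round(float(np.exp(-(delta ** 2) / (2 * sigma ** 2))), 4))
```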
utils/eval_utils.py ADDED
@@ -0,0 +1,349 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import string
7
+ import math
8
+ import json
9
+ from itertools import chain
10
+ import os
11
+
12
+ import torch
13
+ import torch.distributed as dist
14
+
15
+ from data import data_utils
16
+ from tasks.nlg_tasks.gigaword import fix_tokenization
17
+
18
+
19
+ def get_symbols_to_strip_from_output(generator):
20
+ if hasattr(generator, "symbols_to_strip_from_output"):
21
+ return generator.symbols_to_strip_from_output
22
+ else:
23
+ return {generator.bos, generator.eos}
24
+
25
+
26
+ def decode_fn(x, tgt_dict, bpe, generator, tokenizer=None):
27
+ x = tgt_dict.string(x.int().cpu(), extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator))
28
+ if bpe is not None:
29
+ x = bpe.decode(x)
30
+ if tokenizer is not None:
31
+ x = tokenizer.decode(x)
32
+ return x
33
+
34
+
35
+ def eval_caption(task, generator, models, sample, **kwargs):
36
+ transtab = str.maketrans({key: None for key in string.punctuation})
37
+ hypos = task.inference_step(generator, models, sample)
38
+ results = []
39
+ for i, sample_id in enumerate(sample["id"].tolist()):
40
+ detok_hypo_str = decode_fn(hypos[i][0]["tokens"], task.tgt_dict, task.bpe, generator)
41
+ results.append({"image_id": str(sample_id), "caption": detok_hypo_str.translate(transtab).strip()})
42
+ return results, None
43
+
44
+
45
+ def eval_vqa_gen(task, generator, models, sample, **kwargs):
46
+ if kwargs['beam_search_vqa_eval']:
47
+ hypos = task.inference_step(generator, models, sample, prefix_tokens=sample['prefix_tokens'])
48
+ results = []
49
+ for i, sample_id in enumerate(sample["id"].tolist()):
50
+ prefix_len = sample['prefix_tokens'][i].ne(1).sum().item()
51
+ detok_hypo_str = decode_fn(hypos[i][0]["tokens"][prefix_len:], task.tgt_dict, task.bpe, generator)
52
+ results.append({"question_id": int(sample_id), "answer": detok_hypo_str.strip()})
53
+ scores = [ref_dict.get(result['answer'], 0) for ref_dict, result in zip(sample['ref_dict'], results)]
54
+ return results, scores
55
+
56
+ encoder_out = models[0].encoder(
57
+ sample["net_input"]["src_tokens"],
58
+ src_lengths=sample["net_input"]["src_lengths"],
59
+ patch_images=sample["net_input"]["patch_images"],
60
+ patch_masks=sample["net_input"]["patch_masks"]
61
+ )
62
+ device = sample["net_input"]["src_tokens"].device
63
+ eos_item = torch.tensor([task.src_dict.eos()])
64
+ pad = task.src_dict.pad()
65
+ valid_result = []
66
+ for valid_answers, valid_constraint_masks in zip(task.valid_answers_list, task.valid_constraint_masks_list):
67
+ valid_size = len(valid_answers)
68
+ valid_tgt_items = [
69
+ torch.cat([torch.tensor(decoder_prompt[1:]), valid_answer, eos_item])
70
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
71
+ ]
72
+ valid_prev_items = [
73
+ torch.cat([torch.tensor(decoder_prompt), valid_answer])
74
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
75
+ ]
76
+ valid_constraint_mask_items = [
77
+ torch.cat(
78
+ [torch.zeros(len(decoder_prompt) - 1, valid_constraint_mask.size(1)).bool(), valid_constraint_mask],
79
+ dim=0
80
+ )
81
+ for decoder_prompt in sample["decoder_prompts"] for valid_constraint_mask in valid_constraint_masks
82
+ ]
83
+ valid_tgt = data_utils.collate_tokens(valid_tgt_items, pad_idx=pad).to(device)
84
+ valid_prev_output = data_utils.collate_tokens(valid_prev_items, pad_idx=pad).to(device)
85
+ valid_constraint_masks = data_utils.collate_tokens(valid_constraint_mask_items, pad_idx=pad).to(device)
86
+
87
+ new_encoder_out = {}
88
+ new_encoder_out["encoder_out"] = [
89
+ encoder_out["encoder_out"][0].repeat_interleave(valid_size, dim=1)
90
+ ]
91
+ new_encoder_out["encoder_padding_mask"] = [
92
+ encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_size, dim=0)
93
+ ]
94
+ new_encoder_out["position_embeddings"] = [
95
+ encoder_out["position_embeddings"][0].repeat_interleave(valid_size, dim=0)
96
+ ]
97
+
98
+ decoder_out = models[0].decoder(valid_prev_output, encoder_out=new_encoder_out)
99
+ decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
100
+ lprobs = models[0].get_normalized_probs(decoder_out, log_probs=True)
101
+ scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
102
+ scores = scores.masked_fill(valid_tgt.eq(task.tgt_dict.pad()), 0)
103
+ scores = scores.masked_fill((~valid_constraint_masks).all(2), 0)
104
+ scores = scores.sum(1)
105
+ scores = scores.view(-1, valid_size)
106
+ valid_result.append(scores)
107
+ valid_result = torch.cat(valid_result, dim=-1)
108
+ predicts = valid_result.argmax(1).tolist()
109
+ hyps = [task.index2ans[predict_index] for predict_index in predicts]
110
+ results = [{"question_id": int(id), "answer": hyp} for id, hyp in zip(sample["id"].tolist(), hyps)]
111
+ scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
112
+ return results, scores
113
+
114
+
115
+ def eval_refcoco(task, generator, models, sample, **kwargs):
116
+ def _calculate_ap_score(hyps, refs, thresh=0.5):
117
+ interacts = torch.cat(
118
+ [torch.where(hyps[:, :2] < refs[:, :2], refs[:, :2], hyps[:, :2]),
119
+ torch.where(hyps[:, 2:] < refs[:, 2:], hyps[:, 2:], refs[:, 2:])],
120
+ dim=1
121
+ )
122
+ area_predictions = (hyps[:, 2] - hyps[:, 0]) * (hyps[:, 3] - hyps[:, 1])
123
+ area_targets = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1])
124
+ interacts_w = interacts[:, 2] - interacts[:, 0]
125
+ interacts_h = interacts[:, 3] - interacts[:, 1]
126
+ area_interacts = interacts_w * interacts_h
127
+ ious = area_interacts / (area_predictions + area_targets - area_interacts + 1e-6)
128
+ return ((ious >= thresh) & (interacts_w > 0) & (interacts_h > 0)).float()
129
+
130
+ gen_out = task.inference_step(generator, models, sample)
131
+ hyps = []
132
+ for i in range(len(gen_out)):
133
+ hyps.append(gen_out[i][0]["tokens"][:-1] - len(task.src_dict) + task.cfg.num_bins)
134
+ hyps = torch.stack(hyps, dim=0)
135
+ hyps = hyps / (task.cfg.num_bins - 1) * task.cfg.max_image_size
136
+ hyps[:, ::2] /= sample['w_resize_ratios'].unsqueeze(1)
137
+ hyps[:, 1::2] /= sample['h_resize_ratios'].unsqueeze(1)
138
+
139
+ results = [
140
+ {"uniq_id": sample_id,
141
+ "box": [hyps[i][0].item(), hyps[i][1].item(), hyps[i][2].item(), hyps[i][3].item()]}
142
+ for i, sample_id in enumerate(sample["id"].tolist())
143
+ ]
144
+ scores = _calculate_ap_score(hyps, sample['region_coords'].float())
145
+ return results, scores
146
+
147
+
148
+ def eval_snli_ve(task, generator, models, sample, **kwargs):
149
+ encoder_out = models[0].encoder(
150
+ sample["net_input"]["src_tokens"],
151
+ src_lengths=sample["net_input"]["src_lengths"],
152
+ patch_images=sample["net_input"]["patch_images"],
153
+ patch_masks=sample["net_input"]["patch_masks"]
154
+ )
155
+ device = sample["net_input"]["src_tokens"].device
156
+ eos_item = torch.tensor([task.src_dict.eos()])
157
+ pad = task.src_dict.pad()
158
+ valid_result = []
159
+ for valid_answers, valid_constraint_masks in zip(task.valid_answers_list, task.valid_constraint_masks_list):
160
+ valid_size = len(valid_answers)
161
+ valid_tgt_items = [
162
+ torch.cat([torch.tensor(decoder_prompt[1:]), valid_answer, eos_item])
163
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
164
+ ]
165
+ valid_prev_items = [
166
+ torch.cat([torch.tensor(decoder_prompt), valid_answer])
167
+ for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
168
+ ]
169
+ valid_constraint_mask_items = [
170
+ torch.cat(
171
+ [torch.zeros(len(decoder_prompt) - 1, valid_constraint_mask.size(1)).bool(), valid_constraint_mask],
172
+ dim=0
173
+ )
174
+ for decoder_prompt in sample["decoder_prompts"] for valid_constraint_mask in valid_constraint_masks
175
+ ]
176
+ valid_tgt = data_utils.collate_tokens(valid_tgt_items, pad_idx=pad).to(device)
177
+ valid_prev_output = data_utils.collate_tokens(valid_prev_items, pad_idx=pad).to(device)
178
+ valid_constraint_masks = data_utils.collate_tokens(valid_constraint_mask_items, pad_idx=pad).to(device)
179
+
180
+ new_encoder_out = {}
181
+ new_encoder_out["encoder_out"] = [
182
+ encoder_out["encoder_out"][0].repeat_interleave(valid_size, dim=1)
183
+ ]
184
+ new_encoder_out["encoder_padding_mask"] = [
185
+ encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_size, dim=0)
186
+ ]
187
+ new_encoder_out["position_embeddings"] = [
188
+ encoder_out["position_embeddings"][0].repeat_interleave(valid_size, dim=0)
189
+ ]
190
+
191
+ decoder_out = models[0].decoder(valid_prev_output, encoder_out=new_encoder_out)
192
+ decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
193
+ lprobs = models[0].get_normalized_probs(decoder_out, log_probs=True)
194
+ scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
195
+ scores = scores.masked_fill(valid_tgt.eq(task.tgt_dict.pad()), 0)
196
+ scores = scores.masked_fill((~valid_constraint_masks).all(2), 0)
197
+ scores = scores.sum(1)
198
+ scores = scores.view(-1, valid_size)
199
+ valid_result.append(scores)
200
+ valid_result = torch.cat(valid_result, dim=-1)
201
+ predicts = valid_result.argmax(1).tolist()
202
+ hyps = [task.index2ans[predict_index] for predict_index in predicts]
203
+ results = [{"uniq_id": id, "answer": hyp} for id, hyp in zip(sample["id"].tolist(), hyps)]
204
+ scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
205
+ return results, scores
206
+
207
+
208
+ def eval_image_gen(task, generator, models, sample, **kwargs):
209
+ hypos, _ = task.inference_image(generator, sample, models)
210
+ tokens = sample['net_input']['src_tokens'][0].view(-1).tolist()
211
+ caption = task.bpe.decode(task.tgt_dict.string([token for token in tokens if token >= 4]))[
212
+ 38:].replace('/', '')
213
+
214
+ text_similarity_score, indices = task.compute_text_similarity(hypos, caption,
215
+ sample['net_input']['src_tokens'].device)
216
+ results = []
217
+ for i, indice in enumerate(indices):
218
+ results.append({"sample_id": str(sample["id"][0]), "score": text_similarity_score[i], "image": hypos[indice]})
219
+ scores = [max(text_similarity_score).item()]
220
+ sorted_hyps = [hypos[indice] for indice in indices]
221
+ # dump results
222
+ if task.cfg.gen_images_path:
223
+ caption_tokens = sample['net_input']['src_tokens'][0].view(-1).tolist()
224
+ caption = task.bpe.decode(task.tgt_dict.string([token for token in caption_tokens if token >= 4]))[
225
+ 38:].replace('/', '')
226
+ task.dump_images(sorted_hyps, text=caption, path=os.path.join(task.cfg.gen_images_path, 'all_results'))
227
+ task.dump_images(sorted_hyps, text=caption, path=os.path.join(task.cfg.gen_images_path, 'top1'), topk=1)
228
+
229
+ return results, scores
230
+
231
+
232
+ def eval_glue(task, generator, models, sample, **kwargs):
233
+ net_output = models[0](**sample["net_input"])
234
+ net_output[0].masked_fill_(~sample["constraint_masks"], -math.inf)
235
+ last_token_ids = sample["net_input"]["prev_output_tokens"].ne(task.src_dict.pad()).sum(1, keepdim=True) - 1
236
+ logits = net_output[0].gather(1, last_token_ids.unsqueeze(2).expand(-1, -1, net_output[0].size(2)))
237
+ logits = logits.squeeze(1)
238
+ predicts = logits.argmax(1).tolist()
239
+ hyps = [task.bpe.decode(task.src_dict[predict]).strip() for predict in predicts]
240
+ results = [{"hyp": hyp, "ref": ref_dict.keys()[0]} for hyp, ref_dict in zip(hyps, sample['ref_dict'])]
241
+ return results, None
242
+
243
+
244
+ def eval_gigaword(task, generator, models, sample, **kwargs):
245
+ gen_out = task.inference_step(generator, models, sample)
246
+ hyps, refs = [], []
247
+ results = []
248
+ for i in range(len(gen_out)):
249
+ hyp = decode_fn(gen_out[i][0]["tokens"], task.tgt_dict, task.bpe, generator).lower().strip()
250
+ hyp = fix_tokenization(hyp).replace('1', '#')
251
+ ref = sample['target_strs'][i]
252
+ hyps.append(hyp)
253
+ refs.append(ref)
254
+ results.append({"hyp": hyp, "ref": ref})
255
+ return results, None
256
+
257
+
258
+ def eval_image_classify(task, generator, models, sample, **kwargs):
259
+ batch_size = sample["net_input"]["src_tokens"].size(0)
260
+ encoder_out = models[0].encoder(
261
+ sample["net_input"]["src_tokens"],
262
+ src_lengths=sample["net_input"]["src_lengths"],
263
+ patch_images=sample["net_input"]["patch_images"],
264
+ patch_masks=sample["net_input"]["patch_masks"]
265
+ )
266
+ device = sample["net_input"]["src_tokens"].device
267
+ valid_result = []
268
+ for valid_tgt, valid_prev_output, valid_constraint_masks in zip(task.valid_tgt_list,
269
+ task.valid_prev_output_list,
270
+ task.valid_constraint_masks_list):
271
+ valid_tgt_size = valid_tgt.size(0)
272
+ valid_tgt = valid_tgt.repeat(batch_size, 1).to(device)
273
+ valid_prev_output = valid_prev_output.repeat(batch_size, 1).to(device)
274
+ valid_constraint_masks = valid_constraint_masks.repeat(batch_size, 1, 1).to(device)
275
+ new_encoder_out = {}
276
+ new_encoder_out["encoder_out"] = [
277
+ encoder_out["encoder_out"][0].repeat_interleave(valid_tgt_size, dim=1)
278
+ ]
279
+ new_encoder_out["encoder_padding_mask"] = [
280
+ encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_tgt_size, dim=0)
281
+ ]
282
+ new_encoder_out["position_embeddings"] = [
283
+ encoder_out["position_embeddings"][0].repeat_interleave(valid_tgt_size, dim=0)
284
+ ]
285
+
286
+ decoder_out = models[0].decoder(valid_prev_output, encoder_out=new_encoder_out)
287
+ decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
288
+ lprobs = models[0].get_normalized_probs(decoder_out, log_probs=True)
289
+ scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
290
+ scores = scores.masked_fill(valid_tgt.eq(task.tgt_dict.pad()), 0)
291
+ scores = scores.sum(1)
292
+ scores = scores.view(-1, valid_tgt_size)
293
+ valid_result.append(scores)
294
+ valid_result = torch.cat(valid_result, dim=-1)
295
+ predicts = valid_result.argmax(1).tolist()
296
+ hyps = [task.index2ans[predict_index] for predict_index in predicts]
297
+ scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
298
+ results = [{"uniq_id": id, "answer": hyp} for id, hyp in zip(sample["id"].tolist(), hyps)]
299
+ return results, scores
300
+
301
+
302
+ def eval_step(task, generator, models, sample, **kwargs):
303
+ if task.cfg._name == 'caption':
304
+ return eval_caption(task, generator, models, sample, **kwargs)
305
+ elif task.cfg._name == 'vqa_gen':
306
+ return eval_vqa_gen(task, generator, models, sample, **kwargs)
307
+ elif task.cfg._name == 'refcoco':
308
+ return eval_refcoco(task, generator, models, sample, **kwargs)
309
+ elif task.cfg._name == 'snli_ve':
310
+ return eval_snli_ve(task, generator, models, sample, **kwargs)
311
+ elif task.cfg._name == 'image_gen':
312
+ return eval_image_gen(task, generator, models, sample, **kwargs)
313
+ elif task.cfg._name in {'cola', 'mnli', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2'}:
314
+ return eval_glue(task, generator, models, sample, **kwargs)
315
+ elif task.cfg._name == 'gigaword':
316
+ return eval_gigaword(task, generator, models, sample, **kwargs)
317
+ elif task.cfg._name == 'image_classify':
318
+ return eval_image_classify(task, generator, models, sample, **kwargs)
319
+ else:
320
+ raise NotImplementedError
321
+
322
+
323
+ def merge_results(task, cfg, logger, score_cnt, score_sum, results):
324
+ if task.cfg._name == 'image_gen':
325
+ if cfg.distributed_training.distributed_world_size > 1:
326
+ dist.all_reduce(score_sum.data)
327
+ dist.all_reduce(score_cnt.data)
328
+ if score_cnt.item() > 0:
329
+ logger.info("score_sum: {}, score_cnt: {}, score: {}".format(
330
+ score_sum, score_cnt, round(score_sum.item() / score_cnt.item(), 4)
331
+ ))
332
+ else:
333
+ gather_results = None
334
+ if cfg.distributed_training.distributed_world_size > 1:
335
+ gather_results = [None for _ in range(dist.get_world_size())]
336
+ dist.all_gather_object(gather_results, results)
337
+ dist.all_reduce(score_sum.data)
338
+ dist.all_reduce(score_cnt.data)
339
+ if score_cnt.item() > 0:
340
+ logger.info("score_sum: {}, score_cnt: {}, score: {}".format(
341
+ score_sum, score_cnt, round(score_sum.item() / score_cnt.item(), 4)
342
+ ))
343
+
344
+ if cfg.distributed_training.distributed_world_size == 1 or dist.get_rank() == 0:
345
+ os.makedirs(cfg.common_eval.results_path, exist_ok=True)
346
+ output_path = os.path.join(cfg.common_eval.results_path, "{}_predict.json".format(cfg.dataset.gen_subset))
347
+ gather_results = list(chain(*gather_results)) if gather_results is not None else results
348
+ with open(output_path, 'w') as fw:
349
+ json.dump(gather_results, fw)
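For readers unfamiliar with the Acc@0.5 metric used by `eval_refcoco`, a standalone re-derivation of the IoU computation in `_calculate_ap_score` (a sketch, not the code above; boxes are (x0, y0, x1, y1) in pixels):

```python
import torch

def box_iou(hyps: torch.Tensor, refs: torch.Tensor) -> torch.Tensor:
    # Intersection corners: element-wise max of top-lefts, min of bottom-rights.
    lt = torch.max(hyps[:, :2], refs[:, :2])
    rb = torch.min(hyps[:, 2:], refs[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    area_h = (hyps[:, 2] - hyps[:, 0]) * (hyps[:, 3] - hyps[:, 1])
    area_r = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1])
    return inter / (area_h + area_r - inter + 1e-6)

hyps = torch.tensor([[10.0, 10.0, 60.0, 60.0]])
refs = torch.tensor([[15.0, 15.0, 60.0, 60.0]])
iou = box_iou(hyps, refs)
print(iou, (iou >= 0.5).float())  # the >= 0.5 threshold gives Acc@0.5
```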
utils/transforms.py ADDED
@@ -0,0 +1,513 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ import random
7
+
8
+ import torch
9
+ import torchvision.transforms as T
10
+ import torchvision.transforms.functional as F
11
+ import numpy as np
12
+ from PIL import Image
13
+
14
+
15
+ def crop(image, target, region, delete=True):
16
+ cropped_image = F.crop(image, *region)
17
+
18
+ target = target.copy()
19
+ i, j, h, w = region
20
+
21
+ # should we do something wrt the original size?
22
+ target["size"] = torch.tensor([h, w])
23
+
24
+ fields = ["labels", "area"]
25
+
26
+ if "boxes" in target:
27
+ boxes = target["boxes"]
28
+ max_size = torch.as_tensor([w, h], dtype=torch.float32)
29
+ cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
30
+ cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
31
+ cropped_boxes = cropped_boxes.clamp(min=0)
32
+ area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
33
+ target["boxes"] = cropped_boxes.reshape(-1, 4)
34
+ target["area"] = area
35
+ fields.append("boxes")
36
+
37
+ if "polygons" in target:
38
+ polygons = target["polygons"]
39
+ num_polygons = polygons.shape[0]
40
+ max_size = torch.as_tensor([w, h], dtype=torch.float32)
41
+ start_coord = torch.cat([torch.tensor([j, i], dtype=torch.float32)
42
+ for _ in range(polygons.shape[1] // 2)], dim=0)
43
+ cropped_boxes = polygons - start_coord
44
+ cropped_boxes = torch.min(cropped_boxes.reshape(num_polygons, -1, 2), max_size)
45
+ cropped_boxes = cropped_boxes.clamp(min=0)
46
+ target["polygons"] = cropped_boxes.reshape(num_polygons, -1)
47
+ fields.append("polygons")
48
+
49
+ if "masks" in target:
50
+ # FIXME should we update the area here if there are no boxes?
51
+ target['masks'] = target['masks'][:, i:i + h, j:j + w]
52
+ fields.append("masks")
53
+
54
+ # remove elements whose boxes or masks have zero area
55
+ if delete and ("boxes" in target or "masks" in target):
56
+ # favor boxes selection when defining which elements to keep
57
+ # this is compatible with previous implementation
58
+ if "boxes" in target:
59
+ cropped_boxes = target['boxes'].reshape(-1, 2, 2)
60
+ keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
61
+ else:
62
+ keep = target['masks'].flatten(1).any(1)
63
+
64
+ for field in fields:
65
+ target[field] = target[field][keep.tolist()]
66
+
67
+ return cropped_image, target
68
+
69
+
70
+ def hflip(image, target):
71
+ flipped_image = F.hflip(image)
72
+
73
+ w, h = image.size
74
+
75
+ target = target.copy()
76
+ if "boxes" in target:
77
+ boxes = target["boxes"]
78
+ boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
79
+ target["boxes"] = boxes
80
+
81
+ if "polygons" in target:
82
+ polygons = target["polygons"]
83
+ num_polygons = polygons.shape[0]
84
+ polygons = polygons.reshape(num_polygons, -1, 2) * torch.as_tensor([-1, 1]) + torch.as_tensor([w, 0])
85
+ target["polygons"] = polygons
86
+
87
+ if "masks" in target:
88
+ target['masks'] = target['masks'].flip(-1)
89
+
90
+ return flipped_image, target
91
+
92
+
93
+ def resize(image, target, size, max_size=None):
94
+ # size can be min_size (scalar) or (w, h) tuple
95
+
96
+ def get_size_with_aspect_ratio(image_size, size, max_size=None):
97
+ w, h = image_size
98
+
99
+ if (w <= h and w == size) or (h <= w and h == size):
100
+ if max_size is not None:
101
+ max_size = int(max_size)
102
+ h = min(h, max_size)
103
+ w = min(w, max_size)
104
+ return (h, w)
105
+
106
+ if w < h:
107
+ ow = size
108
+ oh = int(size * h / w)
109
+ else:
110
+ oh = size
111
+ ow = int(size * w / h)
112
+
113
+ if max_size is not None:
114
+ max_size = int(max_size)
115
+ oh = min(oh, max_size)
116
+ ow = min(ow, max_size)
117
+
118
+ return (oh, ow)
119
+
120
+ def get_size(image_size, size, max_size=None):
121
+ if isinstance(size, (list, tuple)):
122
+ return size[::-1]
123
+ else:
124
+ return get_size_with_aspect_ratio(image_size, size, max_size)
125
+
126
+ size = get_size(image.size, size, max_size)
127
+ rescaled_image = F.resize(image, size, interpolation=Image.BICUBIC)
128
+
129
+ if target is None:
130
+ return rescaled_image
131
+
132
+ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
133
+ ratio_width, ratio_height = ratios
134
+
135
+ target = target.copy()
136
+ if "boxes" in target:
137
+ boxes = target["boxes"]
138
+ scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
139
+ target["boxes"] = scaled_boxes
140
+
141
+ if "polygons" in target:
142
+ polygons = target["polygons"]
143
+ scaled_ratio = torch.cat([torch.tensor([ratio_width, ratio_height])
144
+ for _ in range(polygons.shape[1] // 2)], dim=0)
145
+ scaled_polygons = polygons * scaled_ratio
146
+ target["polygons"] = scaled_polygons
147
+
148
+ if "area" in target:
149
+ area = target["area"]
150
+ scaled_area = area * (ratio_width * ratio_height)
151
+ target["area"] = scaled_area
152
+
153
+ h, w = size
154
+ target["size"] = torch.tensor([h, w])
155
+
156
+ if "masks" in target:
157
+ assert False
158
+ # target['masks'] = interpolate(
159
+ # target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
160
+
161
+ return rescaled_image, target
162
+
163
+
164
+ class CenterCrop(object):
165
+ def __init__(self, size):
166
+ self.size = size
167
+
168
+ def __call__(self, img, target):
169
+ image_width, image_height = img.size
170
+ crop_height, crop_width = self.size
171
+ crop_top = int(round((image_height - crop_height) / 2.))
172
+ crop_left = int(round((image_width - crop_width) / 2.))
173
+ return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
174
+
175
+
176
+ class ObjectCenterCrop(object):
177
+ def __init__(self, size):
178
+ self.size = size
179
+
180
+ def __call__(self, img, target):
181
+ image_width, image_height = img.size
182
+ crop_height, crop_width = self.size
183
+
184
+ x0 = float(target['boxes'][0][0])
185
+ y0 = float(target['boxes'][0][1])
186
+ x1 = float(target['boxes'][0][2])
187
+ y1 = float(target['boxes'][0][3])
188
+
189
+ center_x = (x0 + x1) / 2
190
+ center_y = (y0 + y1) / 2
191
+ crop_left = max(center_x-crop_width/2 + min(image_width-center_x-crop_width/2, 0), 0)
192
+ crop_top = max(center_y-crop_height/2 + min(image_height-center_y-crop_height/2, 0), 0)
193
+
194
+ return crop(img, target, (crop_top, crop_left, crop_height, crop_width), delete=False)
195
+
196
+
197
+ class RandomHorizontalFlip(object):
198
+ def __init__(self, p=0.5):
199
+ self.p = p
200
+
201
+ def __call__(self, img, target):
202
+ if random.random() < self.p:
203
+ return hflip(img, target)
204
+ return img, target
205
+
206
+
207
+ class RandomResize(object):
208
+ def __init__(self, sizes, max_size=None, equal=False):
209
+ assert isinstance(sizes, (list, tuple))
210
+ self.sizes = sizes
211
+ self.max_size = max_size
212
+ self.equal = equal
213
+
214
+ def __call__(self, img, target=None):
215
+ size = random.choice(self.sizes)
216
+ if self.equal:
217
+ return resize(img, target, size, size)
218
+ else:
219
+ return resize(img, target, size, self.max_size)
220
+
221
+
222
+ class ToTensor(object):
223
+ def __call__(self, img, target):
224
+ return F.to_tensor(img), target
225
+
226
+
227
+ class Normalize(object):
228
+ def __init__(self, mean, std, max_image_size=512):
229
+ self.mean = mean
230
+ self.std = std
231
+ self.max_image_size = max_image_size
232
+
233
+ def __call__(self, image, target=None):
234
+ image = F.normalize(image, mean=self.mean, std=self.std)
235
+ if target is None:
236
+ return image, None
237
+ target = target.copy()
238
+ # h, w = image.shape[-2:]
239
+ h, w = target["size"][0], target["size"][1]
240
+ if "boxes" in target:
241
+ boxes = target["boxes"]
242
+ boxes = boxes / self.max_image_size
243
+ target["boxes"] = boxes
244
+ if "polygons" in target:
245
+ polygons = target["polygons"]
246
+ scale = torch.cat([torch.tensor([w, h], dtype=torch.float32)
247
+ for _ in range(polygons.shape[1] // 2)], dim=0)
248
+ polygons = polygons / scale
249
+ target["polygons"] = polygons
250
+ return image, target
251
+
252
+
253
+ class Compose(object):
254
+ def __init__(self, transforms):
255
+ self.transforms = transforms
256
+
257
+ def __call__(self, image, target):
258
+ for t in self.transforms:
259
+ image, target = t(image, target)
260
+ return image, target
261
+
262
+ def __repr__(self):
263
+ format_string = self.__class__.__name__ + "("
264
+ for t in self.transforms:
265
+ format_string += "\n"
266
+ format_string += " {0}".format(t)
267
+ format_string += "\n)"
268
+ return format_string
269
+
270
+
271
+ class LargeScaleJitter(object):
272
+ """
273
+ implementation of large scale jitter from copy_paste
274
+ """
275
+
276
+ def __init__(self, output_size=512, aug_scale_min=0.3, aug_scale_max=2.0):
277
+ self.desired_size = torch.tensor([output_size])
278
+ self.aug_scale_min = aug_scale_min
279
+ self.aug_scale_max = aug_scale_max
280
+
281
+ def rescale_target(self, scaled_size, image_size, target):
282
+ # compute rescaled targets
283
+ image_scale = scaled_size / image_size
284
+ ratio_height, ratio_width = image_scale
285
+
286
+ target = target.copy()
287
+ target["size"] = scaled_size
288
+
289
+ if "boxes" in target:
290
+ boxes = target["boxes"]
291
+ scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
292
+ target["boxes"] = scaled_boxes
293
+
294
+ if "area" in target:
295
+ area = target["area"]
296
+ scaled_area = area * (ratio_width * ratio_height)
297
+ target["area"] = scaled_area
298
+
299
+ if "masks" in target:
300
+ assert False
301
+ masks = target['masks']
302
+ # masks = interpolate(
303
+ # masks[:, None].float(), scaled_size, mode="nearest")[:, 0] > 0.5
304
+ target['masks'] = masks
305
+ return target
306
+
307
+ def crop_target(self, region, target):
308
+ i, j, h, w = region
309
+ fields = ["labels", "area"]
310
+
311
+ target = target.copy()
312
+ target["size"] = torch.tensor([h, w])
313
+
314
+ if "boxes" in target:
315
+ boxes = target["boxes"]
316
+ max_size = torch.as_tensor([w, h], dtype=torch.float32)
317
+ cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
318
+ cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
319
+ cropped_boxes = cropped_boxes.clamp(min=0)
320
+ area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
321
+ target["boxes"] = cropped_boxes.reshape(-1, 4)
322
+ target["area"] = area
323
+ fields.append("boxes")
324
+
325
+ if "masks" in target:
326
+ # FIXME should we update the area here if there are no boxes?
327
+ target['masks'] = target['masks'][:, i:i + h, j:j + w]
328
+ fields.append("masks")
329
+
330
+ # remove elements whose boxes or masks have zero area
331
+ if "boxes" in target or "masks" in target:
332
+ # favor boxes selection when defining which elements to keep
333
+ # this is compatible with previous implementation
334
+ if "boxes" in target:
335
+ cropped_boxes = target['boxes'].reshape(-1, 2, 2)
336
+ keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
337
+ else:
338
+ keep = target['masks'].flatten(1).any(1)
339
+
340
+ for field in fields:
341
+ target[field] = target[field][keep.tolist()]
342
+ return target
343
+
344
+ def pad_target(self, padding, target):
345
+ target = target.copy()
346
+ if "masks" in target:
347
+ target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[1], 0, padding[0]))
348
+ return target
349
+
350
+ def __call__(self, image, target=None):
351
+ image_size = image.size
352
+ image_size = torch.tensor(image_size[::-1])
353
+
354
+ random_scale = torch.rand(1) * (self.aug_scale_max - self.aug_scale_min) + self.aug_scale_min
355
+ scaled_size = (random_scale * self.desired_size).round()
356
+
357
+ scale = torch.maximum(scaled_size / image_size[0], scaled_size / image_size[1])
358
+ scaled_size = (image_size * scale).round().int()
359
+
360
+ scaled_image = F.resize(image, scaled_size.tolist(), interpolation=Image.BICUBIC)
361
+
362
+ if target is not None:
363
+ target = self.rescale_target(scaled_size, image_size, target)
364
+
365
+ # randomly crop or pad images
366
+ if random_scale >= 1:
367
+ # Selects non-zero random offset (x, y) if scaled image is larger than desired_size.
368
+ max_offset = scaled_size - self.desired_size
369
+ offset = (max_offset * torch.rand(2)).floor().int()
370
+ region = (offset[0].item(), offset[1].item(),
371
+ self.desired_size[0].item(), self.desired_size[0].item())
372
+ output_image = F.crop(scaled_image, *region)
373
+ if target is not None:
374
+ target = self.crop_target(region, target)
375
+ else:
376
+ assert False
377
+ padding = self.desired_size - scaled_size
378
+ output_image = F.pad(scaled_image, [0, 0, padding[1].item(), padding[0].item()])
379
+ if target is not None:
380
+ target = self.pad_target(padding, target)
381
+
382
+ return output_image, target
383
+
384
+
385
+ class OriginLargeScaleJitter(object):
386
+ """
387
+ implementation of large scale jitter from copy_paste
388
+ """
389
+
390
+ def __init__(self, output_size=512, aug_scale_min=0.3, aug_scale_max=2.0):
391
+ self.desired_size = torch.tensor(output_size)
392
+ self.aug_scale_min = aug_scale_min
393
+ self.aug_scale_max = aug_scale_max
394
+
395
+ def rescale_target(self, scaled_size, image_size, target):
396
+ # compute rescaled targets
397
+ image_scale = scaled_size / image_size
398
+ ratio_height, ratio_width = image_scale
399
+
400
+ target = target.copy()
401
+ target["size"] = scaled_size
402
+
403
+ if "boxes" in target:
404
+ boxes = target["boxes"]
405
+ scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
406
+ target["boxes"] = scaled_boxes
407
+
408
+ if "area" in target:
409
+ area = target["area"]
410
+ scaled_area = area * (ratio_width * ratio_height)
411
+ target["area"] = scaled_area
412
+
413
+ if "masks" in target:
414
+ assert False
415
+ masks = target['masks']
416
+ # masks = interpolate(
417
+ # masks[:, None].float(), scaled_size, mode="nearest")[:, 0] > 0.5
418
+ target['masks'] = masks
419
+ return target
420
+
421
+ def crop_target(self, region, target):
422
+ i, j, h, w = region
423
+ fields = ["labels", "area"]
424
+
425
+ target = target.copy()
426
+ target["size"] = torch.tensor([h, w])
427
+
428
+ if "boxes" in target:
429
+ boxes = target["boxes"]
430
+ max_size = torch.as_tensor([w, h], dtype=torch.float32)
431
+ cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
432
+ cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
433
+ cropped_boxes = cropped_boxes.clamp(min=0)
434
+ area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
435
+ target["boxes"] = cropped_boxes.reshape(-1, 4)
436
+ target["area"] = area
437
+ fields.append("boxes")
438
+
439
+ if "masks" in target:
440
+ # FIXME should we update the area here if there are no boxes?
441
+ target['masks'] = target['masks'][:, i:i + h, j:j + w]
442
+ fields.append("masks")
443
+
444
+ # remove elements whose boxes or masks have zero area
445
+ if "boxes" in target or "masks" in target:
446
+ # favor boxes selection when defining which elements to keep
447
+ # this is compatible with previous implementation
448
+ if "boxes" in target:
449
+ cropped_boxes = target['boxes'].reshape(-1, 2, 2)
450
+ keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
451
+ else:
452
+ keep = target['masks'].flatten(1).any(1)
453
+
454
+ for field in fields:
455
+ target[field] = target[field][keep.tolist()]
456
+ return target
457
+
458
+ def pad_target(self, padding, target):
459
+ target = target.copy()
460
+ if "masks" in target:
461
+ target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[1], 0, padding[0]))
462
+ return target
463
+
464
+ def __call__(self, image, target=None):
465
+ image_size = image.size
466
+ image_size = torch.tensor(image_size[::-1])
467
+
468
+ out_desired_size = (self.desired_size * image_size / max(image_size)).round().int()
469
+
470
+ random_scale = torch.rand(1) * (self.aug_scale_max - self.aug_scale_min) + self.aug_scale_min
471
+ scaled_size = (random_scale * self.desired_size).round()
472
+
473
+ scale = torch.minimum(scaled_size / image_size[0], scaled_size / image_size[1])
474
+ scaled_size = (image_size * scale).round().int()
475
+
476
+ scaled_image = F.resize(image, scaled_size.tolist())
477
+
478
+ if target is not None:
479
+ target = self.rescale_target(scaled_size, image_size, target)
480
+
481
+ # randomly crop or pad images
482
+ if random_scale > 1:
483
+ # Selects non-zero random offset (x, y) if scaled image is larger than desired_size.
484
+ max_offset = scaled_size - out_desired_size
485
+ offset = (max_offset * torch.rand(2)).floor().int()
486
+ region = (offset[0].item(), offset[1].item(),
487
+ out_desired_size[0].item(), out_desired_size[1].item())
488
+ output_image = F.crop(scaled_image, *region)
489
+ if target is not None:
490
+ target = self.crop_target(region, target)
491
+ else:
492
+ padding = out_desired_size - scaled_size
493
+ output_image = F.pad(scaled_image, [0, 0, padding[1].item(), padding[0].item()])
494
+ if target is not None:
495
+ target = self.pad_target(padding, target)
496
+
497
+ return output_image, target
498
+
499
+
500
+ class RandomDistortion(object):
501
+ """
502
+ Randomly distort the image w.r.t. brightness, contrast, saturation and hue.
503
+ """
504
+
505
+ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, prob=0.5):
506
+ self.prob = prob
507
+ self.tfm = T.ColorJitter(brightness, contrast, saturation, hue)
508
+
509
+ def __call__(self, img, target=None):
510
+ if np.random.random() < self.prob:
511
+ return self.tfm(img), target
512
+ else:
513
+ return img, target
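A minimal sketch of how these paired (image, target) transforms compose (assumes the repository root is on PYTHONPATH; the box, label and size values are made up):

```python
import torch
from PIL import Image

import utils.transforms as T

transform = T.Compose([
    T.RandomResize([480, 512], max_size=512),
    T.ToTensor(),
    T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], max_image_size=512),
])

img = Image.new("RGB", (640, 426))
target = {
    "boxes": torch.tensor([[100.0, 50.0, 300.0, 200.0]]),
    "labels": torch.tensor([1]),
    "area": torch.tensor([200.0 * 150.0]),
    "size": torch.tensor([426, 640]),
}
img_t, target_t = transform(img, target)
print(img_t.shape, target_t["boxes"])  # boxes rescaled, then divided by max_image_size
```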
utils/trie.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright 2022 The OFA-Sys Team.
2
+ # All rights reserved.
3
+ # This source code is licensed under the Apache 2.0 license
4
+ # found in the LICENSE file in the root directory.
5
+
6
+ from collections import defaultdict
7
+
8
+
9
+ class TreeNode():
10
+ def __init__(self):
11
+ self.child = defaultdict(TreeNode)
12
+
13
+ class Trie:
14
+
15
+ def __init__(self, eos):
16
+ self.root = TreeNode()
17
+ self.eos = eos
18
+
19
+ def insert(self, word):
20
+ cur = self.root
21
+ for c in word:
22
+ cur = cur.child[c]
23
+
24
+ def get_next_layer(self, word):
25
+ cur = self.root
26
+ for c in word:
27
+ cur = cur.child.get(c)
28
+ if cur is None:
29
+ return [self.eos]
30
+ return list(cur.child.keys())
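A short sketch of how this trie behaves (token ids are made up; in the tasks above it is presumably used to constrain decoding to a closed set of answer sequences):

```python
from utils.trie import Trie  # assumes the repository root is on PYTHONPATH

trie = Trie(eos=2)
trie.insert([5, 7, 9])   # e.g. token ids of one allowed answer
trie.insert([5, 8])

print(trie.get_next_layer([5]))      # tokens that may follow prefix [5] -> [7, 8]
print(trie.get_next_layer([5, 7]))   # -> [9]
print(trie.get_next_layer([6]))      # unknown prefix falls back to [eos] -> [2]
```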