remove useless file
- tasks/mm_tasks/image_gen.py +0 -329
- tasks/mm_tasks/refcoco.py +0 -160
- tasks/mm_tasks/snli_ve.py +0 -197
- tasks/mm_tasks/vqa_gen.py +0 -278
tasks/mm_tasks/image_gen.py
DELETED
@@ -1,329 +0,0 @@
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.

from dataclasses import dataclass, field
import json
import logging
import os
import math
import base64
from typing import Optional
from argparse import Namespace
from omegaconf import DictConfig, OmegaConf
from torchvision import transforms
from PIL import Image
from io import BytesIO

import torch
import numpy as np
from fairseq import metrics
from fairseq.tasks import register_task
from fairseq.dataclass import ChoiceEnum

from models import search, clip
from models.taming.models.vqgan import GumbelVQ
from data.mm_data.image_gen_dataset import ImageGenDataset
from data.file_dataset import FileDataset

from tasks.ofa_task import OFATask, OFAConfig

logger = logging.getLogger(__name__)


def custom_to_pil(x):
    x = x.detach().cpu()
    x = torch.clamp(x, -1., 1.)
    x = (x + 1.) / 2.
    x = x.permute(1, 2, 0).numpy()
    x = (255 * x).astype(np.uint8)
    x = Image.fromarray(x)
    if not x.mode == "RGB":
        x = x.convert("RGB")
    return x


EVAL_CLIP_METHOD = ChoiceEnum(["ii_sim", "ti_sim"])


@dataclass
class ImageGenConfig(OFAConfig):
    sampling_times: int = field(
        default=1, metadata={"help": "sample times"}
    )

    code_image_size: int = field(
        default=256, metadata={"help": "code image size"}
    )

    # options for reporting CLIP score during validation
    eval_clip_method: EVAL_CLIP_METHOD = field(
        default='ti_sim',
        metadata={
            "help": "evaluation with CLIP scores. ii_sim means Similarity between generated Images and ref Images, ti_sim means Similarity between generated Images and input Text"}
    )

    eval_args: Optional[str] = field(
        default='{}',
        metadata={
            "help": 'generation args for clip scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )

    scst: bool = field(
        default=False, metadata={"help": "Self-critical sequence training"}
    )
    scst_args: str = field(
        default='{}',
        metadata={
            "help": 'generation args for Self-critical sequence training, as JSON string'
        },
    )

    vqgan_model_path: Optional[str] = field(
        default=None,
        metadata={"help": "path of vqgan model"}
    )
    vqgan_config_path: Optional[str] = field(
        default=None,
        metadata={"help": "path of vqgan config"}
    )
    clip_model_path: Optional[str] = field(
        default=None,
        metadata={"help": "clip model path"}
    )
    gen_images_path: str = field(
        default='', metadata={"help": "where to store generated images during evalution. Don't dump images if None. "}
    )


@register_task("image_gen", dataclass=ImageGenConfig)
class ImageGenTask(OFATask):
    def __init__(self, cfg: ImageGenConfig, src_dict, tgt_dict):
        super().__init__(cfg, src_dict, tgt_dict)

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        paths = self.cfg.data.split(',')
        assert len(paths) > 0

        if split == 'train':
            file_path = paths[(epoch - 1) % (len(paths) - 1)]
        else:
            file_path = paths[-1]
        dataset = FileDataset(file_path, self.cfg.selected_cols)

        self.datasets[split] = ImageGenDataset(
            split,
            dataset,
            self.bpe,
            self.src_dict,
            self.tgt_dict,
            max_src_length=self.cfg.max_src_length,
            code_dict_size=self.cfg.code_dict_size,
            code_image_size=self.cfg.code_image_size
        )

    def build_model(self, cfg):
        model = super().build_model(cfg)

        device = torch.cuda.current_device()
        clip_model, clip_preprocess = clip.load(self.cfg.clip_model_path, device=device)
        self.clip_model = clip_model
        self.clip_preprocess = clip_preprocess
        self.clip_model.to(device)
        self.clip_model.eval()

        vqgan_config = OmegaConf.load(self.cfg.vqgan_config_path)
        vqgan = GumbelVQ(**vqgan_config.model.params)
        sd = torch.load(self.cfg.vqgan_model_path, map_location="cpu")["state_dict"]
        missing, unexpected = vqgan.load_state_dict(sd, strict=False)
        for k, v in vqgan.named_parameters():
            v.requires_grad = False
        self.image_tokenizer = vqgan
        self.image_tokenizer.to(device)
        self.image_tokenizer.eval()

        gen_args = json.loads(self.cfg.eval_args)
        self.sequence_generator = self.build_generator(
            [model], Namespace(**gen_args)
        )
        if self.cfg.scst:
            scst_args = json.loads(self.cfg.scst_args)
            self.scst_generator = self.build_generator(
                [model], Namespace(**scst_args)
            )

        return model

    def build_generator(
        self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
    ):
        """
        Build a :class:`~fairseq.SequenceGenerator` instance for this
        task.

        Args:
            models (List[~fairseq.models.FairseqModel]): ensemble of models
            args (fairseq.dataclass.configs.GenerationConfig):
                configuration object (dataclass) for generation
            extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass
                through to SequenceGenerator
            prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]):
                If provided, this function constrains the beam search to
                allowed tokens only at each step. The provided function
                should take 2 arguments: the batch ID (`batch_id: int`)
                and a unidimensional tensor of token ids (`inputs_ids:
                torch.Tensor`). It has to return a `List[int]` with the
                allowed tokens for the next generation step conditioned
                on the previously generated tokens (`inputs_ids`) and
                the batch ID (`batch_id`). This argument is useful for
                constrained generation conditioned on the prefix, as
                described in "Autoregressive Entity Retrieval"
                (https://arxiv.org/abs/2010.00904) and
                https://github.com/facebookresearch/GENRE.
        """
        from models.sequence_generator import SequenceGenerator

        # Choose search strategy. Defaults to Sampling.
        self.sampling_times = self.cfg.sampling_times
        sampling = True  # we have to use sampling instead of beam search in image generation task
        sampling_topk = getattr(args, "sampling_topk", -1)
        sampling_topp = getattr(args, "sampling_topp", -1.0)

        assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
        assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"

        search_strategy = search.Sampling(
            self.target_dictionary, sampling_topk, sampling_topp
        )
        extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}

        return SequenceGenerator(
            models,
            self.target_dictionary,
            beam_size=getattr(args, "beam", 5),
            max_len_a=getattr(args, "max_len_a", 0),
            max_len_b=getattr(args, "max_len_b", 200),
            min_len=getattr(args, "min_len", 1),
            normalize_scores=(not getattr(args, "unnormalized", False)),
            len_penalty=getattr(args, "lenpen", 1),
            unk_penalty=getattr(args, "unkpen", 0),
            temperature=getattr(args, "temperature", 1.0),
            match_source_len=getattr(args, "match_source_len", False),
            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
            search_strategy=search_strategy,
            constraint_range=self.cfg.constraint_range,
            gen_code=True,
            **extra_gen_cls_kwargs,
        )

    def compute_ref_image_similarity(self, hyps, ref, device):
        hyp_images = torch.stack(
            [self.clip_preprocess(hyp_image) for hyp_image in hyps], dim=0
        ).to(device)

        ref_images = self.clip_preprocess(ref).unsqueeze(0).to(device)
        with torch.no_grad():
            hyp_image_features = self.clip_model.encode_image(hyp_images)
            ref_image_features = self.clip_model.encode_image(ref_images)
            hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True)
            ref_image_features /= ref_image_features.norm(dim=-1, keepdim=True)
            similarity = hyp_image_features @ ref_image_features.T
            # scores.append(similarity.max().item())
            sorted_score, indices = torch.sort(similarity.view(-1), descending=True)
        return sorted_score, indices

    def compute_text_similarity(self, hyps, text, device):
        hyp_images = torch.stack(
            [self.clip_preprocess(hyp_image) for hyp_image in hyps], dim=0
        ).to(device)

        clip_input = clip.tokenize([text]).to(device)
        with torch.no_grad():
            hyp_image_features = self.clip_model.encode_image(hyp_images)
            hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True)
            text_features = self.clip_model.encode_text(clip_input)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            ti_similarity = hyp_image_features @ text_features.T
            sorted_score, indices = torch.sort(ti_similarity.view(-1), descending=True)
        return sorted_score, indices

    def valid_step(self, sample, model, criterion):
        loss, sample_size, logging_output = criterion(model, sample)

        model.eval()
        device = sample['target'].device

        hyps, ref = self.inference_image(self.sequence_generator, sample, [model])
        scores = []

        tokens = sample['net_input']['src_tokens'][0].view(-1).tolist()
        caption = self.bpe.decode(self.tgt_dict.string([token for token in tokens if token >= 4]))[
            38:].replace('/', '')
        if self.cfg.eval_clip_method == 'ii_sim':
            similarity_score, indices = self.compute_ref_image_similarity(hyps, ref, device)
        elif self.cfg.eval_clip_method == 'ti_sim':
            similarity_score, indices = self.compute_text_similarity(hyps, caption, device)
        else:
            raise ValueError("unsupported eval method.")

        scores.append(similarity_score.max().item())
        sorted_hyps = [hyps[indice] for indice in indices]

        if self.cfg.gen_images_path:
            caption_tokens = sample['net_input']['src_tokens'][0].view(-1).tolist()
            caption = self.bpe.decode(self.tgt_dict.string([token for token in caption_tokens if token >= 4]))[
                38:].replace('/', '')
            self.dump_images(sorted_hyps, text=caption, path=os.path.join(self.cfg.gen_images_path, 'all_results'))
            self.dump_images(sorted_hyps, text=caption, path=os.path.join(self.cfg.gen_images_path, 'top1'), topk=1)

        logging_output["_score_sum"] = sum(scores)
        logging_output["_score_cnt"] = len(scores)

        return loss, sample_size, logging_output

    def reduce_metrics(self, logging_outputs, criterion):
        super().reduce_metrics(logging_outputs, criterion)

        def sum_logs(key):
            import torch
            result = sum(log.get(key, 0) for log in logging_outputs)
            if torch.is_tensor(result):
                result = result.cpu()
            return result

        def compute_score(meters):
            score = meters["_score_sum"].sum / meters["_score_cnt"].sum
            score = score if isinstance(score, float) else score.item()
            return round(score, 3)

        if sum_logs("_score_cnt") > 0:
            metrics.log_scalar("_score_sum", sum_logs("_score_sum"))
            metrics.log_scalar("_score_cnt", sum_logs("_score_cnt"))
            metrics.log_derived("score", compute_score)

    def inference_image(self, generator, sample, models):
        hyps, ref = [], None
        for j in range(self.sampling_times):
            gen_out = self.inference_step(generator, models, sample)
            for i in range(len(gen_out)):
                with torch.no_grad():
                    tokens = torch.stack([item['tokens'][:-1] for item in gen_out[i]], dim=0)
                    tokens += -len(self.src_dict) + self.cfg.code_dict_size + self.cfg.num_bins
                    images = self.image_tokenizer.decode_code(
                        tokens.view(-1, self.cfg.code_image_size // 8, self.cfg.code_image_size // 8)
                    )
                    images = [custom_to_pil(image) for image in images]
                hyps += images
        if 'code_images' in sample:
            ref = Image.open(BytesIO(base64.urlsafe_b64decode(sample['code_images'][0]))).convert('RGB')

        return hyps, ref

    def dump_images(self, images, text, path, topk=None):
        os.makedirs(path, exist_ok=True)
        if topk:
            images = images[:topk]
        for j, image in enumerate(images):
            save_path = os.path.join(path, f'{text}_{j}.png')
            image.save(save_path)
tasks/mm_tasks/refcoco.py
DELETED
@@ -1,160 +0,0 @@
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.

from dataclasses import dataclass, field
import json
import logging
from typing import Optional
from argparse import Namespace

import torch
from fairseq import metrics
from fairseq.tasks import register_task

from tasks.ofa_task import OFATask, OFAConfig
from data.mm_data.refcoco_dataset import RefcocoDataset
from data.file_dataset import FileDataset

logger = logging.getLogger(__name__)


@dataclass
class RefcocoConfig(OFAConfig):
    eval_acc: bool = field(
        default=False, metadata={"help": "evaluation with accuracy"}
    )
    eval_args: Optional[str] = field(
        default='{}',
        metadata={
            "help": 'generation args, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )

    max_image_size: int = field(
        default=512, metadata={"help": "max image size for normalization"}
    )
    scst: bool = field(
        default=False, metadata={"help": "Self-critical sequence training"}
    )
    scst_args: str = field(
        default='{}',
        metadata={
            "help": 'generation args for Self-critical sequence training, as JSON string'
        },
    )


@register_task("refcoco", dataclass=RefcocoConfig)
class RefcocoTask(OFATask):
    def __init__(self, cfg: RefcocoConfig, src_dict, tgt_dict):
        super().__init__(cfg, src_dict, tgt_dict)

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        paths = self.cfg.data.split(',')
        assert len(paths) > 0

        if split == 'train':
            file_path = paths[(epoch - 1) % (len(paths) - 1)]
        else:
            file_path = paths[-1]
        dataset = FileDataset(file_path, self.cfg.selected_cols)

        self.datasets[split] = RefcocoDataset(
            split,
            dataset,
            self.bpe,
            self.src_dict,
            self.tgt_dict,
            max_src_length=self.cfg.max_src_length,
            max_tgt_length=self.cfg.max_tgt_length,
            patch_image_size=self.cfg.patch_image_size,
            imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
            num_bins=self.cfg.num_bins,
            max_image_size=self.cfg.max_image_size
        )

    def build_model(self, cfg):
        model = super().build_model(cfg)
        if self.cfg.eval_acc:
            gen_args = json.loads(self.cfg.eval_args)
            self.sequence_generator = self.build_generator(
                [model], Namespace(**gen_args)
            )
        if self.cfg.scst:
            scst_args = json.loads(self.cfg.scst_args)
            self.scst_generator = self.build_generator(
                [model], Namespace(**scst_args)
            )

        return model

    def _calculate_ap_score(self, hyps, refs, thresh=0.5):
        interacts = torch.cat(
            [torch.where(hyps[:, :2] < refs[:, :2], refs[:, :2], hyps[:, :2]),
             torch.where(hyps[:, 2:] < refs[:, 2:], hyps[:, 2:], refs[:, 2:])],
            dim=1
        )
        area_predictions = (hyps[:, 2] - hyps[:, 0]) * (hyps[:, 3] - hyps[:, 1])
        area_targets = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1])
        interacts_w = interacts[:, 2] - interacts[:, 0]
        interacts_h = interacts[:, 3] - interacts[:, 1]
        area_interacts = interacts_w * interacts_h
        ious = area_interacts / (area_predictions + area_targets - area_interacts + 1e-6)
        return ((ious >= thresh) & (interacts_w > 0) & (interacts_h > 0)).float()

    def valid_step(self, sample, model, criterion):
        loss, sample_size, logging_output = criterion(model, sample)

        model.eval()
        if self.cfg.eval_acc:
            hyps, refs = self._inference(self.sequence_generator, sample, model)
            hyps = hyps / (self.cfg.num_bins - 1) * self.cfg.max_image_size
            refs = refs / (self.cfg.num_bins - 1) * self.cfg.max_image_size
            hyps[:, ::2] /= sample['w_resize_ratios'].unsqueeze(1)
            hyps[:, 1::2] /= sample['h_resize_ratios'].unsqueeze(1)
            refs[:, ::2] /= sample['w_resize_ratios'].unsqueeze(1)
            refs[:, 1::2] /= sample['h_resize_ratios'].unsqueeze(1)

            # scores = self._calculate_ap_score(hyps, refs)
            scores = self._calculate_ap_score(hyps, sample['region_coords'].float())
            logging_output["_score_sum"] = scores.sum().item()
            logging_output["_score_cnt"] = scores.size(0)

        return loss, sample_size, logging_output

    def reduce_metrics(self, logging_outputs, criterion):
        super().reduce_metrics(logging_outputs, criterion)

        def sum_logs(key):
            import torch
            result = sum(log.get(key, 0) for log in logging_outputs)
            if torch.is_tensor(result):
                result = result.cpu()
            return result

        def compute_score(meters):
            score = meters["_score_sum"].sum / meters["_score_cnt"].sum
            score = score if isinstance(score, float) else score.item()
            return round(score, 4)

        if sum_logs("_score_cnt") > 0:
            metrics.log_scalar("_score_sum", sum_logs("_score_sum"))
            metrics.log_scalar("_score_cnt", sum_logs("_score_cnt"))
            metrics.log_derived("score", compute_score)

    def _inference(self, generator, sample, model):
        gen_out = self.inference_step(generator, [model], sample)
        hyps, refs = [], []
        for i in range(len(gen_out)):
            hyps.append(gen_out[i][0]["tokens"][:-1] - len(self.src_dict) + self.cfg.num_bins)
            refs.append(sample["target"][i][:-1] - len(self.src_dict) + self.cfg.num_bins)
        if self.cfg.eval_print_samples:
            logger.info("example hypothesis: ", hyps[0])
            logger.info("example reference: ", refs[0])

        return torch.stack(hyps, dim=0), torch.stack(refs, dim=0)
tasks/mm_tasks/snli_ve.py
DELETED
@@ -1,197 +0,0 @@
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.

import json
import logging
import math
from dataclasses import dataclass, field
from typing import Optional

import torch
from fairseq import metrics
from fairseq.tasks import register_task

from tasks.ofa_task import OFAConfig, OFATask
from data.mm_data.snli_ve_dataset import SnliVeDataset
from data.file_dataset import FileDataset
from data import data_utils
from utils.trie import Trie

logger = logging.getLogger(__name__)


@dataclass
class SnliVeConfig(OFAConfig):
    ans2label_dict: Optional[str] = field(
        default='{"no": 0, "yes":1, "maybe": 2}',
        metadata={"help": 'answer to label dict'},
    )
    add_caption: bool = field(
        default=False,
        metadata={"help": "add caption to encoder"},
    )
    valid_batch_size: int = field(
        default=20,
        metadata={"help": "valid batch size per step"},
    )
    prompt_type: Optional[str] = field(
        default=None,
        metadata={"help": "prompt_type"},
    )


@register_task("snli_ve", dataclass=SnliVeConfig)
class SnliVeTask(OFATask):
    def __init__(self, cfg: SnliVeConfig, src_dict, tgt_dict):
        super().__init__(cfg, src_dict, tgt_dict)
        self.ans2label_dict = json.loads(self.cfg.ans2label_dict)

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        paths = self.cfg.data.split(',')
        assert len(paths) > 0

        if split == 'train':
            file_path = paths[(epoch - 1) % (len(paths) - 1)]
        else:
            file_path = paths[-1]
        dataset = FileDataset(file_path, self.cfg.selected_cols)

        self.datasets[split] = SnliVeDataset(
            split,
            dataset,
            self.bpe,
            self.src_dict,
            self.tgt_dict,
            max_src_length=self.cfg.max_src_length,
            max_tgt_length=self.cfg.max_tgt_length,
            patch_image_size=self.cfg.patch_image_size,
            add_caption=self.cfg.add_caption,
            constraint_trie=self.constraint_trie,
            imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
            prompt_type=self.cfg.prompt_type
        )

    def build_model(self, cfg):
        model = super().build_model(cfg)
        answer_item_list = []
        self.index2ans = {}
        self.constraint_trie = Trie(self.tgt_dict.eos())
        for i, answer in enumerate(self.ans2label_dict.keys()):
            answer_item = self.tgt_dict.encode_line(
                line=self.bpe.encode(' ' + answer),
                add_if_not_exist=False,
                append_eos=False
            ).long()
            answer_item_list.append(answer_item)
            self.index2ans[i] = answer
            self.constraint_trie.insert([self.tgt_dict.bos()] + answer_item.tolist() + [self.tgt_dict.eos()])

        constraint_mask_list = []
        for answer_item in answer_item_list:
            constraint_mask = torch.zeros((len(answer_item)+1, len(self.tgt_dict))).bool()
            for i in range(len(answer_item)+1):
                constraint_prefix_token = [self.src_dict.bos()] + answer_item[:i].tolist()
                constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
                constraint_mask[i][constraint_nodes] = True
            constraint_mask_list.append(constraint_mask)

        self.valid_answers_list = []
        self.valid_constraint_masks_list = []
        for i in range(0, len(answer_item_list), self.cfg.valid_batch_size):
            self.valid_answers_list += [answer_item_list[i:i+self.cfg.valid_batch_size]]
            self.valid_constraint_masks_list += [constraint_mask_list[i:i+self.cfg.valid_batch_size]]

        return model

    def build_generator(
        self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
    ):
        seq_generator = super().build_generator(models, args, seq_gen_cls, extra_gen_cls_kwargs, prefix_allowed_tokens_fn)
        seq_generator.constraint_trie = self.constraint_trie

        return seq_generator

    def valid_step(self, sample, model, criterion, **extra_kwargs):
        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)

        model.eval()
        with torch.no_grad():
            encoder_out = model.encoder(
                sample["net_input"]["src_tokens"],
                src_lengths=sample["net_input"]["src_lengths"],
                patch_images=sample["net_input"]["patch_images"],
                patch_masks=sample["net_input"]["patch_masks"]
            )
            device = sample["net_input"]["src_tokens"].device
            eos_item = torch.tensor([self.src_dict.eos()])
            pad = self.src_dict.pad()
            valid_result = []
            for valid_answers, valid_constraint_masks in zip(self.valid_answers_list, self.valid_constraint_masks_list):
                valid_size = len(valid_answers)
                valid_tgt_items = [
                    torch.cat([torch.tensor(decoder_prompt[1:]), valid_answer, eos_item])
                    for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
                ]
                valid_prev_items = [
                    torch.cat([torch.tensor(decoder_prompt), valid_answer])
                    for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
                ]
                valid_constraint_mask_items = [
                    torch.cat([torch.zeros(len(decoder_prompt)-1, valid_constraint_mask.size(1)).bool(), valid_constraint_mask], dim=0)
                    for decoder_prompt in sample["decoder_prompts"] for valid_constraint_mask in valid_constraint_masks
                ]
                valid_tgt = data_utils.collate_tokens(valid_tgt_items, pad_idx=pad, left_pad=False).to(device)
                valid_prev_output = data_utils.collate_tokens(valid_prev_items, pad_idx=pad, left_pad=False).to(device)
                valid_constraint_masks = data_utils.collate_tokens(valid_constraint_mask_items, pad_idx=pad, left_pad=False).to(device)

                new_encoder_out = {}
                new_encoder_out["encoder_out"] = [
                    encoder_out["encoder_out"][0].repeat_interleave(valid_size, dim=1)
                ]
                new_encoder_out["encoder_padding_mask"] = [
                    encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_size, dim=0)
                ]
                new_encoder_out["position_embeddings"] = [
                    encoder_out["position_embeddings"][0].repeat_interleave(valid_size, dim=0)
                ]

                decoder_out = model.decoder(valid_prev_output, encoder_out=new_encoder_out)
                decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
                lprobs = model.get_normalized_probs(decoder_out, log_probs=True)
                scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
                scores = scores.masked_fill(valid_tgt.eq(self.tgt_dict.pad()), 0)
                scores = scores.masked_fill((~valid_constraint_masks).all(2), 0)
                scores = scores.sum(1)
                scores = scores.view(-1, valid_size)
                valid_result.append(scores)

        valid_result = torch.cat(valid_result, dim=-1)
        predicts = valid_result.argmax(1).tolist()
        hyps = [self.index2ans[predict_index] for predict_index in predicts]
        scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
        logging_output["_snli_score_sum"] = sum(scores)
        logging_output["_snli_cnt"] = len(scores)

        return loss, sample_size, logging_output

    def reduce_metrics(self, logging_outputs, criterion):
        super().reduce_metrics(logging_outputs, criterion)

        def sum_logs(key):
            import torch
            result = sum(log.get(key, 0) for log in logging_outputs)
            if torch.is_tensor(result):
                result = result.cpu()
            return result

        def compute_score(meters):
            score = meters["_snli_score_sum"].sum / meters["_snli_cnt"].sum
            score = score if isinstance(score, float) else score.item()
            return round(score, 4)

        if sum_logs("_snli_cnt") > 0:
            metrics.log_scalar("_snli_score_sum", sum_logs("_snli_score_sum"))
            metrics.log_scalar("_snli_cnt", sum_logs("_snli_cnt"))
            metrics.log_derived("snli_score", compute_score)
tasks/mm_tasks/vqa_gen.py
DELETED
@@ -1,278 +0,0 @@
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.

from dataclasses import dataclass, field
import json
import logging
import os
import math
import pickle
from typing import Optional
from argparse import Namespace
from data.file_dataset import FileDataset

import torch
from fairseq import metrics
from fairseq.tasks import register_task

from models import search
from data.mm_data.vqa_gen_dataset import VqaGenDataset
from data import data_utils
from tasks.ofa_task import OFAConfig, OFATask
from utils.trie import Trie

logger = logging.getLogger(__name__)


def get_symbols_to_strip_from_output(generator):
    if hasattr(generator, "symbols_to_strip_from_output"):
        return generator.symbols_to_strip_from_output
    else:
        return {generator.bos, generator.eos}


def decode_fn(x, tgt_dict, bpe, generator, tokenizer=None):
    x = tgt_dict.string(x.int().cpu(), extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator))
    if bpe is not None:
        x = bpe.decode(x)
    if tokenizer is not None:
        x = tokenizer.decode(x)
    return x


@dataclass
class VqaGenConfig(OFAConfig):
    max_object_length: int = field(
        default=30, metadata={"help": "the maximum object sequence length"}
    )
    ans2label_dict: Optional[str] = field(
        default='{"no": 0, "yes":1}',
        metadata={"help": 'answer to label dict'},
    )
    ans2label_file: Optional[str] = field(
        default=None,
        metadata={"help": "path to load ans2label file"},
    )

    add_object: bool = field(
        default=False,
        metadata={"help": "add object to encoder"},
    )
    valid_batch_size: int = field(
        default=20,
        metadata={"help": "valid batch size per step"},
    )
    prompt_type: Optional[str] = field(
        default=None,
        metadata={"help": "prompt_type"},
    )
    uses_ema: Optional[bool] = field(
        default=False,
        metadata={"help": "whether to use ema"},
    )
    val_inference_type: Optional[str] = field(
        default='allcand',
        metadata={"help": "inference type in validation (allcand or beamsearch), default to allcand"},
    )
    eval_args: Optional[str] = field(
        default='{"beam":5,"unnormalized":true,"temperature":1.0}',
        metadata={
            "help": 'generation args as JSON string for inference, only activated when --val-inference-type=beamsearch'
        },
    )


@register_task("vqa_gen", dataclass=VqaGenConfig)
class VqaGenTask(OFATask):
    def __init__(self, cfg: VqaGenConfig, src_dict, tgt_dict):
        super().__init__(cfg, src_dict, tgt_dict)

        self.ans2label_dict = None
        if self.cfg.ans2label_file is not None:
            self.ans2label_dict = pickle.load(open(self.cfg.ans2label_file, "rb"))
        else:
            self.ans2label_dict = json.loads(self.cfg.ans2label_dict)

        self.uses_ema = self.cfg.uses_ema

        assert self.cfg.val_inference_type in ["allcand", "beamsearch"], \
            "Unknown inference type encountered: {}, should be allcand or beamsearch.".format(self.cfg.val_inference_type)

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        paths = self.cfg.data.split(',')
        assert len(paths) > 0

        if split == 'train':
            table_path = paths[(epoch - 1) % (len(paths) - 1)]
        else:
            table_path = paths[-1]
        dataset = FileDataset(table_path, self.cfg.selected_cols)

        self.datasets[split] = VqaGenDataset(
            split,
            dataset,
            self.bpe,
            self.src_dict,
            self.tgt_dict,
            max_src_length=self.cfg.max_src_length,
            max_object_length=self.cfg.max_object_length,
            max_tgt_length=self.cfg.max_tgt_length,
            patch_image_size=self.cfg.patch_image_size,
            add_object=self.cfg.add_object,
            constraint_trie=self.constraint_trie,
            imagenet_default_mean_and_std=self.cfg.imagenet_default_mean_and_std,
            prompt_type=self.cfg.prompt_type
        )

    def build_model(self, cfg):
        model = super().build_model(cfg)
        answer_item_list = []
        self.index2ans = {}
        self.constraint_trie = Trie(self.tgt_dict.eos())
        for i, answer in enumerate(self.ans2label_dict.keys()):
            answer_item = self.tgt_dict.encode_line(
                line=self.bpe.encode(' ' + answer),
                add_if_not_exist=False,
                append_eos=False
            ).long()
            answer_item_list.append(answer_item)
            self.index2ans[i] = answer
            self.constraint_trie.insert([self.tgt_dict.bos()] + answer_item.tolist() + [self.tgt_dict.eos()])

        constraint_mask_list = []
        for answer_item in answer_item_list:
            constraint_mask = torch.zeros((len(answer_item)+1, len(self.tgt_dict))).bool()
            for i in range(len(answer_item)+1):
                constraint_prefix_token = [self.src_dict.bos()] + answer_item[:i].tolist()
                constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
                constraint_mask[i][constraint_nodes] = True
            constraint_mask_list.append(constraint_mask)

        if self.cfg.val_inference_type == "allcand":
            self.valid_answers_list = []
            self.valid_constraint_masks_list = []
            for i in range(0, len(answer_item_list), self.cfg.valid_batch_size):
                self.valid_answers_list += [answer_item_list[i:i+self.cfg.valid_batch_size]]
                self.valid_constraint_masks_list += [constraint_mask_list[i:i+self.cfg.valid_batch_size]]
        elif self.cfg.val_inference_type == "beamsearch":
            gen_args = json.loads(self.cfg.eval_args)
            self.generator = self.build_generator(
                [model], Namespace(**gen_args)
            )
        else:
            raise NotImplementedError("Error: Unknown inference type encountered.")

        return model

    def build_generator(
        self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
    ):
        seq_generator = super().build_generator(models, args, seq_gen_cls, extra_gen_cls_kwargs, prefix_allowed_tokens_fn)
        seq_generator.constraint_trie = self.constraint_trie

        return seq_generator

    def valid_step(self, sample, model, criterion, **extra_kwargs):
        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)

        if self.uses_ema:
            assert 'ema_model' in extra_kwargs and extra_kwargs['ema_model'] is not None
        if self.uses_ema:
            eval_model = extra_kwargs['ema_model']
        else:
            eval_model = model

        eval_model.eval()
        with torch.no_grad():
            if self.cfg.val_inference_type == "allcand":
                encoder_out = eval_model.encoder(
                    sample["net_input"]["src_tokens"],
                    src_lengths=sample["net_input"]["src_lengths"],
                    patch_images=sample["net_input"]["patch_images"],
                    patch_masks=sample["net_input"]["patch_masks"]
                )
                device = sample["net_input"]["src_tokens"].device
                eos_item = torch.tensor([self.src_dict.eos()])
                pad = self.src_dict.pad()
                valid_result = []
                for valid_answers, valid_constraint_masks in zip(self.valid_answers_list, self.valid_constraint_masks_list):
                    valid_size = len(valid_answers)
                    valid_tgt_items = [
                        torch.cat([torch.tensor(decoder_prompt[1:]), valid_answer, eos_item])
                        for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
                    ]
                    valid_prev_items = [
                        torch.cat([torch.tensor(decoder_prompt), valid_answer])
                        for decoder_prompt in sample["decoder_prompts"] for valid_answer in valid_answers
                    ]
                    valid_constraint_mask_items = [
                        torch.cat([torch.zeros(len(decoder_prompt)-1, valid_constraint_mask.size(1)).bool(), valid_constraint_mask], dim=0)
                        for decoder_prompt in sample["decoder_prompts"] for valid_constraint_mask in valid_constraint_masks
                    ]
                    valid_tgt = data_utils.collate_tokens(valid_tgt_items, pad_idx=pad, left_pad=False).to(device)
                    valid_prev_output = data_utils.collate_tokens(valid_prev_items, pad_idx=pad, left_pad=False).to(device)
                    valid_constraint_masks = data_utils.collate_tokens(valid_constraint_mask_items, pad_idx=pad, left_pad=False).to(device)

                    new_encoder_out = {}
                    new_encoder_out["encoder_out"] = [
                        encoder_out["encoder_out"][0].repeat_interleave(valid_size, dim=1)
                    ]
                    new_encoder_out["encoder_padding_mask"] = [
                        encoder_out["encoder_padding_mask"][0].repeat_interleave(valid_size, dim=0)
                    ]
                    new_encoder_out["position_embeddings"] = [
                        encoder_out["position_embeddings"][0].repeat_interleave(valid_size, dim=0)
                    ]

                    decoder_out = eval_model.decoder(valid_prev_output, encoder_out=new_encoder_out)
                    decoder_out[0].masked_fill_(~valid_constraint_masks, -math.inf)
                    lprobs = eval_model.get_normalized_probs(decoder_out, log_probs=True)
                    scores = lprobs.gather(dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
                    scores = scores.masked_fill(valid_tgt.eq(self.tgt_dict.pad()), 0)
                    scores = scores.masked_fill((~valid_constraint_masks).all(2), 0)
                    scores = scores.sum(1)
                    scores = scores.view(-1, valid_size)
                    valid_result.append(scores)

                valid_result = torch.cat(valid_result, dim=-1)
                predicts = valid_result.argmax(1).tolist()
                hyps = [self.index2ans[predict_index] for predict_index in predicts]

            elif self.cfg.val_inference_type == "beamsearch":
                raw_hyps = self.inference_step(self.generator, [eval_model], sample, prefix_tokens=sample['prefix_tokens'])
                hyps = []
                for i, sample_id in enumerate(sample["id"].tolist()):
                    prefix_len = sample['prefix_tokens'][i].ne(1).sum().item()
                    detok_hypo_str = decode_fn(raw_hyps[i][0]["tokens"][prefix_len:], self.tgt_dict, self.bpe, self.generator)
                    hyps.append(detok_hypo_str.strip())

            else:
                raise NotImplementedError("Error: Unknown inference type encountered.")

        scores = [ref_dict.get(hyp, 0) for ref_dict, hyp in zip(sample['ref_dict'], hyps)]
        logging_output["_vqa_score_sum"] = sum(scores)
        logging_output["_vqa_cnt"] = len(scores)

        return loss, sample_size, logging_output

    def reduce_metrics(self, logging_outputs, criterion):
        super().reduce_metrics(logging_outputs, criterion)

        def sum_logs(key):
            import torch
            result = sum(log.get(key, 0) for log in logging_outputs)
            if torch.is_tensor(result):
                result = result.cpu()
            return result

        def compute_score(meters):
            score = meters["_vqa_score_sum"].sum / meters["_vqa_cnt"].sum
            score = score if isinstance(score, float) else score.item()
            return round(score, 4)

        if sum_logs("_vqa_cnt") > 0:
            metrics.log_scalar("_vqa_score_sum", sum_logs("_vqa_score_sum"))
            metrics.log_scalar("_vqa_cnt", sum_logs("_vqa_cnt"))
            metrics.log_derived("vqa_score", compute_score)