Spaces:

OFA-Sys
/

OFA-OCR

Runtime error

File size: 9,900 Bytes

ee21b96

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import torch
from fairseq import utils
from fairseq.data import (
    Dictionary,
    TokenBlockDataset,
    data_utils,
    iterators,
)
from fairseq.dataclass import FairseqDataclass
from fairseq.distributed import utils as dist_utils
from fairseq.tasks import FairseqTask, register_task
from omegaconf import II


logger = logging.getLogger(__name__)


@dataclass
class TruncatedBPTTLMConfig(FairseqDataclass):
    data: str = field(default="???", metadata={"help": "path to data directory"})
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sequence"},
    )
    batch_size: int = II("dataset.batch_size")
    # Some models use *max_target_positions* to know how many positional
    # embeddings to learn. We use II(...) to make it default to
    # *tokens_per_sample*, but in principle there could be more positional
    # embeddings than tokens in a single batch. This may also be irrelevant for
    # custom model implementations.
    max_target_positions: int = II("task.tokens_per_sample")
    # these will be populated automatically if not provided
    data_parallel_rank: Optional[int] = None
    data_parallel_size: Optional[int] = None


@register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
class TruncatedBPTTLMTask(FairseqTask):
    def __init__(self, cfg: TruncatedBPTTLMConfig):
        super().__init__(cfg)

        if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
            if torch.distributed.is_initialized():
                cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
                cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
            else:
                cfg.data_parallel_rank = 0
                cfg.data_parallel_size = 1

        # load the dictionary
        paths = utils.split_paths(cfg.data)
        assert len(paths) > 0
        self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        logger.info("dictionary: {} types".format(len(self.dictionary)))

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)"""

        # support sharded datasets
        paths = utils.split_paths(self.cfg.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        # each element of *data* will be a tensorized line from the original
        # text dataset, similar to ``open(split_path).readlines()``
        data = data_utils.load_indexed_dataset(
            split_path, self.dictionary, combine=combine
        )
        if data is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        # this is similar to ``data.view(-1).split(tokens_per_sample)``
        data = TokenBlockDataset(
            data,
            data.sizes,
            block_size=self.cfg.tokens_per_sample,
            pad=None,  # unused
            eos=None,  # unused
            break_mode="none",
        )

        self.datasets[split] = TruncatedBPTTDataset(
            data=data,
            bsz_per_shard=self.cfg.batch_size,
            shard_id=self.cfg.data_parallel_rank,
            num_shards=self.cfg.data_parallel_size,
        )

    def dataset(self, split):
        return self.datasets[split]

    def get_batch_iterator(
        self, dataset, num_workers=0, epoch=1, data_buffer_size=0, **kwargs
    ):
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=self._collate_fn,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
            # we don't use the batching functionality from EpochBatchIterator;
            # instead every item in *dataset* is a whole batch
            batch_sampler=[[i] for i in range(len(dataset))],
            disable_shuffling=True,
        )

    def _collate_fn(self, items: List[List[torch.Tensor]]):
        # we don't use fairseq's batching functionality, so we expect a single
        # Tensor of type List[torch.Tensor]
        assert len(items) == 1

        # item will have shape B x T (the last batch may have length < T)
        id, item = items[0]
        item = data_utils.collate_tokens(item, pad_idx=self.source_dictionary.pad())
        B, T = item.size()

        # shift item one position over and append a padding token for the target
        target = torch.nn.functional.pad(
            item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
        )

        # fairseq expects batches to have the following structure
        return {
            "id": torch.tensor([id]*item.size(0)),
            "net_input": {
                "src_tokens": item,
            },
            "target": target,
            "nsentences": item.size(0),
            "ntokens": item.numel(),
        }

    def build_dataset_for_inference(
        self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
    ) -> torch.utils.data.Dataset:
        eos = self.source_dictionary.eos()
        dataset = TokenBlockDataset(
            src_tokens,
            src_lengths,
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=eos,
            break_mode="eos",
        )

        class Dataset(torch.utils.data.Dataset):
            def __getitem__(self, i):
                item = dataset[i]
                if item[-1] == eos:
                    # remove eos to support generating with a prefix
                    item = item[:-1]
                return (i, [item])

            def __len__(self):
                return len(dataset)

        return Dataset()

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        with torch.no_grad():
            if constraints is not None:
                raise NotImplementedError

            # SequenceGenerator doesn't use *src_tokens* directly, we need to
            # pass the *prefix_tokens* argument instead.
            if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
                prefix_tokens = sample["net_input"]["src_tokens"]

            # begin generation with the end-of-sentence token
            bos_token = self.source_dictionary.eos()

            return generator.generate(
                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
            )

    def eval_lm_dataloader(
        self,
        dataset,
        max_tokens: Optional[int] = 36000,
        batch_size: Optional[int] = None,
        max_positions: Optional[int] = None,
        num_shards: int = 1,
        shard_id: int = 0,
        num_workers: int = 1,
        data_buffer_size: int = 10,
        context_window: int = 0,
    ):
        if context_window > 0:
            raise NotImplementedError(
                "Transformer-XL doesn't need --context-window, try "
                "--model-overrides '{\"mem_len\":42}' instead "
            )
        return self.get_batch_iterator(
            dataset=dataset,
            max_tokens=max_tokens,
            max_sentences=batch_size,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            data_buffer_size=data_buffer_size,
        ).next_epoch_itr(shuffle=False)

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary


class TruncatedBPTTDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data: List[torch.Tensor],  # ordered list of items
        bsz_per_shard,  # number of items processed per GPUs per forward
        shard_id,  # current GPU ID
        num_shards,  # number of GPUs
    ):
        super().__init__()
        self.data = data

        def batchify(data, bsz):
            # Work out how cleanly we can divide the dataset into bsz parts.
            nbatch = data.size(0) // bsz
            # Trim off any extra elements that wouldn't cleanly fit (remainders).
            data = data.narrow(0, 0, nbatch * bsz)
            # Evenly divide the data across the bsz batches.
            data = data.view(bsz, -1).contiguous()
            return data

        # total number of sequences processed by all GPUs in each forward pass
        global_batch_size = bsz_per_shard * num_shards

        """
        With a 16 item dataset, bsz_per_shard=2 and num_shards=3,
        *indices* might look like:

            indices = [[0, 1],
                       [2, 3],
                       [4, 5],
                       [6, 7],
                       [8, 9],
                       [10, 11]]

        The size of the TruncatedBPTTDataset instance will be 2,
        and shard 1 will see items:

            [(0, [data[4], data[6]]),
             (1, [data[5], data[7]])]
        """
        indices = batchify(torch.arange(len(data)), global_batch_size)
        assert indices.size(0) == global_batch_size

        self.my_indices = indices[
            shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard
        ]
        assert self.my_indices.size(0) == bsz_per_shard

    def __len__(self):
        return self.my_indices.size(1)

    def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
        return (i, [self.data[idx] for idx in self.my_indices[:, i]])