File size: 9,900 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import torch
from fairseq import utils
from fairseq.data import (
    Dictionary,
    TokenBlockDataset,
    data_utils,
    iterators,
)
from fairseq.dataclass import FairseqDataclass
from fairseq.distributed import utils as dist_utils
from fairseq.tasks import FairseqTask, register_task
from omegaconf import II


logger = logging.getLogger(__name__)


@dataclass
class TruncatedBPTTLMConfig(FairseqDataclass):
    """Options for the ``truncated_bptt_lm`` task."""

    # Location of the binarized data and of ``dict.txt``.
    data: str = field(
        default="???",
        metadata={"help": "path to data directory"},
    )
    # Length of each contiguous block of tokens.
    tokens_per_sample: int = field(
        default=1024, metadata={"help": "max number of tokens per sequence"}
    )
    # Interpolated from the global ``dataset.batch_size`` option.
    batch_size: int = II("dataset.batch_size")
    # Models may read *max_target_positions* to decide how many positional
    # embeddings to learn. It is interpolated from *tokens_per_sample* by
    # default, although a model could in principle have more positional
    # embeddings than there are tokens in one batch. Custom model
    # implementations may not need this value at all.
    max_target_positions: int = II("task.tokens_per_sample")
    # Filled in automatically by the task when left unset.
    data_parallel_rank: Optional[int] = None
    data_parallel_size: Optional[int] = None


@register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
class TruncatedBPTTLMTask(FairseqTask):
    """Language modeling task whose dataset items are complete batches.

    The training data is wrapped in a :class:`TruncatedBPTTDataset`, so the
    usual fairseq batching is bypassed: every dataset item already holds the
    sequences for one forward pass of the current data parallel shard.
    """

    def __init__(self, cfg: TruncatedBPTTLMConfig):
        """Resolve the data parallel layout and load the dictionary.

        Args:
            cfg: task configuration. ``data_parallel_rank`` and
                ``data_parallel_size`` are filled in (in place) when either
                of them is unset.
        """
        super().__init__(cfg)

        if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
            if torch.distributed.is_initialized():
                cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
                cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
            else:
                # not running distributed: behave as a single shard
                cfg.data_parallel_rank = 0
                cfg.data_parallel_size = 1

        # load the dictionary from the first data path
        paths = utils.split_paths(cfg.data)
        assert len(paths) > 0
        self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        logger.info("dictionary: {} types".format(len(self.dictionary)))

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test).

        Args:
            split: name of the split; also the file prefix inside the data
                directory.
            epoch: 1-based epoch number, used to rotate through the
                configured data paths.
            combine: forwarded to ``data_utils.load_indexed_dataset``.

        Raises:
            FileNotFoundError: if no dataset is found for *split*.
        """

        # support sharded datasets: pick one data path per epoch, round-robin
        paths = utils.split_paths(self.cfg.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        # each element of *data* will be a tensorized line from the original
        # text dataset, similar to ``open(split_path).readlines()``
        data = data_utils.load_indexed_dataset(
            split_path, self.dictionary, combine=combine
        )
        if data is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        # this is similar to ``data.view(-1).split(tokens_per_sample)``
        data = TokenBlockDataset(
            data,
            data.sizes,
            block_size=self.cfg.tokens_per_sample,
            pad=None,  # unused
            eos=None,  # unused
            break_mode="none",
        )

        self.datasets[split] = TruncatedBPTTDataset(
            data=data,
            bsz_per_shard=self.cfg.batch_size,
            shard_id=self.cfg.data_parallel_rank,
            num_shards=self.cfg.data_parallel_size,
        )

    def dataset(self, split):
        """Return the dataset previously loaded for *split*."""
        return self.datasets[split]

    def get_batch_iterator(
        self, dataset, num_workers=0, epoch=1, data_buffer_size=0, **kwargs
    ):
        """Build an epoch iterator that yields one dataset item per batch.

        Any additional keyword arguments are accepted and ignored.
        """
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=self._collate_fn,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
            # we don't use the batching functionality from EpochBatchIterator;
            # instead every item in *dataset* is a whole batch
            batch_sampler=[[i] for i in range(len(dataset))],
            disable_shuffling=True,
        )

    def _collate_fn(self, items: List[Tuple[int, List[torch.Tensor]]]):
        """Turn a single ``(index, sequences)`` dataset item into a batch.

        Args:
            items: a one-element list holding the dataset item; the batch
                sampler built in :meth:`get_batch_iterator` only ever yields
                one index at a time.

        Returns:
            A dict with the keys ``id``, ``net_input``, ``target``,
            ``nsentences`` and ``ntokens``.
        """
        # we don't use fairseq's batching functionality, so we expect exactly
        # one dataset item, which already contains a whole batch
        assert len(items) == 1
        batch_id, sequences = items[0]

        # pad the sequences into a 2-D tensor (the last batch may hold
        # shorter sequences than the others)
        tokens = data_utils.collate_tokens(
            sequences, pad_idx=self.source_dictionary.pad()
        )
        bsz = tokens.size(0)

        # shift tokens one position over and append a padding token for the
        # target
        target = torch.nn.functional.pad(
            tokens[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
        )

        # fairseq expects batches to have the following structure
        return {
            "id": torch.tensor([batch_id] * bsz),
            "net_input": {
                "src_tokens": tokens,
            },
            "target": target,
            "nsentences": bsz,
            "ntokens": tokens.numel(),
        }

    def build_dataset_for_inference(
        self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
    ) -> torch.utils.data.Dataset:
        """Wrap prompts so that each item has the ``(index, [tokens])`` layout.

        A trailing end-of-sentence token is stripped from every item so the
        item can be used as a generation prefix.
        """
        eos = self.source_dictionary.eos()
        dataset = TokenBlockDataset(
            src_tokens,
            src_lengths,
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=eos,
            break_mode="eos",
        )

        class Dataset(torch.utils.data.Dataset):
            def __getitem__(self, i):
                item = dataset[i]
                # guard against empty items before looking at the last token
                if len(item) > 0 and item[-1] == eos:
                    # remove eos to support generating with a prefix
                    item = item[:-1]
                return (i, [item])

            def __len__(self):
                return len(dataset)

        return Dataset()

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Generate from *sample*, using its source tokens as the prefix.

        Raises:
            NotImplementedError: if *constraints* is given.
        """
        with torch.no_grad():
            if constraints is not None:
                raise NotImplementedError

            # SequenceGenerator doesn't use *src_tokens* directly, we need to
            # pass the *prefix_tokens* argument instead.
            src_tokens = sample["net_input"]["src_tokens"]
            if prefix_tokens is None and src_tokens.nelement():
                prefix_tokens = src_tokens

            # begin generation with the end-of-sentence token
            bos_token = self.source_dictionary.eos()

            return generator.generate(
                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
            )

    def eval_lm_dataloader(
        self,
        dataset,
        max_tokens: Optional[int] = 36000,
        batch_size: Optional[int] = None,
        max_positions: Optional[int] = None,
        num_shards: int = 1,
        shard_id: int = 0,
        num_workers: int = 1,
        data_buffer_size: int = 10,
        context_window: int = 0,
    ):
        """Return an unshuffled iterator over *dataset* for LM evaluation.

        Raises:
            NotImplementedError: if *context_window* is positive.
        """
        if context_window > 0:
            raise NotImplementedError(
                "Transformer-XL doesn't need --context-window, try "
                "--model-overrides '{\"mem_len\":42}' instead "
            )
        # NOTE: get_batch_iterator only reads dataset, num_workers and
        # data_buffer_size here; the remaining options land in its **kwargs.
        return self.get_batch_iterator(
            dataset=dataset,
            max_tokens=max_tokens,
            max_sentences=batch_size,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            data_buffer_size=data_buffer_size,
        ).next_epoch_itr(shuffle=False)

    @property
    def source_dictionary(self):
        """Dictionary used for the input tokens."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Dictionary used for the target tokens (same as the source)."""
        return self.dictionary


class TruncatedBPTTDataset(torch.utils.data.Dataset):
    """Dataset whose items are whole per-shard batches of consecutive data.

    The item indices ``0 .. len(data) - 1`` are trimmed to a multiple of the
    global batch size (``bsz_per_shard * num_shards``) and laid out as a grid
    with one row per sequence slot, so each row is a contiguous run of items.
    Every shard owns ``bsz_per_shard`` consecutive rows, and item *i* of this
    dataset is column *i* of the rows owned by the current shard.

    With a 16 item dataset, bsz_per_shard=2 and num_shards=3, the grid of
    indices looks like::

        indices = [[0, 1],
                   [2, 3],
                   [4, 5],
                   [6, 7],
                   [8, 9],
                   [10, 11]]

    The size of the TruncatedBPTTDataset instance will be 2, and shard 1 will
    see items::

        [(0, [data[4], data[6]]),
         (1, [data[5], data[7]])]

    Args:
        data: ordered list of items
        bsz_per_shard: number of items processed per GPU per forward
        shard_id: current GPU ID
        num_shards: number of GPUs

    Raises:
        ValueError: if the sizes are not positive, *shard_id* is out of
            range, or *data* has fewer items than the global batch size.
    """

    def __init__(
        self,
        data: List[torch.Tensor],  # ordered list of items
        bsz_per_shard,  # number of items processed per GPUs per forward
        shard_id,  # current GPU ID
        num_shards,  # number of GPUs
    ):
        super().__init__()
        # explicit checks instead of asserts, which are stripped under -O
        if bsz_per_shard <= 0 or num_shards <= 0:
            raise ValueError(
                "bsz_per_shard and num_shards must be positive, got "
                "{} and {}".format(bsz_per_shard, num_shards)
            )
        if not 0 <= shard_id < num_shards:
            raise ValueError(
                "shard_id must be in [0, {}), got {}".format(num_shards, shard_id)
            )
        self.data = data

        # total number of sequences processed by all GPUs in each forward pass
        global_batch_size = bsz_per_shard * num_shards

        # number of items in each row; leftover items that do not fill a
        # complete column are dropped
        num_steps = len(data) // global_batch_size
        if num_steps == 0:
            raise ValueError(
                "dataset has {} items, fewer than the global batch size "
                "{}".format(len(data), global_batch_size)
            )

        # row r holds the consecutive indices r*num_steps .. (r+1)*num_steps-1
        indices = torch.arange(num_steps * global_batch_size).view(
            global_batch_size, num_steps
        )

        # rows owned by this shard
        first_row = shard_id * bsz_per_shard
        self.my_indices = indices[first_row : first_row + bsz_per_shard]

    def __len__(self):
        """Number of batches, i.e. the number of columns in the index grid."""
        return self.my_indices.size(1)

    def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
        """Return ``(i, items)`` where *items* is column *i* of this shard."""
        # convert to plain ints so that any indexable container works as *data*
        column = self.my_indices[:, i].tolist()
        return (i, [self.data[idx] for idx in column])