Spaces:

OFA-Sys
/

OFA-OCR

Runtime error

App Files Files Community

OFA-OCR / fairseq /examples /truncated_bptt /truncated_bptt_lm_task.py

JustinLin610

first commit

ee21b96 over 1 year ago

raw

history blame

No virus

9.9 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import logging
	import os
	from dataclasses import dataclass, field
	from typing import List, Optional, Tuple

	import torch
	from fairseq import utils
	from fairseq.data import (
	Dictionary,
	TokenBlockDataset,
	data_utils,
	iterators,
	)
	from fairseq.dataclass import FairseqDataclass
	from fairseq.distributed import utils as dist_utils
	from fairseq.tasks import FairseqTask, register_task
	from omegaconf import II


	logger = logging.getLogger(__name__)


	@dataclass
	class TruncatedBPTTLMConfig(FairseqDataclass):
	data: str = field(default="???", metadata={"help": "path to data directory"})
	tokens_per_sample: int = field(
	default=1024,
	metadata={"help": "max number of tokens per sequence"},
	)
	batch_size: int = II("dataset.batch_size")
	# Some models use max_target_positions to know how many positional
	# embeddings to learn. We use II(...) to make it default to
	# tokens_per_sample, but in principle there could be more positional
	# embeddings than tokens in a single batch. This may also be irrelevant for
	# custom model implementations.
	max_target_positions: int = II("task.tokens_per_sample")
	# these will be populated automatically if not provided
	data_parallel_rank: Optional[int] = None
	data_parallel_size: Optional[int] = None


	@register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
	class TruncatedBPTTLMTask(FairseqTask):
	def __init__(self, cfg: TruncatedBPTTLMConfig):
	super().__init__(cfg)

	if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
	if torch.distributed.is_initialized():
	cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
	cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
	else:
	cfg.data_parallel_rank = 0
	cfg.data_parallel_size = 1

	# load the dictionary
	paths = utils.split_paths(cfg.data)
	assert len(paths) > 0
	self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
	logger.info("dictionary: {} types".format(len(self.dictionary)))

	def load_dataset(self, split, epoch=1, combine=False, **kwargs):
	"""Load a given dataset split (e.g., train, valid, test)"""

	# support sharded datasets
	paths = utils.split_paths(self.cfg.data)
	assert len(paths) > 0
	data_path = paths[(epoch - 1) % len(paths)]
	split_path = os.path.join(data_path, split)

	# each element of data will be a tensorized line from the original
	# text dataset, similar to ``open(split_path).readlines()``
	data = data_utils.load_indexed_dataset(
	split_path, self.dictionary, combine=combine
	)
	if data is None:
	raise FileNotFoundError(
	"Dataset not found: {} ({})".format(split, split_path)
	)

	# this is similar to ``data.view(-1).split(tokens_per_sample)``
	data = TokenBlockDataset(
	data,
	data.sizes,
	block_size=self.cfg.tokens_per_sample,
	pad=None, # unused
	eos=None, # unused
	break_mode="none",
	)

	self.datasets[split] = TruncatedBPTTDataset(
	data=data,
	bsz_per_shard=self.cfg.batch_size,
	shard_id=self.cfg.data_parallel_rank,
	num_shards=self.cfg.data_parallel_size,
	)

	def dataset(self, split):
	return self.datasets[split]

	def get_batch_iterator(
	self, dataset, num_workers=0, epoch=1, data_buffer_size=0, **kwargs
	):
	return iterators.EpochBatchIterator(
	dataset=dataset,
	collate_fn=self._collate_fn,
	num_workers=num_workers,
	epoch=epoch,
	buffer_size=data_buffer_size,
	# we don't use the batching functionality from EpochBatchIterator;
	# instead every item in dataset is a whole batch
	batch_sampler=[[i] for i in range(len(dataset))],
	disable_shuffling=True,
	)

	def _collate_fn(self, items: List[List[torch.Tensor]]):
	# we don't use fairseq's batching functionality, so we expect a single
	# Tensor of type List[torch.Tensor]
	assert len(items) == 1

	# item will have shape B x T (the last batch may have length < T)
	id, item = items[0]
	item = data_utils.collate_tokens(item, pad_idx=self.source_dictionary.pad())
	B, T = item.size()

	# shift item one position over and append a padding token for the target
	target = torch.nn.functional.pad(
	item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
	)

	# fairseq expects batches to have the following structure
	return {
	"id": torch.tensor([id]*item.size(0)),
	"net_input": {
	"src_tokens": item,
	},
	"target": target,
	"nsentences": item.size(0),
	"ntokens": item.numel(),
	}

	def build_dataset_for_inference(
	self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
	) -> torch.utils.data.Dataset:
	eos = self.source_dictionary.eos()
	dataset = TokenBlockDataset(
	src_tokens,
	src_lengths,
	block_size=None, # ignored for "eos" break mode
	pad=self.source_dictionary.pad(),
	eos=eos,
	break_mode="eos",
	)

	class Dataset(torch.utils.data.Dataset):
	def __getitem__(self, i):
	item = dataset[i]
	if item[-1] == eos:
	# remove eos to support generating with a prefix
	item = item[:-1]
	return (i, [item])

	def __len__(self):
	return len(dataset)

	return Dataset()

	def inference_step(
	self, generator, models, sample, prefix_tokens=None, constraints=None
	):
	with torch.no_grad():
	if constraints is not None:
	raise NotImplementedError

	# SequenceGenerator doesn't use src_tokens directly, we need to
	# pass the prefix_tokens argument instead.
	if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
	prefix_tokens = sample["net_input"]["src_tokens"]

	# begin generation with the end-of-sentence token
	bos_token = self.source_dictionary.eos()

	return generator.generate(
	models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
	)

	def eval_lm_dataloader(
	self,
	dataset,
	max_tokens: Optional[int] = 36000,
	batch_size: Optional[int] = None,
	max_positions: Optional[int] = None,
	num_shards: int = 1,
	shard_id: int = 0,
	num_workers: int = 1,
	data_buffer_size: int = 10,
	context_window: int = 0,
	):
	if context_window > 0:
	raise NotImplementedError(
	"Transformer-XL doesn't need --context-window, try "
	"--model-overrides '{\"mem_len\":42}' instead "
	)
	return self.get_batch_iterator(
	dataset=dataset,
	max_tokens=max_tokens,
	max_sentences=batch_size,
	max_positions=max_positions,
	ignore_invalid_inputs=True,
	num_shards=num_shards,
	shard_id=shard_id,
	num_workers=num_workers,
	data_buffer_size=data_buffer_size,
	).next_epoch_itr(shuffle=False)

	@property
	def source_dictionary(self):
	return self.dictionary

	@property
	def target_dictionary(self):
	return self.dictionary


	class TruncatedBPTTDataset(torch.utils.data.Dataset):
	def __init__(
	self,
	data: List[torch.Tensor], # ordered list of items
	bsz_per_shard, # number of items processed per GPUs per forward
	shard_id, # current GPU ID
	num_shards, # number of GPUs
	):
	super().__init__()
	self.data = data

	def batchify(data, bsz):
	# Work out how cleanly we can divide the dataset into bsz parts.
	nbatch = data.size(0) // bsz
	# Trim off any extra elements that wouldn't cleanly fit (remainders).
	data = data.narrow(0, 0, nbatch * bsz)
	# Evenly divide the data across the bsz batches.
	data = data.view(bsz, -1).contiguous()
	return data

	# total number of sequences processed by all GPUs in each forward pass
	global_batch_size = bsz_per_shard * num_shards

	"""
	With a 16 item dataset, bsz_per_shard=2 and num_shards=3,
	indices might look like:

	indices = [[0, 1],
	[2, 3],
	[4, 5],
	[6, 7],
	[8, 9],
	[10, 11]]

	The size of the TruncatedBPTTDataset instance will be 2,
	and shard 1 will see items:

	[(0, [data[4], data[6]]),
	(1, [data[5], data[7]])]
	"""
	indices = batchify(torch.arange(len(data)), global_batch_size)
	assert indices.size(0) == global_batch_size

	self.my_indices = indices[
	shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard
	]
	assert self.my_indices.size(0) == bsz_per_shard

	def __len__(self):
	return self.my_indices.size(1)

	def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
	return (i, [self.data[idx] for idx in self.my_indices[:, i]])