Mountchicken's picture
Upload 704 files
9bf4bd7
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os
import os.path as osp
import shutil
from typing import List, Optional, Union
from mmocr.registry import (CFG_GENERATORS, DATA_DUMPERS, DATA_GATHERERS,
DATA_OBTAINERS, DATA_PACKERS, DATA_PARSERS)
from mmocr.utils.typing_utils import ConfigType, OptConfigType
class DatasetPreparer:
    """Base class of dataset preparer.

    Dataset preparer is used to prepare dataset for MMOCR. It mainly consists
    of three steps:

    1. For each split:

       - Obtain the dataset

         - Download
         - Extract
         - Move/Rename

       - Gather the dataset
       - Parse the dataset
       - Pack the dataset to MMOCR format
       - Dump the dataset

    2. Delete useless files
    3. Generate the base config for this dataset

    After all these steps, the original datasets have been prepared for
    usage in MMOCR. Check out the dataset format used in MMOCR here:
    https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html

    Args:
        data_root (str): Root directory of data.
        dataset_name (str): Dataset name. Defaults to ''.
        task (str): Task type. Options are 'textdet', 'textrecog',
            'textspotter', and 'kie'. Defaults to 'textdet'.
        nproc (int): Number of parallel processes. Defaults to 4.
        train_preparer (OptConfigType): cfg for train data prepare. It
            contains the following keys:

            - obtainer: cfg for data obtainer.
            - gatherer: cfg for data gatherer.
            - parser: cfg for data parser.
            - packer: cfg for data packer.
            - dumper: cfg for data dumper.

            Defaults to None.
        test_preparer (OptConfigType): cfg for test data prepare. Defaults to
            None.
        val_preparer (OptConfigType): cfg for val data prepare. Defaults to
            None.
        config_generator (OptConfigType): cfg for config generator. Defaults
            to None.
        delete (list[str], optional): List of files or directories, given
            relative to ``data_root``, to be deleted after all splits have
            been processed. Defaults to None.
    """

    def __init__(self,
                 data_root: str,
                 dataset_name: str = '',
                 task: str = 'textdet',
                 nproc: int = 4,
                 train_preparer: OptConfigType = None,
                 test_preparer: OptConfigType = None,
                 val_preparer: OptConfigType = None,
                 config_generator: OptConfigType = None,
                 delete: Optional[List[str]] = None) -> None:
        self.data_root = data_root
        self.nproc = nproc
        self.task = task
        self.dataset_name = dataset_name
        self.train_preparer = train_preparer
        self.test_preparer = test_preparer
        self.val_preparer = val_preparer
        self.config_generator = config_generator
        self.delete = delete

    # NOTE: the list default is never mutated (a ``str`` argument is rebound
    # to a fresh list), so sharing it between calls is harmless.
    def run(self, splits: Union[str, List] = ['train', 'test', 'val']) -> None:
        """Prepare the dataset.

        Every requested split is processed with :meth:`loop`, then the
        entries listed in ``delete`` are removed and the base config is
        generated.

        Args:
            splits (str or list[str]): The split(s) to prepare. Each entry
                must be one of 'train', 'test' and 'val'. Defaults to all
                three.
        """
        if isinstance(splits, str):
            splits = [splits]
        assert set(splits).issubset(set(['train', 'test',
                                         'val'])), 'Invalid split name'
        for split in splits:
            self.loop(split, getattr(self, f'{split}_preparer'))
        self.clean()
        self.generate_config()

    @classmethod
    def from_file(cls, cfg: ConfigType) -> 'DatasetPreparer':
        """Create a DatasetPreparer from a config.

        Args:
            cfg (ConfigType): A config used for building the preparer. Its
                keys mirror the arguments of :meth:`__init__`; only
                ``data_root`` is required.

        Returns:
            DatasetPreparer: A DatasetPreparer built from ``cfg``.
        """
        # Work on a copy: the sub-configs are filled in with defaults later
        # on and the caller's config must not be affected by that.
        cfg = copy.deepcopy(cfg)
        data_preparer = cls(
            data_root=cfg['data_root'],
            dataset_name=cfg.get('dataset_name', ''),
            task=cfg.get('task', 'textdet'),
            nproc=cfg.get('nproc', 4),
            train_preparer=cfg.get('train_preparer', None),
            test_preparer=cfg.get('test_preparer', None),
            val_preparer=cfg.get('val_preparer', None),
            delete=cfg.get('delete', None),
            config_generator=cfg.get('config_generator', None))
        return data_preparer

    def loop(self, split: str, cfg: OptConfigType) -> None:
        """Run the obtain/gather/parse/pack/dump pipeline for one split.

        Defaults such as ``task``, ``split``, ``data_root`` and ``nproc``
        are filled into the sub-configs in place before the components are
        built. They are passed to ``setdefault`` positionally so that plain
        ``dict`` configs work as well (``dict.setdefault`` accepts no keyword
        arguments).

        Args:
            split (str): The split of the dataset.
            cfg (OptConfigType): A config used for building obtainer,
                gatherer, parser, packer and dumper. Nothing is done if it
                is None.

        Raises:
            ValueError: If only some of gatherer, parser, packer and dumper
                are configured.
        """
        if cfg is None:
            return

        # build obtainer and run
        obtainer = cfg.get('obtainer', None)
        if obtainer:
            print(f'Obtaining {split} Dataset...')
            obtainer.setdefault('task', self.task)
            obtainer.setdefault('data_root', self.data_root)
            obtainer = DATA_OBTAINERS.build(obtainer)
            obtainer()

        # The four processing stages feed into each other, so they must be
        # configured together or not at all.
        gatherer = cfg.get('gatherer', None)
        parser = cfg.get('parser', None)
        packer = cfg.get('packer', None)
        dumper = cfg.get('dumper', None)
        related = [gatherer, parser, packer, dumper]
        if all(item is None for item in related):  # no data process
            return
        if any(item is None for item in related):
            raise ValueError('gatherer, parser, packer and dumper should be '
                             'either all None or not None')

        # build gatherer
        print(f'Gathering {split} Dataset...')
        gatherer.setdefault('split', split)
        gatherer.setdefault('data_root', self.data_root)
        gatherer.setdefault('ann_dir', 'annotations')
        gatherer.setdefault('img_dir', osp.join(f'{self.task}_imgs', split))
        gatherer = DATA_GATHERERS.build(gatherer)
        img_paths, ann_paths = gatherer()

        # build parser
        print(f'Parsing {split} Images and Annotations...')
        parser.setdefault('split', split)
        parser.setdefault('nproc', self.nproc)
        parser = DATA_PARSERS.build(parser)
        # Convert dataset annotations to MMOCR format
        samples = parser(img_paths, ann_paths)

        # build packer
        print(f'Packing {split} Annotations...')
        packer.setdefault('split', split)
        packer.setdefault('nproc', self.nproc)
        packer.setdefault('data_root', self.data_root)
        packer = DATA_PACKERS.build(packer)
        samples = packer(samples)

        # build dumper
        print(f'Dumping {split} Annotations...')
        # Dump annotation files
        dumper.setdefault('task', self.task)
        dumper.setdefault('split', split)
        dumper.setdefault('data_root', self.data_root)
        dumper = DATA_DUMPERS.build(dumper)
        dumper(samples)

    def generate_config(self) -> None:
        """Build and run the config generator, if one is configured.

        ``dataset_name`` and ``data_root`` are filled into the generator
        config in place when it does not already define them.
        """
        if self.config_generator is None:
            return
        self.config_generator.setdefault('dataset_name', self.dataset_name)
        self.config_generator.setdefault('data_root', self.data_root)
        config_generator = CFG_GENERATORS.build(self.config_generator)
        print('Generating base configs...')
        config_generator()

    def clean(self) -> None:
        """Delete the entries listed in ``delete`` from ``data_root``.

        Entries that do not exist are skipped. Symbolic links are removed
        themselves and their targets are left untouched.
        """
        if self.delete is None:
            return
        for name in self.delete:
            target = osp.join(self.data_root, name)
            if osp.islink(target):
                # Checked first: ``shutil.rmtree`` refuses to operate on a
                # symlink to a directory, and ``osp.exists`` reports False
                # for a dangling link.
                os.remove(target)
            elif osp.isdir(target):
                shutil.rmtree(target)
            elif osp.exists(target):
                os.remove(target)