File size: 7,680 Bytes
9bf4bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os
import os.path as osp
import shutil
from typing import List, Optional, Union

from mmocr.registry import (CFG_GENERATORS, DATA_DUMPERS, DATA_GATHERERS,
                            DATA_OBTAINERS, DATA_PACKERS, DATA_PARSERS)
from mmocr.utils.typing_utils import ConfigType, OptConfigType


class DatasetPreparer:
    """Base class of dataset preparer.

    Dataset preparer is used to prepare dataset for MMOCR. It mainly consists
    of three steps:
      1. For each split:
        - Obtain the dataset
            - Download
            - Extract
            - Move/Rename
        - Gather the dataset
        - Parse the dataset
        - Pack the dataset to MMOCR format
        - Dump the dataset
      2. Delete useless files
      3. Generate the base config for this dataset

    After all these steps, the original datasets have been prepared for
    usage in MMOCR. Check out the dataset format used in MMOCR here:
    https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html

    The configs handed to the preparer are never modified: defaults such as
    ``split`` or ``data_root`` are filled into shallow copies right before
    the corresponding component is built.

    Args:
        data_root (str): Root directory of data.
        dataset_name (str): Dataset name. Defaults to ''.
        task (str): Task type. Options are 'textdet', 'textrecog',
            'textspotter', and 'kie'. Defaults to 'textdet'.
        nproc (int): Number of parallel processes. Defaults to 4.
        train_preparer (OptConfigType): cfg for train data prepare. It contains
            the following keys:
            - obtainer: cfg for data obtainer.
            - gatherer: cfg for data gatherer.
            - parser: cfg for data parser.
            - packer: cfg for data packer.
            - dumper: cfg for data dumper.
            Defaults to None.
        test_preparer (OptConfigType): cfg for test data prepare. Defaults to
            None.
        val_preparer (OptConfigType): cfg for val data prepare. Defaults to
            None.
        config_generator (OptConfigType): cfg for config generator. Defaults to
            None.
        delete (list[str], optional): List of files to be deleted.
            Defaults to None.
    """

    # Split names accepted by :meth:`run`, in their default processing order.
    _SPLITS = ('train', 'test', 'val')

    def __init__(self,
                 data_root: str,
                 dataset_name: str = '',
                 task: str = 'textdet',
                 nproc: int = 4,
                 train_preparer: OptConfigType = None,
                 test_preparer: OptConfigType = None,
                 val_preparer: OptConfigType = None,
                 config_generator: OptConfigType = None,
                 delete: Optional[List[str]] = None) -> None:
        self.data_root = data_root
        self.nproc = nproc
        self.task = task
        self.dataset_name = dataset_name
        self.train_preparer = train_preparer
        self.test_preparer = test_preparer
        self.val_preparer = val_preparer
        self.config_generator = config_generator
        self.delete = delete

    def run(self,
            splits: Optional[Union[str, List[str]]] = None) -> None:
        """Prepare the dataset.

        Runs :meth:`loop` for every requested split, then :meth:`clean` and
        :meth:`generate_config`.

        Args:
            splits (str or list[str], optional): Split(s) to prepare. Each
                name must be one of 'train', 'test' and 'val'. Defaults to
                None, which prepares all three in that order.

        Raises:
            AssertionError: If any split name is not supported.
        """
        if splits is None:
            splits = list(self._SPLITS)
        elif isinstance(splits, str):
            splits = [splits]
        # Raised explicitly (rather than with ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept for
        # backward compatibility.
        if not set(splits).issubset(self._SPLITS):
            raise AssertionError('Invalid split name')
        for split in splits:
            self.loop(split, getattr(self, f'{split}_preparer'))
        self.clean()
        self.generate_config()

    @classmethod
    def from_file(cls, cfg: ConfigType) -> 'DatasetPreparer':
        """Create a DatasetPreparer from a config.

        Args:
            cfg (ConfigType): A config used for building the preparer. Keys
                of ``cfg`` can see :meth:`__init__`. ``data_root`` is
                required; every other key falls back to the default of
                :meth:`__init__`.

        Returns:
            DatasetPreparer: A DatasetPreparer built from ``cfg``.
        """
        # Work on a private copy so the preparer never aliases the caller's
        # config object.
        cfg = copy.deepcopy(cfg)
        return cls(
            data_root=cfg['data_root'],
            dataset_name=cfg.get('dataset_name', ''),
            task=cfg.get('task', 'textdet'),
            nproc=cfg.get('nproc', 4),
            train_preparer=cfg.get('train_preparer', None),
            test_preparer=cfg.get('test_preparer', None),
            val_preparer=cfg.get('val_preparer', None),
            delete=cfg.get('delete', None),
            config_generator=cfg.get('config_generator', None))

    @staticmethod
    def _with_defaults(cfg: ConfigType, **defaults) -> ConfigType:
        """Return a shallow copy of ``cfg`` with missing keys filled in.

        The copy keeps the stored config untouched, so defaults computed for
        one split cannot leak into another split that shares the same config
        object. Defaults are passed to ``setdefault`` positionally because
        the builtin ``dict.setdefault`` rejects keyword arguments.
        """
        filled = copy.copy(cfg)
        for key, value in defaults.items():
            filled.setdefault(key, value)
        return filled

    def loop(self, split: str, cfg: ConfigType) -> None:
        """Loop over the dataset.

        Args:
            split (str): The split of the dataset.
            cfg (ConfigType): A config used for building obtainer, gatherer,
                parser, packer and dumper. Nothing is done if it is None.

        Raises:
            ValueError: If only some of gatherer, parser, packer and dumper
                are configured.
        """
        if cfg is None:
            return

        # build obtainer and run
        obtainer = cfg.get('obtainer', None)
        if obtainer:
            print(f'Obtaining {split} Dataset...')
            obtainer = self._with_defaults(
                obtainer, task=self.task, data_root=self.data_root)
            DATA_OBTAINERS.build(obtainer)()

        # The four processing stages only make sense together.
        gatherer = cfg.get('gatherer', None)
        parser = cfg.get('parser', None)
        packer = cfg.get('packer', None)
        dumper = cfg.get('dumper', None)
        missing = [
            item is None for item in (gatherer, parser, packer, dumper)
        ]
        if all(missing):  # no data process
            return
        if any(missing):
            raise ValueError('gatherer, parser, packer and dumper should be '
                             'either all None or not None')

        # build gatherer
        print(f'Gathering {split} Dataset...')
        gatherer = self._with_defaults(
            gatherer,
            split=split,
            data_root=self.data_root,
            ann_dir='annotations',
            img_dir=osp.join(f'{self.task}_imgs', split))
        img_paths, ann_paths = DATA_GATHERERS.build(gatherer)()

        # build parser
        print(f'Parsing {split} Images and Annotations...')
        parser = self._with_defaults(parser, split=split, nproc=self.nproc)
        # Convert dataset annotations to MMOCR format
        samples = DATA_PARSERS.build(parser)(img_paths, ann_paths)

        # build packer
        print(f'Packing {split} Annotations...')
        packer = self._with_defaults(
            packer, split=split, nproc=self.nproc, data_root=self.data_root)
        samples = DATA_PACKERS.build(packer)(samples)

        # build dumper
        print(f'Dumping {split} Annotations...')
        # Dump annotation files
        dumper = self._with_defaults(
            dumper, task=self.task, split=split, data_root=self.data_root)
        DATA_DUMPERS.build(dumper)(samples)

    def generate_config(self) -> None:
        """Build and run the config generator.

        ``dataset_name`` and ``data_root`` default to the values of this
        preparer. Nothing is done if no config generator is configured.
        """
        if self.config_generator is None:
            return
        generator_cfg = self._with_defaults(
            self.config_generator,
            dataset_name=self.dataset_name,
            data_root=self.data_root)
        config_generator = CFG_GENERATORS.build(generator_cfg)
        print('Generating base configs...')
        config_generator()

    def clean(self) -> None:
        """Delete the entries of ``self.delete`` found under ``data_root``.

        Real directories are removed recursively; regular files and symbolic
        links are unlinked. Entries that do not exist are skipped.
        """
        if self.delete is None:
            return
        for name in self.delete:
            target = osp.join(self.data_root, name)
            # ``lexists`` also reports dangling symlinks, which ``exists``
            # would silently leave behind.
            if not osp.lexists(target):
                continue
            # ``shutil.rmtree`` refuses to operate on a symlink, so a link
            # (even one pointing at a directory) is unlinked instead.
            if osp.isdir(target) and not osp.islink(target):
                shutil.rmtree(target)
            else:
                os.remove(target)