# Mountchicken's picture
# Upload 704 files
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import re
from typing import List, Optional, Tuple
from mmocr.registry import DATA_GATHERERS
from mmocr.utils import list_files
from .base import BaseGatherer
# NOTE(review): DATA_GATHERERS is imported at the top of this file but is not
# used anywhere in the visible code; presumably this class is meant to be
# registered with it -- confirm against the upstream file.
class PairGatherer(BaseGatherer):
    r"""Gather the dataset files. Specifically for the paired annotations.

    That is to say, each image has a corresponding annotation file. For
    example,

            img_1.jpg <---> gt_img_1.txt
            img_2.jpg <---> gt_img_2.txt
            img_3.jpg <---> gt_img_3.txt

    Args:
        img_suffixes (List[str], optional): File suffixes used for searching.
            Passed unchanged to ``list_files``. Defaults to None.
        rule (Sequence, optional): The rule for pairing the files. The first
            element is the matching pattern for the file, and the second
            element is the replacement pattern, which should be a regular
            expression. For example, to map the image name img_1.jpg to the
            annotation name gt_img_1.txt, the rule is
                [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']
            Defaults to None; a rule is required by the time the gatherer is
            called, because both elements are indexed there.
        **kwargs: Remaining keyword arguments, forwarded to the base class.

    Note: PairGatherer assumes that each split annotation file is in the
    corresponding split directory. For example, all the train annotation
    files are in {ann_dir}/train.
    """

    def __init__(self,
                 img_suffixes: Optional[List[str]] = None,
                 rule: Optional[List[str]] = None,
                 **kwargs) -> None:
        # Forward the remaining options to the base class. ``self.ann_dir``
        # and ``self.split`` are read below and are presumably set there --
        # confirm against BaseGatherer.
        super().__init__(**kwargs)
        self.rule = rule
        self.img_suffixes = img_suffixes
        # ann_dir = {ann_root}/{ann_dir}/{split}
        self.ann_dir = osp.join(self.ann_dir, self.split)

    def __call__(self) -> Tuple[List[str], List[str]]:
        """Pair every matching image with the path of its annotation file.

        The annotation name is derived from the image base name with
        ``rule``; the resulting path is not checked for existence.

        Returns:
            tuple(list, list): The list of image paths and the list of
            annotation paths. Both lists have the same length and entry
            ``i`` of one corresponds to entry ``i`` of the other.
        """
        img_list = []
        ann_list = []
        for img_path in list_files(self.img_dir, self.img_suffixes):
            img_name = osp.basename(img_path)
            # No annotation name can be derived for a file that does not
            # match the pattern, so such files are left out of both lists.
            if not re.match(self.rule[0], img_name):
                continue
            ann_name = re.sub(self.rule[0], self.rule[1], img_name)
            img_list.append(img_path)
            ann_list.append(osp.join(self.ann_dir, ann_name))
        return img_list, ann_list