File size: 1,886 Bytes
b84549f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import enum
from ..data_aug import one_d_image_test_aug, one_d_image_train_aug
from ..ab_dataset import ABDataset
from ..dataset_split import train_val_split
from torchvision.datasets import EMNIST as RawEMNIST
import string
import numpy as np
from typing import Dict, List, Optional
from torchvision.transforms import Compose

from ..registery import dataset_register


# NOTE(review): torchvision appears to order the 'byclass' labels as digits,
# then uppercase, then lowercase letters, while string.ascii_letters is
# lowercase-first. The letter names registered below may therefore be
# case-swapped relative to the raw label indices -- confirm before relying on
# per-letter class names.
@dataset_register(
    name='EMNIST',
    classes=list(string.digits + string.ascii_letters),
    class_aliases=[],
    task_type='Image Classification',
    object_type='Digit and Letter',
    shift_type=None
)
class EMNIST(ABDataset):
    """EMNIST ('byclass' split) wrapped for the project's dataset registry."""

    def create_dataset(self, root_dir: str, split: str, transform: Optional[Compose],
                       classes: List[str], ignore_classes: List[str], idx_map: Optional[Dict[int, int]]):
        """Build the EMNIST dataset object for the requested split.

        Args:
            root_dir: Directory handed to torchvision's EMNIST (the data is
                downloaded there when missing, since ``download=True``).
            split: Split name. ``'test'`` loads the test partition; any other
                value loads the training partition and is then passed through
                ``train_val_split``.
            transform: Transform pipeline. When ``None``, the train
                augmentation is used for ``split == 'train'`` and the test
                augmentation otherwise.
            classes: Class names; the position of a name in this list is
                taken as its raw label index.
            ignore_classes: Class names whose samples are removed.
            idx_map: Optional mapping from raw label index to new label index,
                applied to every remaining label.

        Returns:
            The filtered / remapped dataset (the result of ``train_val_split``
            for non-test splits).

        Raises:
            ValueError: If a name in ``ignore_classes`` is not in ``classes``.
            KeyError: If a remaining label has no entry in ``idx_map``.
        """
        if transform is None:
            transform = one_d_image_train_aug() if split == 'train' else one_d_image_test_aug()
            # Only the default pipeline is stored on the instance in this method.
            self.transform = transform
        # Every non-test split is read from the training partition.
        dataset = RawEMNIST(root_dir, 'byclass', train=split != 'test', transform=transform, download=True)

        targets = np.asarray(dataset.targets)

        # Drop all samples of the ignored classes with a single boolean mask.
        ignored_indices = [classes.index(name) for name in ignore_classes]
        if ignored_indices:
            keep = ~np.isin(targets, ignored_indices)
            dataset.data = dataset.data[keep]
            targets = targets[keep]

        if idx_map is not None:
            # Remap through a lookup built from the *original* labels. Doing
            # `targets[targets == old] = new` once per mapping entry is wrong:
            # a label that has already been remapped can match a later `old`
            # value and get remapped a second time.
            present, inverse = np.unique(targets, return_inverse=True)
            remapped = np.array([idx_map[label] for label in present], dtype=targets.dtype)
            targets = remapped[inverse.reshape(targets.shape)]

        dataset.targets = targets

        if split != 'test':
            dataset = train_val_split(dataset, split)
        return dataset