medical
File size: 3,625 Bytes
5ceacbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os.path as op
from zipfile import ZipFile, BadZipFile
import torch.utils.data as data
from PIL import Image
from io import BytesIO
import multiprocessing

_VALID_IMAGE_TYPES = ['.jpg', '.jpeg', '.tiff', '.bmp', '.png']


class ZipData(data.Dataset):
    _IGNORE_ATTRS = {'_zip_file'}

    def __init__(self, path, map_file,
                 transform=None, target_transform=None,
                 extensions=None):
        self._path = path
        if not extensions:
            extensions = _VALID_IMAGE_TYPES
        self._zip_file = ZipFile(path)
        self.zip_dict = {}
        self.samples = []
        self.transform = transform
        self.target_transform = target_transform
        self.class_to_idx = {}
        with open(map_file, 'r') as f:
            for line in iter(f.readline, ""):
                line = line.strip()
                if not line:
                    continue
                cls_idx = [l for l in line.split('\t') if l]
                if not cls_idx:
                    continue
                if (len(cls_idx) < 2):
                    cls_idx = [l for l in line.split(' ') if l]
                    if not cls_idx:
                        continue
                assert len(cls_idx) >= 2, "invalid line: {}".format(line)
                idx = int(cls_idx[1])
                cls = cls_idx[0]
                del cls_idx
                at_idx = cls.find('@')
                assert at_idx >= 0, "invalid class: {}".format(cls)
                cls = cls[at_idx + 1:]
                if cls.startswith('/'):
                    # Python ZipFile expects no root
                    cls = cls[1:]
                assert cls, "invalid class in line {}".format(line)
                prev_idx = self.class_to_idx.get(cls)
                assert prev_idx is None or prev_idx == idx, "class: {} idx: {} previously had idx: {}".format(
                    cls, idx, prev_idx
                )
                self.class_to_idx[cls] = idx

        for fst in self._zip_file.infolist():
            fname = fst.filename
            target = self.class_to_idx.get(fname)
            if target is None:
                continue
            if fname.endswith('/') or fname.startswith('.') or fst.file_size == 0:
                continue
            ext = op.splitext(fname)[1].lower()
            if ext in extensions:
                self.samples.append((fname, target))
        assert len(self), "No images found in: {} with map: {}".format(self._path, map_file)

    def __repr__(self):
        return 'ZipData({}, size={})'.format(self._path, len(self))

    def __getstate__(self):
        return {
            key: val if key not in self._IGNORE_ATTRS else None
            for key, val in self.__dict__.iteritems()
        }

    def __getitem__(self, index):
        proc = multiprocessing.current_process()
        pid = proc.pid # get pid of this process.
        if pid not in self.zip_dict:
            self.zip_dict[pid] = ZipFile(self._path)
        zip_file = self.zip_dict[pid]

        if index >= len(self) or index < 0:
            raise KeyError("{} is invalid".format(index))
        path, target = self.samples[index]
        try:
            sample = Image.open(BytesIO(zip_file.read(path))).convert('RGB')
        except BadZipFile:
            print("bad zip file")
            return None, None
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target

    def __len__(self):
        return len(self.samples)