File size: 5,587 Bytes
e0c7c25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
'''
A sampler is just a list of integers listing the indexes of the
inputs in a data set to sample.  For reproducibility, the
FixedRandomSubsetSampler uses a seeded prng to produce the same
sequence always.  FixedSubsetSampler is just a wrapper for an
explicit list of integers.

coordinate_sample solves another sampling problem: when testing
convolutional outputs, we can reduce data explosion by sampling
random points of the feature map rather than the entire feature map.
coordinate_sample does this in a deterministic way that is also
resolution-independent.
'''

import numpy
import random
from torch.utils.data.sampler import Sampler

class FixedSubsetSampler(Sampler):
    """Sampler backed by an explicit, fixed sequence of data set indices.

    Iterating, taking len() or indexing the sampler simply delegates to
    the stored sequence.  A smaller sampler can be derived with subset(),
    which is given positions within this sampler rather than data set
    indices.
    """
    def __init__(self, samples):
        # The sequence is stored as given (no copy is made).
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __iter__(self):
        return iter(self.samples)

    def __getitem__(self, key):
        return self.samples[key]

    def dereference(self, indices):
        '''
        Map positions within this sampler (output sample indices) to the
        data set indices stored at those positions (input sample indices).
        Returns a new list, in the order the positions were given.
        '''
        stored = self.samples
        return [stored[position] for position in indices]

    def subset(self, new_subset):
        '''
        Build a new FixedSubsetSampler holding only the entries found at
        the given positions of this sampler.
        '''
        chosen = self.dereference(new_subset)
        return FixedSubsetSampler(chosen)


class FixedRandomSubsetSampler(FixedSubsetSampler):
    """Deterministic pseudo-random ordering of a data set's indices.

    The indices 0 .. len(data_source)-1 are shuffled with a private
    random.Random(seed) generator, and the slice [start:end] of that
    shuffled order becomes the sample list.

    Arguments:
        data_source: the data set; it must support len(), and indexing
            if class_subset is used.
        start (optional): slice start into the shuffled order.
        end (optional): slice end into the shuffled order.
        seed (optional): seed for the shuffling generator.
    """
    def __init__(self, data_source, start=None, end=None, seed=1):
        self.data_source = data_source
        order = list(range(len(data_source)))
        # A dedicated generator keeps the global random state untouched.
        random.Random(seed).shuffle(order)
        super(FixedRandomSubsetSampler, self).__init__(order[start:end])

    def class_subset(self, class_filter):
        '''
        Returns the subset of this sampler whose data items satisfy
        class_filter.  An int is compared for equality with item[1] of
        each data item; anything else is called with the data item and
        its result is used as the keep/drop decision.
        '''
        if isinstance(class_filter, int):
            def rule(item):
                return item[1] == class_filter
        else:
            rule = class_filter
        # Collect positions within this sampler; subset() translates
        # them back to data set indices.
        kept = []
        for position, index in enumerate(self.samples):
            if rule(self.data_source[index]):
                kept.append(position)
        return self.subset(kept)

def coordinate_sample(shape, sample_size, seeds, grid=13, seed=1, flat=False):
    '''
    Returns one set of sample_size pseudo-random points within the given
    shape for every entry of seeds.

    Each point is chosen by picking one cell of a lattice with 'grid'
    cells along every dimension, adding a uniform offset inside that
    cell, and scaling the result by shape.  Because points are chosen in
    the range [0...1) before scaling, the same relative locations are
    selected regardless of resolution.

    Arguments:
        shape: sequence of dimension sizes of the feature map; any
            number of dimensions is accepted.
        sample_size: number of points per seed; must be <= grid.
        seeds: iterable of seeds; each one seeds its own
            numpy.random.RandomState, so every row of the result is
            reproducible independently of the others.
        grid (optional): number of grid cells along every dimension.
        seed (optional): unused; accepted only for backward
            compatibility (the per-row seeds come from 'seeds').
        flat (optional): if True, return flattened indices into an
            array of the given shape instead of coordinates.

    Returns:
        An integer numpy array of shape (len(seeds), sample_size) when
        flat is True, otherwise (len(seeds), len(shape), sample_size).

    Raises:
        AssertionError: if sample_size > grid.
    '''
    # Materialize so that any iterable (not only sized ones) is accepted.
    seeds = list(seeds)
    ndim = len(shape)
    assert sample_size <= grid
    if flat:
        sampind = numpy.zeros((len(seeds), sample_size), dtype=int)
    else:
        # One coordinate row per dimension of shape, not a hard-coded 2.
        sampind = numpy.zeros((len(seeds), ndim, sample_size), dtype=int)
    square_count = grid ** ndim
    extent = numpy.array(shape)[:, None]
    for j, row_seed in enumerate(seeds):
        rng = numpy.random.RandomState(row_seed)
        # Draw square_count grid squares and keep the first sample_size.
        # Truncating a full-length draw means the squares chosen for a
        # smaller sample_size are a prefix of those for a larger one.
        # The draw uses RandomState.choice's default (with replacement),
        # so the same square can be picked more than once.
        square = numpy.stack(numpy.unravel_index(
            rng.choice(square_count, square_count)[:sample_size],
            (grid,) * ndim))
        # Then add a random offset to each coordinate and put it in the
        # range [0...1).  Notice this selects the same locations
        # regardless of resolution.
        uniform = (square + rng.uniform(size=square.shape)) / grid
        # TODO: support affine scaling so that we can align receptive field
        # centers exactly when sampling neurons in different layers.
        coords = (uniform * extent).astype(int)
        if flat:
            sampind[j] = numpy.ravel_multi_index(tuple(coords), dims=shape)
        else:
            sampind[j] = coords
    return sampind

if __name__ == '__main__':
    # Self-test, run only when this file is executed as a script.
    from numpy.testing import assert_almost_equal
    # Test that coordinate_sample is deterministic, in-range, and scalable.
    assert_almost_equal(coordinate_sample((26, 26), 10, range(101, 102)),
            [[[14,  0, 12, 11,  8, 13, 11, 20,  7, 20],
              [ 9, 22,  7, 11, 23, 18, 21, 15,  2,  5]]])
    # Halving the resolution halves every coordinate.
    assert_almost_equal(coordinate_sample((13, 13), 10, range(101, 102)),
            [[[ 7,  0,  6,  5,  4,  6,  5, 10,  3, 20 // 2],
              [ 4, 11,  3,  5, 11,  9, 10,  7,  1,  5 // 2]]])
    # Flat indices are row * width + column of the coordinates above.
    assert_almost_equal(coordinate_sample((13, 13), 10, range(100, 102),
        flat=True),
            [[  8,  24,  67, 103,  87,  79, 138,  94,  98,  53],
             [ 95,  11,  81,  70,  63,  87,  75, 137,  40, 2+10*13]])
    assert_almost_equal(coordinate_sample((13, 13), 10, range(101, 103),
        flat=True),
            [[ 95,  11,  81,  70,  63,  87,  75, 137,  40, 132],
             [  0,  78, 114, 111,  66,  45,  72,  73,  79, 135]])
    assert_almost_equal(coordinate_sample((26, 26), 10, range(101, 102),
        flat=True),
            [[373,  22, 319, 297, 231, 356, 307, 535, 184, 5+20*26]])
    # Test FixedRandomSubsetSampler
    # NOTE(review): the expected orderings below depend on the exact
    # permutation produced by random.Random(1).shuffle, which may differ
    # between Python versions -- confirm on the target interpreter.
    fss = FixedRandomSubsetSampler(range(10))
    assert len(fss) == 10
    assert_almost_equal(list(fss), [8, 0, 3, 4, 5, 2, 9, 6, 7, 1])
    fss = FixedRandomSubsetSampler(range(10), 3, 8)
    assert len(fss) == 5
    assert_almost_equal(list(fss), [4, 5, 2, 9, 6])
    # class_filter is an argument of the class_subset method; the
    # constructor does not accept it.
    fss = FixedRandomSubsetSampler(
            [(i, i % 3) for i in range(10)]).class_subset(class_filter=1)
    assert len(fss) == 3
    assert_almost_equal(list(fss), [4, 7, 1])