# mfrashad's picture
# Init code
# 8f87579
'''
A sampler is just a list of integers listing the indexes of the
inputs in a data set to sample. For reproducibility, the
FixedRandomSubsetSampler uses a seeded prng to produce the same
sequence always. FixedSubsetSampler is just a wrapper for an
explicit list of integers.
coordinate_sample solves another sampling problem: when testing
convolutional outputs, we can reduce data explosion by sampling
random points of the feature map rather than the entire feature map.
coordinate_sample does this in a deterministic way that is also
resolution-independent.
'''
import numpy
import random
from torch.utils.data.sampler import Sampler
class FixedSubsetSampler(Sampler):
    """A sampler backed by an explicit, fixed sequence of data set indices.

    Positions within the stored sequence ("output indexes") can be mapped
    back to the stored data set indices, and a smaller sampler can be
    derived by naming a subset of those positions.
    """

    def __init__(self, samples):
        # The explicit sequence of data set indices; stored by reference.
        self.samples = samples

    def __len__(self):
        """Number of indices held by this sampler."""
        return len(self.samples)

    def __iter__(self):
        """Iterate over the stored data set indices in order."""
        return iter(self.samples)

    def __getitem__(self, key):
        """Return whatever the stored sequence yields for `key`."""
        return self.samples[key]

    def dereference(self, indices):
        '''
        Map positions within this sampler (small numbers indexing the
        sample) to the data set indices stored at those positions (the
        larger numbers indexing the original full set).  Returns a list.
        '''
        stored = self.samples
        return [stored[position] for position in indices]

    def subset(self, new_subset):
        """Return a new FixedSubsetSampler holding only the data set
        indices found at the given positions of this sampler."""
        chosen = self.dereference(new_subset)
        return FixedSubsetSampler(chosen)
class FixedRandomSubsetSampler(FixedSubsetSampler):
    """Deterministic pseudo-random selection of a data set's indices.

    Every index in range(len(data_source)) is shuffled with a PRNG
    seeded by `seed`, and the slice [start:end] of that shuffled order
    becomes the sample.
    Arguments:
        data_source: the data set; must support len(), and indexing if
            class_subset is used,
        start (optional): slice start into the shuffled order,
        end (optional): slice end into the shuffled order,
        seed (optional): seed for the shuffle, default 1
    """

    def __init__(self, data_source, start=None, end=None, seed=1):
        order = list(range(len(data_source)))
        # A private, seeded generator keeps the order reproducible and
        # leaves the global `random` state untouched.
        random.Random(seed).shuffle(order)
        self.data_source = data_source
        super(FixedRandomSubsetSampler, self).__init__(order[start:end])

    def class_subset(self, class_filter):
        '''
        Returns the subset of this sampler whose data items match a rule.
        If class_filter is an int, an item matches when item[1] equals
        it; otherwise class_filter is called on each item and its
        truthiness decides.  The result is built through subset().
        '''
        if isinstance(class_filter, int):
            def rule(item):
                return item[1] == class_filter
        else:
            rule = class_filter
        keep = []
        for position, index in enumerate(self.samples):
            if rule(self.data_source[index]):
                keep.append(position)
        return self.subset(keep)
def coordinate_sample(shape, sample_size, seeds, grid=13, seed=1, flat=False):
    '''
    Returns len(seeds) sets of sample_size sampled points within the
    shape given, one set per entry of seeds.

    Arguments:
        shape: sizes of the feature map axes (any number of axes).
        sample_size: points per set; must not exceed grid.
        seeds: iterable of PRNG seeds; each seed deterministically
            produces one set of points.
        grid: number of cells each axis is divided into.
        seed: unused; kept only so existing callers keep working.
        flat: if True, return raveled (row-major) indices into shape
            instead of per-axis coordinates.

    Returns an int array of shape (len(seeds), sample_size) when flat,
    otherwise (len(seeds), len(shape), sample_size).

    Points are chosen in the unit cube and then scaled by shape, so the
    same seed selects the same relative locations at any resolution.
    Raises AssertionError if sample_size > grid.
    '''
    assert sample_size <= grid, 'sample_size must not exceed grid'
    seeds = list(seeds)  # allow any iterable, not only sized sequences
    ndim = len(shape)
    if flat:
        sampind = numpy.zeros((len(seeds), sample_size), dtype=int)
    else:
        # One row per axis of shape (previously hard-coded to 2 rows).
        sampind = numpy.zeros((len(seeds), ndim, sample_size), dtype=int)
    # Loop invariants.
    square_count = grid ** ndim
    grid_shape = (grid,) * ndim
    extent = numpy.array(shape)[:, None]
    for j, sample_seed in enumerate(seeds):
        rng = numpy.random.RandomState(sample_seed)
        # Draw square_count grid-cell ids and keep the first sample_size.
        # Drawing the full length before truncating means the chosen
        # cells for a smaller sample_size are a prefix of those for a
        # larger one.  NOTE: choice() is left at its default (sampling
        # with replacement), so a cell can repeat; switching to
        # replace=False would change every previously generated sample.
        picked = rng.choice(square_count, square_count)[:sample_size]
        square = numpy.stack(numpy.unravel_index(picked, grid_shape))
        # Then add a random offset inside each cell and put the result
        # in the range [0...1).  Notice this selects the same locations
        # regardless of resolution.
        uniform = (square + rng.uniform(size=square.shape)) / grid
        # TODO: support affine scaling so that we can align receptive field
        # centers exactly when sampling neurons in different layers.
        coords = (uniform * extent).astype(int)
        if flat:
            sampind[j] = numpy.ravel_multi_index(tuple(coords), tuple(shape))
        else:
            sampind[j] = coords
    return sampind
if __name__ == '__main__':
    # Self-test, run only when this file is executed as a script.
    from numpy.testing import assert_almost_equal
    # Test that coordinate_sample is deterministic, in-range, and scalable.
    assert_almost_equal(coordinate_sample((26, 26), 10, range(101, 102)),
            [[[14,  0, 12, 11,  8, 13, 11, 20,  7, 20],
              [ 9, 22,  7, 11, 23, 18, 21, 15,  2,  5]]])
    # Halving the resolution halves (integer-divides) each coordinate.
    assert_almost_equal(coordinate_sample((13, 13), 10, range(101, 102)),
            [[[ 7,  0,  6,  5,  4,  6,  5, 10,  3, 20 // 2],
              [ 4, 11,  3,  5, 11,  9, 10,  7,  1,  5 // 2]]])
    # Flat output is the row-major index of the same coordinates.
    assert_almost_equal(coordinate_sample((13, 13), 10, range(100, 102),
        flat=True),
            [[  8,  24,  67, 103,  87,  79, 138,  94,  98,  53],
             [ 95,  11,  81,  70,  63,  87,  75, 137,  40, 2+10*13]])
    # Each seed yields the same row no matter which other seeds are given.
    assert_almost_equal(coordinate_sample((13, 13), 10, range(101, 103),
        flat=True),
            [[ 95,  11,  81,  70,  63,  87,  75, 137,  40, 132],
             [  0,  78, 114, 111,  66,  45,  72,  73,  79, 135]])
    assert_almost_equal(coordinate_sample((26, 26), 10, range(101, 102),
        flat=True),
            [[373,  22, 319, 297, 231, 356, 307, 535, 184, 5+20*26]])
    # Test FixedRandomSubsetSampler
    # NOTE(review): the expected orders below depend on the shuffle
    # produced by random.Random(1) on the running interpreter - confirm
    # they hold for the Python version in use.
    fss = FixedRandomSubsetSampler(range(10))
    assert len(fss) == 10
    assert_almost_equal(list(fss), [8, 0, 3, 4, 5, 2, 9, 6, 7, 1])
    fss = FixedRandomSubsetSampler(range(10), 3, 8)
    assert len(fss) == 5
    assert_almost_equal(list(fss), [4, 5, 2, 9, 6])
    # The constructor takes no class_filter argument; filtering by class
    # is done with class_subset() on a constructed sampler.
    fss = FixedRandomSubsetSampler(
        [(i, i % 3) for i in range(10)]).class_subset(1)
    assert len(fss) == 3
    assert_almost_equal(list(fss), [4, 7, 1])