'''
A sampler is just a list of integers listing the indexes of the inputs
in a data set to sample.  For reproducibility, the
FixedRandomSubsetSampler uses a seeded prng to always produce the same
sequence.  FixedSubsetSampler is just a wrapper for an explicit list of
integers.

coordinate_sample solves another sampling problem: when testing
convolutional outputs, we can reduce data explosion by sampling random
points of the feature map rather than the entire feature map.
coordinate_sample does this in a deterministic way that is also
resolution-independent.
'''

import numpy
import random
from torch.utils.data.sampler import Sampler


class FixedSubsetSampler(Sampler):
    """Represents a fixed sequence of data set indices.
    Subsets can be created by specifying a subset of output indexes.
    """
    def __init__(self, samples):
        self.samples = samples

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, key):
        return self.samples[key]

    def subset(self, new_subset):
        return FixedSubsetSampler(self.dereference(new_subset))

    def dereference(self, indices):
        '''
        Translate output sample indices (small numbers indexing the sample)
        to input sample indices (larger numbers indexing the original
        full set).
        '''
        return [self.samples[i] for i in indices]


class FixedRandomSubsetSampler(FixedSubsetSampler):
    """Samples a fixed number of samples from the dataset, deterministically.
    Arguments:
        data_source,
        start (optional),
        end (optional),
        seed (optional)
    """
    def __init__(self, data_source, start=None, end=None, seed=1):
        rng = random.Random(seed)
        shuffled = list(range(len(data_source)))
        rng.shuffle(shuffled)
        self.data_source = data_source
        super(FixedRandomSubsetSampler, self).__init__(shuffled[start:end])

    def class_subset(self, class_filter):
        '''
        Returns only the subset matching the given rule.
        '''
        if isinstance(class_filter, int):
            rule = lambda d: d[1] == class_filter
        else:
            rule = class_filter
        return self.subset([i for i, j in enumerate(self.samples)
                            if rule(self.data_source[j])])


def coordinate_sample(shape, sample_size, seeds, grid=13, seed=1, flat=False):
    '''
    Returns len(seeds) sets of sample_size grid points within the shape
    given.  If the shape dimensions are a multiple of 'grid', then sampled
    points within the same row will never be duplicated.
    '''
    if flat:
        sampind = numpy.zeros((len(seeds), sample_size), dtype=int)
    else:
        sampind = numpy.zeros((len(seeds), 2, sample_size), dtype=int)
    assert sample_size <= grid
    for j, seed in enumerate(seeds):
        rng = numpy.random.RandomState(seed)
        # Shuffle the grid ** len(shape) (by default 169) grid squares,
        # and pick the first sample_size of them.
        square_count = grid ** len(shape)
        square = numpy.stack(numpy.unravel_index(
            rng.choice(square_count, square_count)[:sample_size],
            (grid,) * len(shape)))
        # Then add a random offset to each x, y and put in the range [0...1)
        # Notice this selects the same locations regardless of resolution.
        uniform = (square + rng.uniform(size=square.shape)) / grid
        # TODO: support affine scaling so that we can align receptive field
        # centers exactly when sampling neurons in different layers.
        coords = (uniform * numpy.array(shape)[:, None]).astype(int)
        # Now take sample_size without replacement.  We do this in a way
        # such that if sample_size is decreased or increased up to 'grid',
        # the selected points become a subset, not totally different points.
        if flat:
            sampind[j] = numpy.ravel_multi_index(coords, dims=shape)
        else:
            sampind[j] = coords
    return sampind
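
# A minimal usage sketch (illustration only, not part of the module's API):
# it shows how FixedRandomSubsetSampler plugs into a torch DataLoader, and
# how flat coordinate_sample indices can pick points out of a flattened
# feature map.  The toy dataset, the tensor shapes, and the helper name
# _example_usage are all hypothetical.
def _example_usage():
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    # Hypothetical toy dataset of 100 scalar inputs.
    dataset = TensorDataset(torch.arange(100, dtype=torch.float32))
    # Deterministically select 10 of the 100 inputs; the same seed always
    # yields the same subset, so runs are reproducible.
    sampler = FixedRandomSubsetSampler(dataset, end=10, seed=1)
    loader = DataLoader(dataset, batch_size=5, sampler=sampler)
    for (batch,) in loader:
        print(batch.tolist())
    # Sample 4 deterministic locations from a 13x13 feature map, as flat
    # indices into the last axis of a (batch, channels, h * w) array.
    fmap = numpy.random.randn(2, 8, 13 * 13)
    flat = coordinate_sample((13, 13), 4, seeds=[1], flat=True)[0]
    print(fmap[:, :, flat].shape)  # -> (2, 8, 4)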

def main():
    from . import parallelfolder
    import argparse, os, shutil, sys
    parser = argparse.ArgumentParser(description='Net dissect utility',
            prog='python -m %s.sampler' % __package__)
    parser.add_argument('indir')
    parser.add_argument('outdir')
    parser.add_argument('--size', type=int, default=100)
    parser.add_argument('--test', action='store_true', default=False)
    args = parser.parse_args()
    if os.path.exists(args.outdir):
        print('%s already exists' % args.outdir)
        sys.exit(1)
    os.makedirs(args.outdir)
    dataset = parallelfolder.ParallelImageFolders([args.indir])
    sampler = FixedRandomSubsetSampler(dataset, end=args.size)
    seen_filenames = set()

    def number_filename(filename, number):
        if '.' in filename:
            a, b = filename.rsplit('.', 1)
            return a + '_%d.' % number + b
        return filename + '_%d' % number

    for i in sampler.dereference(range(args.size)):
        sourcefile = dataset.images[i][0]
        filename = os.path.basename(sourcefile)
        template = filename
        num = 0
        while filename in seen_filenames:
            num += 1
            filename = number_filename(template, num)
        seen_filenames.add(filename)
        shutil.copy(os.path.join(args.indir, sourcefile),
                    os.path.join(args.outdir, filename))


def test():
    from numpy.testing import assert_almost_equal
    # Test that coordinate_sample is deterministic, in-range, and scalable.
    assert_almost_equal(coordinate_sample((26, 26), 10, range(101, 102)),
            [[[14, 0, 12, 11, 8, 13, 11, 20, 7, 20],
              [9, 22, 7, 11, 23, 18, 21, 15, 2, 5]]])
    assert_almost_equal(coordinate_sample((13, 13), 10, range(101, 102)),
            [[[7, 0, 6, 5, 4, 6, 5, 10, 3, 20 // 2],
              [4, 11, 3, 5, 11, 9, 10, 7, 1, 5 // 2]]])
    assert_almost_equal(coordinate_sample((13, 13), 10, range(100, 102),
            flat=True),
            [[8, 24, 67, 103, 87, 79, 138, 94, 98, 53],
             [95, 11, 81, 70, 63, 87, 75, 137, 40, 2 + 10 * 13]])
    assert_almost_equal(coordinate_sample((13, 13), 10, range(101, 103),
            flat=True),
            [[95, 11, 81, 70, 63, 87, 75, 137, 40, 132],
             [0, 78, 114, 111, 66, 45, 72, 73, 79, 135]])
    assert_almost_equal(coordinate_sample((26, 26), 10, range(101, 102),
            flat=True),
            [[373, 22, 319, 297, 231, 356, 307, 535, 184, 5 + 20 * 26]])
    # Test FixedRandomSubsetSampler
    fss = FixedRandomSubsetSampler(range(10))
    assert len(fss) == 10
    assert_almost_equal(list(fss), [6, 8, 9, 7, 5, 3, 0, 4, 1, 2])
    fss = FixedRandomSubsetSampler(range(10), 3, 8)
    assert len(fss) == 5
    assert_almost_equal(list(fss), [7, 5, 3, 0, 4])
    fss = FixedRandomSubsetSampler([(i, i % 3) for i in range(10)]
            ).class_subset(class_filter=1)
    assert len(fss) == 3
    assert_almost_equal(list(fss), [7, 4, 1])


if __name__ == '__main__':
    import sys
    if '--test' in sys.argv[1:]:
        test()
    else:
        main()
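
# Example invocations (assuming this module lives in a package, e.g. one
# named netdissect, so that the relative import of parallelfolder resolves):
#   python -m netdissect.sampler /path/to/images /path/to/samples --size 100
#   python -m netdissect.sampler --test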