# File size: 3,890 Bytes (revision a00ee36)
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Store dataset indexes of datapoints selected by k-means algorithm."""
from argparse import ArgumentParser
import numpy as np
import os
import h5py as h5
import faiss
def _dataset_prefixes(which_dataset):
    """Return ``(feature_file_prefix, output_file_prefix)`` for a dataset name."""
    known = {"imagenet": ("ILSVRC", "IN"), "coco": ("COCO", "COCO")}
    # Unknown datasets use their own name for both prefixes.
    return known.get(which_dataset, (which_dataset, which_dataset))


def _backbone_tag(backbone):
    """Return the short backbone tag used in the output filename."""
    return "rn50" if backbone == "resnet50" else backbone


def main(args):
    """Run k-means on stored features and save the samples closest to each center.

    The features are read from the ``"feats"`` dataset of an HDF5 file located
    in ``args["data_root"]``, L2-normalized, and clustered with faiss. For each
    centroid, the index of the nearest feature vector is looked up and the
    resulting index array is saved with ``np.save`` as a dictionary under the
    key ``"center_examples"``.

    Args:
        args: Dictionary with the keys
            ``which_dataset`` (str): dataset name; ``"imagenet"`` and
                ``"coco"`` map to dedicated filename prefixes, any other
                value is used verbatim.
            ``data_root`` (str): directory holding the feature file and
                receiving the output file.
            ``resolution`` (int): data resolution, part of both filenames.
            ``feature_extractor`` (str): feature extractor name, part of
                both filenames.
            ``backbone_feature_extractor`` (str): backbone name, part of
                both filenames (``"resnet50"`` is shortened to ``"rn50"`` in
                the output filename).
            ``kmeans_subsampled`` (int): number of k-means centers.
            ``gpu`` (bool): whether faiss should train k-means on GPU.

    Raises:
        ValueError: If ``kmeans_subsampled`` is not a positive integer, or if
            the loaded features are not a 2-D array.
    """
    num_centers = args["kmeans_subsampled"]
    # Fail early with a clear message: the CLI default is -1, which is not a
    # usable number of clusters.
    if num_centers <= 0:
        raise ValueError(
            "kmeans_subsampled must be a positive number of centers, got %r"
            % (num_centers,)
        )

    dataset_name_prefix, im_prefix = _dataset_prefixes(args["which_dataset"])

    # HDF5 filename
    filename = os.path.join(
        args["data_root"],
        "%s%s_feats_%s_%s.hdf5"
        % (
            dataset_name_prefix,
            args["resolution"],
            args["feature_extractor"],
            args["backbone_feature_extractor"],
        ),
    )

    # Load features
    print("Loading features %s..." % (filename))
    with h5.File(filename, "r") as f:
        features = f["feats"][:]
    # faiss consumes float32; converting once here avoids two later copies and
    # makes the in-place normalization valid for non-float stored dtypes.
    features = np.array(features, dtype=np.float32)
    if features.ndim != 2:
        raise ValueError(
            "Expected a 2-D feature array, got shape %s" % (features.shape,)
        )

    # Normalize features; all-zero rows are left untouched instead of
    # turning into NaNs that would corrupt the clustering.
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    features /= norms

    # Take the dimensionality from the data rather than assuming 2048.
    feat_dim = features.shape[1]

    # k-means
    print("Training k-means with %i centers..." % (num_centers))
    kmeans = faiss.Kmeans(
        feat_dim,
        num_centers,
        niter=100,
        verbose=True,
        gpu=args["gpu"],
        min_points_per_centroid=200,
        spherical=False,
    )
    kmeans.train(features)

    # Find closest instances to each k-means cluster
    print("Finding closest instances to centers...")
    index = faiss.IndexFlatL2(feat_dim)
    index.add(features)
    _, closest_sample = index.search(kmeans.centroids, 1)

    net_str = _backbone_tag(args["backbone_feature_extractor"])
    stored_filename = "%s_res%i_%s_%s_kmeans_k%i" % (
        im_prefix,
        args["resolution"],
        net_str,
        args["feature_extractor"],
        num_centers,
    )
    # np.save appends the ".npy" extension to the given path.
    np.save(
        os.path.join(args["data_root"], stored_filename),
        {"center_examples": closest_sample},
    )
    print(
        "Instance indexes resulting from a subsampling based on k-means have been saved in file %s!"
        % (stored_filename)
    )
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs,
    # listed in the order they are registered with the parser.
    cli_options = [
        (
            "--resolution",
            dict(
                type=int,
                default=64,
                help="Data resolution (default: %(default)s)",
            ),
        ),
        (
            "--which_dataset",
            dict(type=str, default="imagenet", help="Dataset choice."),
        ),
        (
            "--data_root",
            dict(
                type=str,
                default="data",
                help="Default location where data is stored (default: %(default)s)",
            ),
        ),
        (
            "--feature_extractor",
            dict(
                type=str,
                default="classification",
                choices=["classification", "selfsupervised"],
                help="Choice of feature extractor",
            ),
        ),
        (
            "--backbone_feature_extractor",
            dict(
                type=str,
                default="resnet50",
                choices=["resnet50"],
                help="Choice of feature extractor backbone",
            ),
        ),
        (
            "--kmeans_subsampled",
            dict(
                type=int,
                default=-1,
                help="Number of k-means centers if using subsampled training instances (default: %(default)s)",
            ),
        ),
        (
            "--gpu",
            dict(
                action="store_true",
                default=False,
                help="Use faiss with GPUs (default: %(default)s)",
            ),
        ),
    ]

    cli_parser = ArgumentParser(
        description="Storing cluster indexes for k-means-based data subsampling"
    )
    for flag, options in cli_options:
        cli_parser.add_argument(flag, **options)

    # main() expects a plain dictionary keyed by option name.
    main(vars(cli_parser.parse_args()))
|