| | import torch |
| | import numpy as np |
| | import json |
| |
|
| |
|
class Base:
    """Base class for coreset-selection methods on graph data.

    Derives the reduced-set size from ``args.reduction_rate`` and builds
    per-class selection quotas so the class distribution of the reduced
    set mirrors the training distribution.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """
        Args:
            data: dataset object exposing ``feat_train`` (2-D feature
                matrix) and ``labels_train`` (1-D array of labels).
            args: namespace with at least ``reduction_rate`` in (0, 1].
            device: torch device for the synthetic label tensor.
        """
        self.data = data
        self.args = args
        self.device = device
        # Total budget of nodes to keep after reduction.
        n = int(data.feat_train.shape[0] * args.reduction_rate)
        self.nnodes_syn = n
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Build the label list for the reduced set.

        Every class gets ``max(int(count * reduction_rate), 1)`` slots,
        except the most frequent class, which absorbs the rounding
        remainder so the total exactly equals the overall budget.

        Side effects: populates ``self.num_class_dict`` (class -> quota)
        and ``self.syn_class_indices`` (class -> [start, end) range in
        the returned list).

        Returns:
            list of class labels, grouped by class.
        """
        from collections import Counter
        counter = Counter(data.labels_train)
        num_class_dict = {}
        n = len(data.labels_train)

        # Process classes from least to most frequent; the last (most
        # frequent) class takes whatever budget remains after rounding.
        sorted_counter = sorted(counter.items(), key=lambda x: x[1])
        sum_ = 0
        labels_syn = []
        self.syn_class_indices = {}
        for ix, (c, num) in enumerate(sorted_counter):
            if ix == len(sorted_counter) - 1:
                num_class_dict[c] = int(n * self.args.reduction_rate) - sum_
            else:
                num_class_dict[c] = max(int(num * self.args.reduction_rate), 1)
                sum_ += num_class_dict[c]
            self.syn_class_indices[c] = [len(labels_syn), len(labels_syn) + num_class_dict[c]]
            labels_syn += [c] * num_class_dict[c]

        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self):
        """Subclasses override this to return selected node indices."""
        return
| |
|
| | class KCenter(Base): |
| |
|
| | def __init__(self, data, args, device='cuda', **kwargs): |
| | super(KCenter, self).__init__(data, args, device='cuda', **kwargs) |
| |
|
| | def select(self, embeds, inductive=False): |
| | |
| | |
| | num_class_dict = self.num_class_dict |
| | if inductive: |
| | idx_train = np.arange(len(self.data.idx_train)) |
| | else: |
| | idx_train = self.data.idx_train |
| | labels_train = self.data.labels_train |
| | idx_selected = [] |
| |
|
| | for class_id, cnt in num_class_dict.items(): |
| | idx = idx_train[labels_train==class_id] |
| | feature = embeds[idx] |
| | mean = torch.mean(feature, dim=0, keepdim=True) |
| | |
| | dis = torch.cdist(feature, mean)[:,0] |
| | rank = torch.argsort(dis) |
| | idx_centers = rank[:1].tolist() |
| | for i in range(cnt-1): |
| | feature_centers = feature[idx_centers] |
| | dis_center = torch.cdist(feature, feature_centers) |
| | dis_min, _ = torch.min(dis_center, dim=-1) |
| | id_max = torch.argmax(dis_min).item() |
| | idx_centers.append(id_max) |
| |
|
| | idx_selected.append(idx[idx_centers]) |
| | |
| | return np.hstack(idx_selected) |
| |
|
| |
|
| | class Herding(Base): |
| |
|
| | def __init__(self, data, args, device='cuda', **kwargs): |
| | super(Herding, self).__init__(data, args, device='cuda', **kwargs) |
| |
|
| | def select(self, embeds, inductive=False): |
| | num_class_dict = self.num_class_dict |
| | if inductive: |
| | idx_train = np.arange(len(self.data.idx_train)) |
| | else: |
| | idx_train = self.data.idx_train |
| | labels_train = self.data.labels_train |
| | idx_selected = [] |
| |
|
| | |
| | for class_id, cnt in num_class_dict.items(): |
| | idx = idx_train[labels_train==class_id] |
| | features = embeds[idx] |
| | mean = torch.mean(features, dim=0, keepdim=True) |
| | selected = [] |
| | idx_left = np.arange(features.shape[0]).tolist() |
| |
|
| | for i in range(cnt): |
| | det = mean*(i+1) - torch.sum(features[selected], dim=0) |
| | dis = torch.cdist(det, features[idx_left]) |
| | id_min = torch.argmin(dis) |
| | selected.append(idx_left[id_min]) |
| | del idx_left[id_min] |
| | idx_selected.append(idx[selected]) |
| | |
| | return np.hstack(idx_selected) |
| |
|
| |
|
| | class Random(Base): |
| |
|
| | def __init__(self, data, args, device='cuda', **kwargs): |
| | super(Random, self).__init__(data, args, device='cuda', **kwargs) |
| |
|
| | def select(self, embeds, inductive=False): |
| | num_class_dict = self.num_class_dict |
| | if inductive: |
| | idx_train = np.arange(len(self.data.idx_train)) |
| | else: |
| | idx_train = self.data.idx_train |
| |
|
| | labels_train = self.data.labels_train |
| | idx_selected = [] |
| |
|
| | for class_id, cnt in num_class_dict.items(): |
| | idx = idx_train[labels_train==class_id] |
| | selected = np.random.permutation(idx) |
| | idx_selected.append(selected[:cnt]) |
| |
|
| | |
| | return np.hstack(idx_selected) |
| |
|
| |
|
| | class LRMC(Base): |
| | """ |
| | Coreset selection using precomputed seed nodes from the Laplacian‑Integrated |
| | Relaxed Maximal Clique (L‑RMC) algorithm. Seed nodes are read from a JSON |
| | file specified by ``args.lrmc_seeds_path`` and used to preferentially select |
| | training examples. Per‑class reduction counts are respected: if a class has |
| | fewer seeds than required, random training nodes from that class are added |
| | until the quota is met. |
| | """ |
| |
|
| | def __init__(self, data, args, device='cuda', **kwargs): |
| | super(LRMC, self).__init__(data, args, device=device, **kwargs) |
| | seeds_path = getattr(args, 'lrmc_seeds_path', None) |
| | if seeds_path is None: |
| | raise ValueError( |
| | "LRMC method selected but no path to seed file provided. " |
| | "Please specify --lrmc_seeds_path when running the training script." |
| | ) |
| | self.seed_nodes = self._load_seed_nodes(seeds_path) |
| |
|
| | def _load_seed_nodes(self, path: str): |
| | |
| | with open(path, 'r') as f: |
| | js = json.load(f) |
| | clusters = js.get('clusters', []) |
| | if not clusters: |
| | raise ValueError(f"No clusters found in L‑RMC seeds file {path}") |
| | def _cluster_length(c): |
| | nodes = c.get('seed_nodes') or c.get('members') or [] |
| | return len(nodes) |
| | best_cluster = max(clusters, key=_cluster_length) |
| | nodes = best_cluster.get('seed_nodes') or best_cluster.get('members') or [] |
| | seed_nodes = [] |
| | for u in nodes: |
| | try: |
| | uid = int(u) |
| | except Exception: |
| | continue |
| | zero_idx = uid - 1 |
| | if zero_idx >= 0: |
| | seed_nodes.append(zero_idx) |
| | else: |
| | if uid >= 0: |
| | seed_nodes.append(uid) |
| | seed_nodes = sorted(set(seed_nodes)) |
| | return seed_nodes |
| |
|
| | def select(self, embeds, inductive=False): |
| | |
| | if inductive: |
| | idx_train = np.arange(len(self.data.idx_train)) |
| | labels_train = self.data.labels_train |
| | else: |
| | idx_train = self.data.idx_train |
| | labels_train = self.data.labels_train |
| | num_class_dict = self.num_class_dict |
| | idx_selected = [] |
| | seed_set = set(self.seed_nodes) |
| | |
| | for class_id, cnt in num_class_dict.items(): |
| | class_mask = (labels_train == class_id) |
| | class_indices = idx_train[class_mask] |
| | seed_in_class = [u for u in class_indices if u in seed_set] |
| | selected = seed_in_class[:min(len(seed_in_class), cnt)] |
| | remaining_required = cnt - len(selected) |
| | if remaining_required > 0: |
| | remaining_candidates = [u for u in class_indices if u not in selected] |
| | if len(remaining_candidates) <= remaining_required: |
| | additional = remaining_candidates |
| | else: |
| | additional = np.random.choice(remaining_candidates, remaining_required, replace=False).tolist() |
| | selected += additional |
| | idx_selected.append(np.array(selected)) |
| | return np.hstack(idx_selected) |
| |
|
| |
|
| |
|