# GIN graph classification on TUDataset benchmarks, evaluated with
# 10-fold cross-validation.
import os.path as osp
import sys

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import KFold
from torch.nn import Linear, ReLU, Sequential
from torch_geometric.data import DataLoader  # torch_geometric.loader in PyG >= 2.0
from torch_geometric.datasets import TUDataset
from torch_geometric.nn import GINConv, global_add_pool
from tqdm import tqdm


class Encoder(torch.nn.Module):
    """Stack of GIN layers; the graph embedding is the concatenation of the
    global-add-pooled node features from every layer."""

    def __init__(self, num_features, dim, num_gc_layers):
        super(Encoder, self).__init__()

        self.num_gc_layers = num_gc_layers
        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()

        for i in range(num_gc_layers):
            # The first layer maps raw node features to `dim`;
            # later layers map dim -> dim.
            if i:
                nn = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim))
            else:
                nn = Sequential(Linear(num_features, dim), ReLU(), Linear(dim, dim))
            self.convs.append(GINConv(nn))
            self.bns.append(torch.nn.BatchNorm1d(dim))

    def forward(self, x, edge_index, batch):
        if x is None:
            # Featureless graphs: fall back to a constant scalar feature per node.
            x = torch.ones((batch.shape[0], 1), device=batch.device)

        xs = []
        for i in range(self.num_gc_layers):
            x = F.relu(self.convs[i](x, edge_index))
            x = self.bns[i](x)
            xs.append(x)

        # One graph-level readout per layer, concatenated into the embedding.
        xpool = [global_add_pool(h, batch) for h in xs]
        x = torch.cat(xpool, 1)
        return x, torch.cat(xs, 1)

    def get_embeddings(self, loader):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        ret = []
        with torch.no_grad():
            for data in loader:
                data = data.to(device)
                x, edge_index, batch = data.x, data.edge_index, data.batch
                if x is None:
                    x = torch.ones((batch.shape[0], 1), device=device)
                x, _ = self.forward(x, edge_index, batch)
                ret.append(x.cpu().numpy())
        return np.concatenate(ret, 0)


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        # Featureless datasets report num_features == 0 but are fed a constant
        # one-dimensional node feature in forward(), so clamp to at least 1.
        num_features = max(dataset.num_features, 1)
        dim = 32
        num_gc_layers = 5  # fc1 below expects dim * num_gc_layers inputs

        self.encoder = Encoder(num_features, dim, num_gc_layers)
        self.fc1 = Linear(dim * num_gc_layers, dim)
        self.fc2 = Linear(dim, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        if x is None:
            x = torch.ones((batch.shape[0], 1), device=batch.device)

        x, _ = self.encoder(x, edge_index, batch)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=-1)


def train(epoch):
    model.train()

    # Halve the learning rate once, after epoch 50.
    if epoch == 51:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.5 * param_group['lr']

    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        # data.x: [num_nodes, num_features], data.edge_index: [2, num_edges],
        # data.batch: [num_nodes]
        output = model(data.x, data.edge_index, data.batch)
        loss = F.nll_loss(output, data.y)
        loss.backward()
        loss_all += loss.item() * data.num_graphs
        optimizer.step()
    return loss_all / len(train_dataset)


def test(loader):
    model.eval()

    correct = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.batch)
            pred = output.max(dim=1)[1]
            correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)
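
# --- Optional linear evaluation of frozen embeddings ------------------------
# The original imports included several sklearn estimators (SVC,
# LogisticRegression, cross_val_score) that were never used. The helper below
# is a minimal sketch of how they could plug in: scoring the output of
# Encoder.get_embeddings() with a 10-fold SVM. `evaluate_embeddings` and its
# C=10 setting are illustrative assumptions, not part of the original
# training pipeline.

def evaluate_embeddings(encoder, loader, labels):
    """10-fold SVM accuracy on frozen graph embeddings (a sketch)."""
    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC

    encoder.eval()  # freeze BatchNorm statistics during feature extraction
    embeddings = encoder.get_embeddings(loader)
    scores = cross_val_score(SVC(C=10), embeddings, labels, cv=10,
                             scoring='accuracy')
    return scores.mean(), scores.std()
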
if __name__ == '__main__':
    for percentage in [1.]:
        for DS in [sys.argv[1]]:
            # The REDDIT datasets are larger, so give them more epochs.
            if 'REDDIT' in DS:
                epochs = 200
            else:
                epochs = 100
            path = osp.join(osp.dirname(osp.realpath(__file__)), '..',
                            'data', DS)
            accuracies = [[] for i in range(epochs)]
            dataset = TUDataset(path, name=DS)
            num_graphs = len(dataset)
            print('Number of graphs', num_graphs)
            dataset = dataset[:int(num_graphs * percentage)]
            dataset = dataset.shuffle()

            kf = KFold(n_splits=10, shuffle=True, random_state=None)
            for train_index, test_index in kf.split(dataset):
                train_dataset = [dataset[int(i)] for i in train_index]
                test_dataset = [dataset[int(i)] for i in test_index]
                print('len(train_dataset)', len(train_dataset))
                print('len(test_dataset)', len(test_dataset))

                train_loader = DataLoader(train_dataset, batch_size=128)
                test_loader = DataLoader(test_dataset, batch_size=128)

                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                model = Net().to(device)
                optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

                for epoch in range(1, epochs + 1):
                    train_loss = train(epoch)
                    train_acc = test(train_loader)
                    test_acc = test(test_loader)
                    accuracies[epoch - 1].append(test_acc)
                    tqdm.write('Epoch: {:03d}, Train Loss: {:.7f}, '
                               'Train Acc: {:.7f}, Test Acc: {:.7f}'.format(
                                   epoch, train_loss, train_acc, test_acc))

            # Pick the epoch with the best mean test accuracy over the 10
            # folds and report its mean and standard deviation.
            tmp = np.mean(accuracies, axis=1)
            print(percentage, DS, np.argmax(tmp), np.max(tmp),
                  np.std(accuracies[np.argmax(tmp)]))
            input()  # pause before the next run
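
# Example use of the evaluation sketch above after a fold finishes training
# (hypothetical; the original loop never calls it):
#   labels = np.array([int(g.y) for g in dataset])
#   full_loader = DataLoader(dataset, batch_size=128)
#   acc_mean, acc_std = evaluate_embeddings(model.encoder, full_loader, labels)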