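"""Supervised graph classification on TUDataset benchmarks with a GIN encoder.

A GIN-based Encoder feeds a small MLP head; the model is trained with
10-fold cross-validation and the best mean test accuracy over epochs is
reported. The dataset name is read from the command line (sys.argv[1]).
"""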
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.svm import SVC
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.data import DataLoader
from torch_geometric.datasets import TUDataset
from torch_geometric.nn import GINConv, global_add_pool
from tqdm import tqdm
import numpy as np
import os.path as osp
import sys
import torch
import torch.nn.functional as F
class Encoder(torch.nn.Module):
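    """GIN encoder: `num_gc_layers` GINConv blocks, each followed by batch
    normalization. The graph-level readout concatenates the global_add_pool
    output of every layer, so graph embeddings have size dim * num_gc_layers.
    """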
def __init__(self, num_features, dim, num_gc_layers):
        super(Encoder, self).__init__()
        self.num_gc_layers = num_gc_layers
        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()
for i in range(num_gc_layers):
if i:
nn = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim))
else:
nn = Sequential(Linear(num_features, dim), ReLU(), Linear(dim, dim))
conv = GINConv(nn)
bn = torch.nn.BatchNorm1d(dim)
self.convs.append(conv)
self.bns.append(bn)
def forward(self, x, edge_index, batch):
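        """Return (graph_embedding, node_embedding); per-layer node features
        are sum-pooled per graph and concatenated across layers."""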
        if x is None:
            # Featureless graphs: fall back to a constant 1-feature per node.
            x = torch.ones((batch.shape[0], 1), device=batch.device)
xs = []
for i in range(self.num_gc_layers):
x = F.relu(self.convs[i](x, edge_index))
x = self.bns[i](x)
            xs.append(x)
xpool = [global_add_pool(x, batch) for x in xs]
x = torch.cat(xpool, 1)
return x, torch.cat(xs, 1)
def get_embeddings(self, loader):
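        """Embed every graph in `loader` and return the embeddings as one
        NumPy array, with gradients disabled."""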
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        ret = []
with torch.no_grad():
for data in loader:
data.to(device)
x, edge_index, batch = data.x, data.edge_index, data.batch
if x is None:
x = torch.ones((batch.shape[0],1)).to(device)
x, _ = self.forward(x, edge_index, batch)
ret.append(x.cpu().numpy())
        ret = np.concatenate(ret, 0)
        return ret
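# Sketch only: the sklearn imports above suggest that embeddings from
# Encoder.get_embeddings can also be scored with an SVM. This helper is an
# illustrative assumption (including the choice C=10) and is not called by
# the training loop below.
def svc_classify(x, y, n_splits=10):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    fold_accuracies = []
    for train_index, test_index in kf.split(x, y):
        clf = SVC(C=10)
        clf.fit(x[train_index], y[train_index])
        fold_accuracies.append(accuracy_score(y[test_index], clf.predict(x[test_index])))
    return np.mean(fold_accuracies)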
class Net(torch.nn.Module):
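    """GIN classifier: the Encoder followed by a two-layer MLP head with
    dropout, trained with NLL loss on log-softmax outputs."""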
def __init__(self):
super(Net, self).__init__()
        try:
            num_features = dataset.num_features
        except (AttributeError, NameError):
            num_features = 1
        dim = 32
        num_gc_layers = 5  # fc1 below assumes 5 pooled layers (dim * 5)
        self.encoder = Encoder(num_features, dim, num_gc_layers)
        self.fc1 = Linear(dim * num_gc_layers, dim)
        self.fc2 = Linear(dim, dataset.num_classes)
def forward(self, x, edge_index, batch):
        if x is None:
            # Match Encoder's fallback: 2-D constant features, one per node.
            x = torch.ones((batch.shape[0], 1), device=batch.device)
x, _ = self.encoder(x, edge_index, batch)
x = F.relu(self.fc1(x))
x = F.dropout(x, p=0.5, training=self.training)
x = self.fc2(x)
return F.log_softmax(x, dim=-1)
def train(epoch):
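    """Run one training epoch and return the average per-graph NLL loss."""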
model.train()
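    # Halve the learning rate once, after 50 epochs.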
if epoch == 51:
for param_group in optimizer.param_groups:
param_group['lr'] = 0.5 * param_group['lr']
loss_all = 0
for data in train_loader:
data = data.to(device)
optimizer.zero_grad()
        # data.x:          [num_nodes, num_node_features]
        # data.edge_index: [2, num_edges]
        # data.batch:      [num_nodes]
output = model(data.x, data.edge_index, data.batch)
loss = F.nll_loss(output, data.y)
loss.backward()
loss_all += loss.item() * data.num_graphs
optimizer.step()
return loss_all / len(train_dataset)
def test(loader):
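    """Return the classification accuracy of `model` on `loader`."""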
model.eval()
correct = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.batch)
            pred = output.max(dim=1)[1]
            correct += pred.eq(data.y).sum().item()
return correct / len(loader.dataset)
if __name__ == '__main__':
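    # Assumed invocation: python <this script> DATASET_NAME, where
    # DATASET_NAME is a TUDataset name such as MUTAG or PROTEINS.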
    # `percentage` controls the fraction of the dataset used (here: all of it).
    for percentage in [1.0]:
for DS in [sys.argv[1]]:
if 'REDDIT' in DS:
epochs = 200
else:
epochs = 100
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', DS)
accuracies = [[] for i in range(epochs)]
#kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
dataset = TUDataset(path, name=DS) #.shuffle()
num_graphs = len(dataset)
print('Number of graphs', len(dataset))
dataset = dataset[:int(num_graphs * percentage)]
dataset = dataset.shuffle()
kf = KFold(n_splits=10, shuffle=True, random_state=None)
            for train_index, test_index in kf.split(dataset):
                train_dataset = [dataset[int(i)] for i in train_index]
                test_dataset = [dataset[int(i)] for i in test_index]
print('len(train_dataset)', len(train_dataset))
print('len(test_dataset)', len(test_dataset))
train_loader = DataLoader(train_dataset, batch_size=128)
test_loader = DataLoader(test_dataset, batch_size=128)
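                # PyG's DataLoader merges the graphs in each batch into one
                # disconnected graph; `data.batch` maps nodes back to graphs.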
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, epochs+1):
train_loss = train(epoch)
train_acc = test(train_loader)
test_acc = test(test_loader)
accuracies[epoch-1].append(test_acc)
tqdm.write('Epoch: {:03d}, Train Loss: {:.7f}, '
'Train Acc: {:.7f}, Test Acc: {:.7f}'.format(epoch, train_loss,
train_acc, test_acc))
            tmp = np.mean(accuracies, axis=1)  # mean test accuracy per epoch across folds
            best_epoch = int(np.argmax(tmp))
            print(percentage, DS, best_epoch, tmp[best_epoch], np.std(accuracies[best_epoch]))
            input()  # pause so the results stay visible before the next run