Diffstat (limited to 'main.py')
-rw-r--r--  main.py  344
1 file changed, 344 insertions(+), 0 deletions(-)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..ed1adb1
--- /dev/null
+++ b/main.py
@@ -0,0 +1,344 @@
+#coding:utf-8
+
+import argparse
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import numpy as np
+import pandas as pd
+from sklearn import metrics
+from sklearn.metrics import roc_curve, auc, confusion_matrix
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+#import seaborn as sns
+#import nni
+
+#from process_dataset import load_data
+from DDoS2019 import load_data,separate_data,load_data2
+from models.graphcnn import GraphCNN
+
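+# CrossEntropyLoss combines log-softmax and negative log-likelihood internally,
+# so the model is expected to output raw (unnormalized) logits.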
+criterion = nn.CrossEntropyLoss()
+pd.set_option('display.max_columns', None)  # show all columns
+pd.set_option('display.max_rows', None)     # show all rows
+
+def train(args, model, device, train_graphs, optimizer, epoch):
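+    # Note: an "epoch" here is args.iters_per_epoch randomly drawn minibatches
+    # (graphs may repeat across iterations), not one full pass over the data.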
+ model.train()
+
+ total_iters = args.iters_per_epoch
+    pbar = tqdm(range(total_iters), unit='batch')  # progress bar over training iterations
+
+ loss_accum = 0
+ for pos in pbar:
+ #print("length of train graph of train: %s" %len(train_graphs))
+ selected_idx = np.random.permutation(len(train_graphs))[:args.batch_size]#洗牌
+
+ batch_graph = [train_graphs[idx] for idx in selected_idx]
+ output = model(batch_graph)
+
+ labels = torch.LongTensor([graph.label for graph in batch_graph]).to(device)
+
+ #compute loss
+ loss = criterion(output, labels)
+
+ #backprop
+ if optimizer is not None:
+            optimizer.zero_grad()   # reset accumulated gradients
+            loss.backward()         # backpropagate to compute current gradients
+            optimizer.step()        # update parameters using the current gradients
+
+
+ loss = loss.detach().cpu().numpy()
+ loss_accum += loss
+
+ #report
+ pbar.set_description('epoch: %d' % (epoch))
+
+ average_loss = loss_accum/total_iters
+ print("loss training: %f" % (average_loss))
+
+ return average_loss
+
+### Pass data to the model in minibatches during evaluation to avoid memory overflow (no backpropagation).
+def pass_data_iteratively(model, graphs, minibatch_size=64):
+ model.eval()
+ output = []
+    idx = np.arange(len(graphs))  # index sequence 0 .. len(graphs)-1
+ for i in range(0, len(graphs), minibatch_size):
+ sampled_idx = idx[i:i+minibatch_size]
+ if len(sampled_idx) == 0:
+ continue
+ output.append(model([graphs[j] for j in sampled_idx]).detach())
+ return torch.cat(output, 0)
+
+def test(args, model, device, train_graphs, test_graphs, epoch):
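+    # Computes accuracy on both the training graphs and the held-out graphs, and
+    # prints a per-class report and confusion matrix for the held-out set.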
+ model.eval()
+
+    output = pass_data_iteratively(model, train_graphs)
+    pred = output.max(1, keepdim=True)[1]
+
+ labels = torch.LongTensor([graph.label for graph in train_graphs]).to(device)
+ correct = pred.eq(labels.view_as(pred)).sum().cpu().item()
+ acc_train = correct / float(len(train_graphs))
+ output = pass_data_iteratively(model, test_graphs)
+ pred = output.max(1, keepdim=True)[1]
+ labels = torch.LongTensor([graph.label for graph in test_graphs]).to(device)
+ correct = pred.eq(labels.view_as(pred)).sum().cpu().item()
+ acc_test = correct / float(len(test_graphs))
+
+    scores = output.max(1, keepdim=True)[0]
+    labels1 = labels.view_as(pred)
+    labels2 = labels1.view(-1).tolist()
+    pred1 = pred.view(-1).tolist()
+    scores2 = scores.view(-1).tolist()
+
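+    # softmax converts the logits into per-class probabilities (consumed by ROC() when enabled)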
+ output_1 = F.softmax(output, dim=1)
+ #ROC(labels2, output_1)
+
+ print(metrics.classification_report(labels2, pred1))
+    c2 = confusion_matrix(labels2, pred1, labels=range(13))  # 13 = hard-coded class count
+ print(c2)
+
+
+ print("accuracy train: %f test: %f" % (acc_train, acc_test))
+
+ return acc_train, acc_test
+
+
+def ROC(y, y_score):
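+    # Computes per-class ROC curves, then plots the micro-average (all decisions
+    # pooled) and macro-average (unweighted mean over classes) curves with AUCs.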
+    y_onehot = np.zeros((len(y), 13))  # 13 = hard-coded class count
+    y_onehot[np.arange(len(y)), y] = 1
+ y_score = y_score.cpu().numpy()
+ fpr = dict()
+ tpr = dict()
+ roc_auc = dict()
+    # compute the ROC curve and AUC for each class
+ for i in range(13):
+ fpr[i], tpr[i], _ = roc_curve(y_onehot[:, i], y_score[:, i])
+ roc_auc[i] = auc(fpr[i], tpr[i])
+
+    # compute the micro-average ROC curve and ROC area
+ fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot.ravel(), y_score.ravel())
+ roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
+
+    # compute the macro-average ROC curve
+
+ all_fpr = np.unique(np.concatenate([fpr[i] for i in range(13)]))
+ mean_tpr = np.zeros_like(all_fpr)
+ for i in range(13):
+        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])  # np.interp replaces the removed scipy.interp
+ mean_tpr /= 13
+ fpr["macro"] = all_fpr
+ tpr["macro"] = mean_tpr
+ roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
+
+ # plot ROC curve
+ plt.figure()
+ plt.plot(fpr["micro"], tpr["micro"],
+ label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
+ color = 'deeppink', linestyle=':', linewidth=4)
+
+ plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.2f})'
+ ''.format(roc_auc["macro"]),
+ color='navy', linestyle=':', linewidth=4)
+
+ plt.plot([0, 1], [0, 1], 'k--', lw=2)
+ plt.xlim([0.0, 1.0])
+ plt.ylim([0.0, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.legend(loc='lower right')  # without legend(), the labelled curves are never identified
+    plt.show()
+
+
+def tra(para, model, device, train_graphs, test_graphs, feature):
+    # test_graphs is the held-out fold used here for model selection (checkpointing)
+    print(f"train_num: {len(train_graphs)}\ntest_num: {len(test_graphs)}")
+ optimizer = optim.Adam(model.parameters(), lr=para.lr)
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
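+    # StepLR halves the learning rate every 50 epochs (gamma=0.5); step() runs once per epoch below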
+
+ best_accuracy=0
+ for epoch in range(1, para.epochs + 1):
+ avg_loss = train(para, model, device, train_graphs, optimizer, epoch)
+ scheduler.step()
+ acc_train, acc_test = test(para, model, device, train_graphs, test_graphs, epoch)
+ #nni.report_intermediate_result(acc_test)
+ if acc_test > best_accuracy:
+ path = "./529"+"".join(str(feature))+".pth"
+ torch.save(model.state_dict(), path)
+ best_accuracy = acc_test
+
+ '''if not args.filename == "":
+ with open(args.filename, 'w') as f:
+ f.write("%f %f %f" % (avg_loss, acc_train, acc_test))
+ f.write("\n")
+ print("")'''
+
+    print(model.eps)  # learned epsilon values of the GIN-style aggregation
+ #nni.report_final_result(best_accuracy)
+
+def te(model, graphs, device, num_classes, feature):
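+    # Reloads the best checkpoint saved by tra() and evaluates it on the given graphs.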
+ #path = "529"+''.join(feature)+".pth"
+ path = "./529"+"".join(str(feature))+".pth"
+ model.load_state_dict(torch.load(path))
+
+    output = pass_data_iteratively(model, graphs)
+    pred = output.max(1, keepdim=True)[1]
+    scores = output.max(1, keepdim=True)[0]
+    labels = torch.LongTensor([graph.label for graph in graphs]).to(device)
+    correct = pred.eq(labels.view_as(pred)).sum().cpu().item()
+
+    labels1 = labels.view_as(pred)
+    labels2 = labels1.view(-1).tolist()
+    pred1 = pred.view(-1).tolist()
+    scores2 = scores.view(-1).tolist()
+    output_1 = F.softmax(output, dim=1)
+    r = pd.DataFrame(metrics.classification_report(labels2, pred1, output_dict=True)).transpose()
+    print(r)
+    c2 = confusion_matrix(labels2, pred1, labels=range(num_classes))
+    print(c2)
+    acc = correct / float(len(graphs))
+    print("accuracy: %s" % acc)
+
+ return r, c2, acc
+
+def train_test(args, device):
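+    # Full pipeline: for each seed, load a split, train with best-model
+    # checkpointing on a validation fold, evaluate on the test set, and
+    # average the metrics over all seeds.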
+ report = None
+ confuse = None
+ acc = None
+ rseeds = [1, 2, 4, 7, 9]
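+    # each seed yields an independent train/test split; metrics are averaged over all of them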
+ flags = False
+ for rseed in rseeds:
+        num_classes, train_data, test_data = load_data2(args.degree_as_tag, args.feature, rseed)
+
+        # count the number of training samples in each class
+        train_num = {}
+        for i in range(num_classes):
+            train_num[i] = 0
+        for g in train_data:
+            train_num[g.label] += 1
+        for i in range(num_classes):
+            print("training class %d count: %s" % (i, train_num[i]))
+ print("训练所有样本数量:%s" % len(train))
+ print("测试集数量:%s" % len(test))
+ '''if(not os.path.exists("train.npy")):
+ np.save(file="train.npy")
+ if(not os.path.exists("test.npy")):
+ np.save(file="test.npy")'''
+        # 10-fold cross validation: run the experiment on the fold specified by args.fold_idx
+        train_graphs, var_graphs = separate_data(train_data, args.seed, args.fold_idx)
+
+        model = GraphCNN(args.num_layers, args.num_mlp_layers, train_graphs[0].node_features.shape[1],
+                         args.hidden_dim, num_classes, args.final_dropout, args.learn_eps,
+                         args.graph_pooling_type, args.neighbor_pooling_type, device).to(device)
+        tra(args, model, device, train_graphs, var_graphs, args.feature)
+        report_, confuse_, acc_ = te(model, test_data, device, num_classes, args.feature)
+        if not flags:
+            report = report_
+            confuse = confuse_
+            acc = acc_
+            flags = True
+        else:
+            # accumulate sums across seeds; a running pairwise average would weight later seeds more heavily
+            report = report.combine(report_, lambda s1, s2: s1 + s2)
+            confuse = confuse + confuse_
+            acc = acc + acc_
+
+    # divide the accumulated sums by the number of runs to get the true means
+    report = report / len(rseeds)
+    confuse = confuse / len(rseeds)
+    acc = acc / len(rseeds)
+
+    np.set_printoptions(suppress=True)
+    print(f"_____average report_____\n{report}")
+    print(f"_____average confuse_____\n{confuse}")
+    print(f"_____average accuracy_____\n{acc}")
+
+def main():
+ # Training settings
+ # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
+ parser = argparse.ArgumentParser(description='PyTorch graph convolutional neural net for whole-graph classification')
+ parser.add_argument('--dataset', type=str, default="MUTAG",
+ help='name of dataset (default: MUTAG)')
+ parser.add_argument('--device', type=int, default=0,
+ help='which gpu to use if any (default: 0)')
+    parser.add_argument('--batch_size', type=int, default=300,
+                        help='input batch size for training (default: 300)')
+    parser.add_argument('--iters_per_epoch', type=int, default=200,
+                        help='number of iterations per epoch (default: 200)')
+    parser.add_argument('--epochs', type=int, default=2,
+                        help='number of epochs to train (default: 2)')
+    parser.add_argument('--lr', type=float, default=0.1,
+                        help='learning rate (default: 0.1)')
+    parser.add_argument('--seed', type=int, default=0,
+                        help='random seed for splitting the dataset into 10 folds (default: 0)')
+    parser.add_argument('--fold_idx', type=int, default=2,
+                        help='index of the fold in 10-fold validation; should be less than 10')
+ parser.add_argument('--num_layers', type=int, default=5,
+ help='number of layers INCLUDING the input one (default: 5)')
+ parser.add_argument('--num_mlp_layers', type=int, default=2,
+ help='number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.')
+    parser.add_argument('--hidden_dim', type=int, default=128,
+                        help='number of hidden units (default: 128)')
+    parser.add_argument('--final_dropout', type=float, default=0.3,
+                        help='final layer dropout (default: 0.3)')
+    parser.add_argument('--graph_pooling_type', type=str, default="sum", choices=["sum", "average"],
+                        help='Pooling over nodes in a graph: sum or average')
+    parser.add_argument('--neighbor_pooling_type', type=str, default="sum", choices=["sum", "average", "max"],
+                        help='Pooling over neighboring nodes: sum, average, or max')
+ parser.add_argument('--learn_eps', action="store_true",
+ help='Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.')
+ parser.add_argument('--degree_as_tag', action="store_true",
+ help='let the input node features be the degree of nodes (heuristics for unlabeled graph)')
+ parser.add_argument('--filename', type = str, default = "gnn.txt",
+ help='output file')
+    parser.add_argument('--feature', type=str, default='0,1,3,15,17,19',
+                        help='comma-separated indices of the features to use')
+ args = parser.parse_args()
+
+    # set up seeds and gpu device (global seeds are fixed at 0; args.seed only controls the fold split)
+ torch.manual_seed(0)
+ np.random.seed(0)
+ device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed_all(0)
+ '''para={
+ "num_layers: 5,
+ "num_mlp_layers": 2,
+ "hidden_dim":128,
+ "final_dropout": 0.3,
+ "graph_pooling_type": 'sum',
+ "neighbor_pooling_type":"sum",
+ "lr":0.0025,
+ "batch_size": 300,
+ "iters_per_epoch":200,
+ "epochs": 30,
+ "num_node": 30
+ }
+
+ receive=nni.get_next_parameter()
+ para.update(receive)'''
+
+    train_test(args, device)  # each run loads its own data via load_data2
+
+
+
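+# Hypothetical invocation (assumes the CICDDoS2019 graph files expected by
+# DDoS2019.load_data2 are already prepared; the flag values are illustrative):
+#     python main.py --epochs 100 --lr 0.01 --feature 0,1,3,15,17,19 --learn_eps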
+if __name__ == '__main__':
+ main()