# -*- coding: utf-8 -*-
"""edgeClassification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1knU85gMIEeId8DL4gw9VMij_niMggRn6
"""

# from google.colab import drive
# drive.mount('/content/drive')

# !pip install dgl==0.6.1

import dgl
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from dgl.data import *
import tqdm
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

argparser = argparse.ArgumentParser("edge classification training")
argparser.add_argument('--num-epochs', type=int, default=30)
argparser.add_argument('--num-hidden', type=int, default=64)
argparser.add_argument('--num-layers', type=int, default=2)
argparser.add_argument('--fan-out', type=str, default='10,25')  ## randomly sample 10 neighbors in the first layer, 25 in the second
argparser.add_argument('--batch-size', type=int, default=2048)
argparser.add_argument('--lr', type=float, default=0.005)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
                       help="Number of sampling processes. Use 0 for no extra process.")
args = argparser.parse_args([])

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

nodes = pd.read_csv("training_nodes1.csv")
node_names = np.array(nodes)[:, 1]
print(len(node_names))

# Tokenization of each name
tokenized_node_names = []
for node_name in node_names:
    tokenized_node_names.append(word_tokenize(node_name.lower()))
print(tokenized_node_names)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_node_names)]
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, epochs=100)
'''
vector_size = Dimensionality of the feature vectors.
window = The maximum distance between the current and predicted word within a sentence.
min_count = Ignores all words with total frequency lower than this.
alpha = The initial learning rate.
'''
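# Optional sanity check (not in the original notebook): assuming gensim 4.x, the trained
# per-node document vectors live in `model.dv` (it is `model.docvecs` on gensim 3.x).
# Looking up the nearest training documents for a re-inferred vector is a quick way to
# confirm that the Doc2Vec embedding is not degenerate.
probe_vec = model.infer_vector(tokenized_node_names[0])
print(model.dv.most_similar([probe_vec], topn=3))  # tag 0 (or a near-duplicate name) should rank highly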
node_text_feature_list = []
for text in tokenized_node_names:
    node_text_feature_list.append(model.infer_vector(text))
print(node_text_feature_list)

# Normalize the node type, then append it to each node's text feature vector
node_types = np.array(nodes)[:, 2]
type_feature_list = []
for type_value in node_types:
    type_value = int(type_value)
    type_value = type_value / 3  # min-max normalization
    type_feature_list.append(type_value)
print(type_feature_list)
print(len(type_feature_list))

for i in range(0, len(node_text_feature_list)):
    node_text_feature_list[i] = np.append(node_text_feature_list[i], type_feature_list[i])

node_feature_array = node_text_feature_list[0]
for i in range(1, len(node_text_feature_list)):
    node_feature_array = np.vstack((node_feature_array, node_text_feature_list[i]))
print(node_feature_array)

edge_classes = 2  ## number of relation types, i.e. the number of classes for edge classification

fraud_edges = pd.read_csv("fraud_edges_training_index_only.csv")
src = np.array(fraud_edges)[:, 0]
dst = np.array(fraud_edges)[:, 1]

legi_edges = pd.read_csv("legi_edges_training_index_only.csv")
src = np.concatenate([src, np.array(legi_edges)[:, 0]])
dst = np.concatenate([dst, np.array(legi_edges)[:, 1]])
print(len(src))
print(len(dst))

# Build the reverse edges at the same time
email_graph = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src])))

# Attach node and edge features, plus the edge labels
email_graph.ndata['feature'] = th.tensor(node_feature_array)
print(email_graph.ndata['feature'])

fraud_type_array = np.array(fraud_edges)[:, 2]
fraud_label_array = np.ones((len(fraud_type_array)))  # fraud = 1
legi_type_array = np.array(legi_edges)[:, 2]
type_array = np.concatenate([fraud_type_array, legi_type_array])
legi_label_array = np.zeros((len(legi_type_array)))  # legitimate = 0
label_array = np.concatenate([fraud_label_array, legi_label_array])

# Also assign a type and label to the reverse edges
type_array = np.concatenate([type_array, type_array])
label_array = np.concatenate([label_array, label_array])
print(len(label_array))

edge_feature_array = []
# Normalize the edge types
for edge_type in type_array:
    edge_type = edge_type / 4
    edge_feature_array.append(edge_type)
# print(len(edge_feature_array))

email_graph.edata['feature'] = th.tensor(edge_feature_array)
print(email_graph.edata['feature'])
email_graph.edata['label'] = th.LongTensor(label_array)
print(email_graph.edata['label'])
print(email_graph)

# Random train/validation/test split, 6:2:2
train_len = int(email_graph.number_of_edges() * 0.6)
val_len = int(email_graph.number_of_edges() * 0.2)
test_len = email_graph.number_of_edges() - train_len - val_len

pre_train = np.zeros((train_len))
pre_val = np.ones((val_len))
pre_test = np.linspace(2, 100, test_len)  # any value other than 0/1 marks a test edge
pre_split = np.concatenate([np.concatenate([pre_train, pre_val]), pre_test])
print(pre_split)
np.random.shuffle(pre_split)
print(pre_split)

train_id_list, val_id_list, test_id_list = [], [], []
for i in range(0, len(pre_split)):
    if pre_split[i] == 0:
        train_id_list.append(i)
    elif pre_split[i] == 1:
        val_id_list.append(i)
    else:
        test_id_list.append(i)
train_id, val_id, test_id = th.tensor(train_id_list), th.tensor(val_id_list), th.tensor(test_id_list)
print(train_id.shape, val_id.shape, test_id.shape)

# Neighbor sampling, then mini-batch training
n_edges = email_graph.num_edges()

# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler(
    [int(fanout) for fanout in args.fan_out.split(',')])
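# Optional sanity check (not in the original notebook): because the graph was built as
# (src+dst, dst+src), the reverse of edge e is e ± |E|/2, which is exactly the layout that
# `exclude='reverse_id'` / `reverse_eids` in the dataloader below relies on. Verify it for edge 0.
u0, v0 = email_graph.find_edges(0)
u_rev, v_rev = email_graph.find_edges(email_graph.num_edges() // 2)
assert u0.item() == v_rev.item() and v0.item() == u_rev.item(), "reverse-edge layout is broken"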
dataloader = dgl.dataloading.EdgeDataLoader(
    email_graph, train_id, sampler,
    exclude='reverse_id',  # exclude each seed edge's reverse edge from the sampled blocks; otherwise the model can see that the edge exists and "cheat"
    # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
    reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=False,
    num_workers=args.num_workers)

## For debug
for dd in dataloader:
    """The three items are input_nodes, edge_subgraph and blocks respectively."""
    for xx in dd:
        print(xx)
        print()
    break


class TwoLayerGCN(nn.Module):
    """A two-layer GCN model."""

    def __init__(self, in_dim, hid_dim, out_dim):
        super().__init__()
        self.conv1 = dglnn.GraphConv(in_dim, hid_dim, allow_zero_in_degree=True)
        self.conv2 = dglnn.GraphConv(hid_dim, out_dim, allow_zero_in_degree=True)

    def forward(self, blocks, x):
        x = F.relu(self.conv1(blocks[0], x))
        x = F.relu(self.conv2(blocks[1], x))
        return x


class Predictor(nn.Module):
    """Edge prediction module: concatenate the representations of an edge's two endpoint
    nodes, then apply a linear transform to produce the final classification logits."""

    def __init__(self, in_dim, num_classes):
        super().__init__()
        self.W = nn.Linear(2 * in_dim, num_classes)

    def apply_edges(self, edges):
        data = th.cat([edges.src['x'], edges.dst['x']], dim=-1)
        return {'score': self.W(data)}

    def forward(self, edge_subgraph, x):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            edge_subgraph.apply_edges(self.apply_edges)
            return edge_subgraph.edata['score']


class MyModel(nn.Module):
    """The main model: a learnable node embedding, the two-layer GCN, and the edge predictor."""

    def __init__(self, emb_dim, hid_dim, out_dim, num_classes, num_nodes):
        super().__init__()
        self.node_emb = nn.Embedding(num_nodes, emb_dim)
        self.gcn = TwoLayerGCN(emb_dim, hid_dim, out_dim)
        self.predictor = Predictor(out_dim, num_classes)

    def forward(self, edge_subgraph, blocks, input_nodes):
        x = self.node_emb(input_nodes)
        x = self.gcn(blocks, x)
        return self.predictor(edge_subgraph, x)


device = th.device("cuda" if th.cuda.is_available() else "cpu")
print("device: {}".format(device))

model = MyModel(64, args.num_hidden, args.num_hidden, edge_classes, email_graph.num_nodes())
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=args.lr)


def predict(model, email_graph, val_id, device):
    # Create sampler (full neighbor sampling for evaluation)
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
    dataloader = dgl.dataloading.EdgeDataLoader(
        email_graph, val_id, sampler,
        exclude='reverse_id',
        # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
        reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=args.num_workers)

    valid_preds = []
    model.eval()
    with th.no_grad():
        for input_nodes, edges_subgraph, blocks in dataloader:
            edges_subgraph = edges_subgraph.to(device)
            blocks = [block.int().to(device) for block in blocks]
            # The node-ID indices must be on the same device as the embedding table.
            pred = model(edges_subgraph, blocks, input_nodes.to(device))
            pred = pred.cpu().argmax(-1).numpy().tolist()
            valid_preds.extend(pred)
    return valid_preds


labels = email_graph.edata['label']  # labels of all edges in the graph
best_val_acc = 0  # best validation accuracy seen so far
patience = 0      # For early stopping
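# Optional smoke test (not in the original notebook): push a single mini-batch through the
# untrained model to confirm that the per-edge logits have shape (num_seed_edges, edge_classes)
# before committing to the full training loop.
input_nodes, edges_subgraph, blocks = next(iter(dataloader))
with th.no_grad():
    logits = model(edges_subgraph.to(device),
                   [b.to(device) for b in blocks],
                   input_nodes.to(device))
print(logits.shape)  # expected: torch.Size([<=batch_size, edge_classes])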
# Training loop
for epoch in range(args.num_epochs):
    # Loop over the dataloader to sample the computation dependency graph as a list of blocks.
    model.train()  # re-enable training mode (predict() switches the model to eval mode)
    start_time = time.time()
    all_loss = []
    trn_label, trn_pred = [], []
    n_batches = len(dataloader)
    log_every = max(n_batches // 10, 1)  # guard against division by zero when there are fewer than 10 batches
    for step, (input_nodes, edges_subgraph, blocks) in enumerate(dataloader):
        edges_subgraph = edges_subgraph.to(device)
        blocks = [block.to(device) for block in blocks]
        # Compute loss and prediction
        edge_preds = model(edges_subgraph, blocks, input_nodes.to(device))
        loss = loss_fcn(edge_preds, edges_subgraph.edata['label'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        all_loss.append(loss.item())
        trn_label.extend(edges_subgraph.edata['label'].cpu().numpy().tolist())
        trn_pred.extend(edge_preds.argmax(-1).detach().cpu().numpy().tolist())

        if (step + 1) % log_every == 0:
            cur_acc = metrics.accuracy_score(trn_label, trn_pred)
            print('Epoch {:2d} | Step {:04d}/{} | Loss {:.4f} | Avg Loss {:.4f} | Acc {:.4f} | Time {:.2f}s'.format(
                epoch + 1, step, n_batches, loss.item(), np.mean(all_loss), cur_acc, time.time() - start_time))

    ## Prediction on the validation set
    val_preds = predict(model, email_graph, val_id, device)
    val_acc = metrics.accuracy_score(labels[val_id], val_preds)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience = 0
        th.save(model.state_dict(), "edge_cls_best_model.bin")
    else:
        patience += 1
    print('Cur Val Acc {:.4f}, Best Val Acc {:.4f}, Time {:.2f}s'.format(
        val_acc, best_val_acc, time.time() - start_time))

    ## Early stopping: if validation accuracy has not improved for more than three consecutive epochs, stop training.
    if patience > 3:
        break

model.load_state_dict(th.load("edge_cls_best_model.bin"))
test_preds = predict(model, email_graph, test_id, device)
print("test acc: {:.4f}".format(metrics.accuracy_score(labels[test_id], test_preds)))
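# Optional extra evaluation (not in the original notebook): fraud and legitimate edges may well
# be imbalanced, in which case accuracy alone can be misleading; scikit-learn's per-class
# precision/recall/F1 report gives a fuller picture on the same test predictions.
print(metrics.classification_report(labels[test_id].numpy(), test_preds,
                                    target_names=['legitimate', 'fraud'], digits=4))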