# -*- coding: utf-8 -*-
"""edgeClassification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1knU85gMIEeId8DL4gw9VMij_niMggRn6
"""

# from google.colab import drive
# drive.mount('/content/drive')
#
# !pip install dgl==0.6.1

import dgl
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from dgl.data import *
import tqdm
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

argparser = argparse.ArgumentParser("edge classification training")
argparser.add_argument('--num-epochs', type=int, default=30)
argparser.add_argument('--num-hidden', type=int, default=64)
argparser.add_argument('--num-layers', type=int, default=2)
argparser.add_argument('--fan-out', type=str, default='10,25')  ## randomly sample 10 neighbors at the first layer, 25 at the second
argparser.add_argument('--batch-size', type=int, default=2048)
argparser.add_argument('--lr', type=float, default=0.005)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
                       help="Number of sampling processes. Use 0 for no extra process.")
args = argparser.parse_args([])
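# parse_args([]) ignores sys.argv, so the defaults above are what actually run (handy
# inside a notebook). A hypothetical command-line invocation with the same settings
# would look roughly like this (illustrative only; parse_args would need sys.argv for
# it to take effect):
#   python edgeClassificationEmailGraph.py --num-epochs 30 --num-hidden 64 \
#       --num-layers 2 --fan-out 10,25 --batch-size 2048 --lr 0.005 --dropout 0.5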

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

nodes=pd.read_csv("training_nodes1.csv")
node_names=np.array(nodes)[:,1]
print(len(node_names))
# Tokenization of each name
tokenized_node_names = [] 
for node_name in node_names: 
  tokenized_node_names.append(word_tokenize(node_name.lower()))
print(tokenized_node_names)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument 
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_node_names)] 
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)
'''
vector_size = Dimensionality of the feature vectors.
window = The maximum distance between the current and predicted word within a sentence.
min_count = Ignores all words with total frequency lower than this.
alpha = The initial learning rate.
'''

node_text_feature_list=[]
for text in tokenized_node_names:
  node_text_feature_list.append(model.infer_vector(text))
print(node_text_feature_list)
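
# A quick sanity check, illustrative only: each inferred vector should have
# vector_size (=20) dimensions, and re-inferring the same tokens should give a vector
# roughly similar to the first one (infer_vector is stochastic, so only a loose
# cosine-similarity check is meaningful).
assert len(node_text_feature_list[0]) == 20
_v1 = np.asarray(node_text_feature_list[0])
_v2 = np.asarray(model.infer_vector(tokenized_node_names[0]))
_cos = float(np.dot(_v1, _v2) / (np.linalg.norm(_v1) * np.linalg.norm(_v2) + 1e-12))
print("cosine similarity between two inferences of the same name: {:.3f}".format(_cos))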

# Normalize the node type first, then append it to the node feature vector
node_types=np.array(nodes)[:,2]
type_feature_list=[]
for type_value in node_types:
  type_value=int(type_value)
  type_value=type_value/3 # min-max normalization (divide by the maximum type value, 3)
  type_feature_list.append(type_value)
print(type_feature_list)
print(len(type_feature_list))

for i in range(0,len(node_text_feature_list)):
  node_text_feature_list[i]=np.append(node_text_feature_list[i],type_feature_list[i])
node_feature_array=node_text_feature_list[0]
for i in range(1,len(node_text_feature_list)):
  node_feature_array=np.vstack((node_feature_array,node_text_feature_list[i]))
print(node_feature_array)
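
# Equivalent one-shot construction (sketch): np.vstack accepts the whole list at once,
# which avoids re-copying the array on every iteration of the loop above. The check
# below just confirms both constructions agree.
_stacked = np.vstack(node_text_feature_list)
assert np.allclose(_stacked, node_feature_array)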

edge_classes = 2   ## number of relation types, i.e. the number of classes for edge classification
fraud_edges=pd.read_csv("fraud_edges_training_index_only.csv")
src=np.array(fraud_edges)[:,0]
dst=np.array(fraud_edges)[:,1]
legi_edges=pd.read_csv("legi_edges_training_index_only.csv")
src=np.concatenate([src,np.array(legi_edges)[:,0]])
dst=np.concatenate([dst,np.array(legi_edges)[:,1]])
print(len(src))
print(len(dst))
# Also build the reverse edges
email_graph = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src])))
# Build node and edge features, and the edge labels
email_graph.ndata['feature'] = th.tensor(node_feature_array)
print(email_graph.ndata['feature'])
fraud_type_array=np.array(fraud_edges)[:,2]
fraud_label_array=np.ones((len(fraud_type_array))) # fraud = 1
legi_type_array=np.array(legi_edges)[:,2]
type_array=np.concatenate([fraud_type_array,legi_type_array])
legi_label_array=np.zeros((len(legi_type_array))) # legitimate = 0
label_array=np.concatenate([fraud_label_array,legi_label_array])
# Add type and label for the reverse edges as well
type_array=np.concatenate([type_array,type_array])
label_array=np.concatenate([label_array,label_array])
print(len(label_array))
edge_feature_array=[]
# Normalize the edge types
for edge_type in type_array:
  edge_type=edge_type/4
  edge_feature_array.append(edge_type)
# print(len(edge_feature_array))
email_graph.edata['feature'] = th.tensor(edge_feature_array)
print(email_graph.edata['feature'])
email_graph.edata['label'] = th.LongTensor(label_array)
print(email_graph.edata['label'])
print(email_graph)
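
# Sanity check, illustrative only: with the construction above, edge e and edge
# e + |E|/2 connect the same node pair in opposite directions and carry the same
# label. This is exactly the layout that exclude='reverse_id' / reverse_eids in the
# EdgeDataLoader further below relies on.
_half = email_graph.number_of_edges() // 2
_u, _v = email_graph.edges()
assert th.equal(_u[:_half], _v[_half:]) and th.equal(_v[:_half], _u[_half:])
assert th.equal(email_graph.edata['label'][:_half], email_graph.edata['label'][_half:])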

# Random train/validation/test split, 6:2:2
train_len=int(email_graph.number_of_edges() * 0.6)
val_len=int(email_graph.number_of_edges() * 0.2)
test_len=email_graph.number_of_edges()-train_len-val_len
# Markers: 0 = train, 1 = validation, anything else (values >= 2) = test
pre_train=np.zeros((train_len))
pre_val=np.ones((val_len))
pre_test=np.linspace(2,100,test_len)
pre_split=np.concatenate([np.concatenate([pre_train,pre_val]),pre_test])
print(pre_split)
np.random.shuffle(pre_split)
print(pre_split)
train_id_list,val_id_list,test_id_list=[],[],[]
for i in range(0,len(pre_split)):
  if pre_split[i]==0:
    train_id_list.append(i)
  elif pre_split[i]==1:
    val_id_list.append(i)
  else:
    test_id_list.append(i)
train_id,val_id,test_id=th.tensor(train_id_list),th.tensor(val_id_list),th.tensor(test_id_list)
print(train_id.shape,val_id.shape,test_id.shape)
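
# An equivalent, simpler split (sketch only, not used below): permute all edge IDs
# once and slice, instead of building the 0/1/linspace marker array above.
#   perm = np.random.permutation(email_graph.number_of_edges())
#   train_id = th.tensor(perm[:train_len])
#   val_id = th.tensor(perm[train_len:train_len + val_len])
#   test_id = th.tensor(perm[train_len + val_len:])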

# Neighbor sampling, then mini-batch training
n_edges=email_graph.num_edges()
# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler(
            [int(fanout) for fanout in args.fan_out.split(',')])
dataloader = dgl.dataloading.EdgeDataLoader(
        email_graph, train_id, sampler,
        exclude='reverse_id',    # exclude the reverse edge of each sampled edge; otherwise the model could see that the edge exists and "cheat"
        # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
        reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=args.num_workers)

## For debug
for dd in dataloader:
    """分别是input_nodes, edge_subgraph,blocks"""
    for xx in dd:
        print(xx)
        print()
    break

class TwoLayerGCN(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim):
        """两层的GCN模型"""
        super().__init__()
        self.conv1 = dglnn.GraphConv(in_dim, hid_dim, allow_zero_in_degree=True)
        self.conv2 = dglnn.GraphConv(hid_dim, out_dim, allow_zero_in_degree=True)
    
    def forward(self, blocks, x):
        x = F.relu(self.conv1(blocks[0], x))
        x = F.relu(self.conv2(blocks[1], x))
        return x
    
class Predictor(nn.Module):
    """边预测模块,将边两端节点表示拼接,然后接一个线性变换,得到最后的分类表示输出"""
    def __init__(self, in_dim, num_classes):
        super().__init__()
        self.W = nn.Linear(2 * in_dim, num_classes)
        
    def apply_edges(self, edges):
        data = th.cat([edges.src['x'], edges.dst['x']], dim=-1)
        return {'score': self.W(data)}
    
    def forward(self, edge_subgraph, x):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            edge_subgraph.apply_edges(self.apply_edges)
            return edge_subgraph.edata['score']
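
# A tiny shape check, illustrative only (uses a 3-node toy graph; not part of the
# training pipeline): the predictor should return one score vector per edge.
_toy_g = dgl.graph(([0, 1, 2], [1, 2, 0]))
_toy_x = th.randn(3, 8)
_toy_h = TwoLayerGCN(8, 16, 8)([_toy_g, _toy_g], _toy_x)
_toy_scores = Predictor(8, edge_classes)(_toy_g, _toy_h)
print("toy predictor output shape:", _toy_scores.shape)  # expected: (3, edge_classes)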

class MyModel(nn.Module):
    """主模型:结构比较清晰"""
    def __init__(self, emb_dim, hid_dim, out_dim, num_classes, num_nodes):
        super().__init__()
        self.node_emb = nn.Embedding(num_nodes, emb_dim)  
        self.gcn = TwoLayerGCN(emb_dim, hid_dim, out_dim)
        self.predictor = Predictor(out_dim, num_classes)
        
    def forward(self, edge_subgraph, blocks, input_nodes):
        x = self.node_emb(input_nodes)
        x = self.gcn(blocks, x)
        return self.predictor(edge_subgraph, x)

device = th.device("cuda" if th.cuda.is_available() else "cpu")
print("device: {}".format(device))

model = MyModel(64, args.num_hidden, args.num_hidden, edge_classes, email_graph.num_nodes())
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()   # cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=args.lr)


def predict(model, email_graph, val_id, device):
    # Create a full-neighbor sampler (no sampling, used for evaluation)
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
    dataloader = dgl.dataloading.EdgeDataLoader(
        email_graph, val_id, sampler, exclude='reverse_id',
        # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
        reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=args.num_workers)
    
    valid_preds = []
    model.eval()
    with th.no_grad():
        for input_nodes, edges_subgraph, blocks in dataloader:
            edges_subgraph = edges_subgraph.to(device)
            blocks = [block.int().to(device) for block in blocks]
            input_nodes = input_nodes.to(device)  # node IDs must be on the same device as the embedding table
            pred = model(edges_subgraph, blocks, input_nodes)
            pred = pred.cpu().argmax(-1).numpy().tolist()
            valid_preds.extend(pred)
    return valid_preds

labels = email_graph.edata['label']  # labels of every edge in the graph
best_val_acc = 0  # best validation accuracy so far
patience = 0  # For early stopping

# Training loop
for epoch in range(args.num_epochs):
    # Loop over the dataloader to sample the computation dependency graph as a list of
    # blocks.
    start_time = time.time()
    all_loss = []
    trn_label, trn_pred = [], []
    n_batches = len(dataloader)
    
    for step, (input_nodes, edges_subgraph, blocks) in enumerate(dataloader):
        edges_subgraph = edges_subgraph.to(device)
        blocks = [block.to(device) for block in blocks]
        input_nodes = input_nodes.to(device)  # node IDs must be on the same device as the embedding table

        # Compute loss and prediction
        edge_preds = model(edges_subgraph, blocks, input_nodes)
        loss = loss_fcn(edge_preds, edges_subgraph.edata['label'])  
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        all_loss.append(loss.item())
        trn_label.extend(edges_subgraph.edata['label'].cpu().numpy().tolist())
        trn_pred.extend(edge_preds.argmax(-1).detach().cpu().numpy().tolist())
        
        if (step+1) % max(1, n_batches//10) == 0:  # log roughly 10 times per epoch; guard against fewer than 10 batches
            cur_acc = metrics.accuracy_score(trn_label, trn_pred)
            print('Epoch {:2d} | Step {:04d}/{} | Loss {:.4f} | Avg Loss {:.4f} | Acc {:.4f} | Time {:.2f}s'.format(
                  epoch+1, step, n_batches, loss.item(), np.mean(all_loss), cur_acc, time.time() - start_time))

    ## Predict on the validation set
    val_preds = predict(model, email_graph, val_id, device)
    val_acc = metrics.accuracy_score(labels[val_id], val_preds)
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience = 0
        th.save(model.state_dict(), "edge_cls_best_model.bin")
    else:
        patience += 1
    print('Cur Val Acc {:.4f}, Best Val Acc {:.4f}, Time {:.2f}s'.format(
          val_acc, best_val_acc, time.time() - start_time))
    
    ## Early stopping: if validation accuracy has not improved for more than three consecutive epochs, stop training
    if patience > 3:
        break

model.load_state_dict(th.load("edge_cls_best_model.bin"))

test_preds = predict(model, email_graph, test_id, device)
print("test acc: {:.4f}".format(metrics.accuracy_score(labels[test_id], test_preds)))