author    unknown <[email protected]>  2023-07-29 11:20:27 +0800
committer unknown <[email protected]>  2023-07-29 11:20:27 +0800
commit    7592577acc00163e98b45bba86ef76bd37f93854 (patch)
tree      671fdf3b4715241c84d5b428a4b5dcaa37ac9d6b /code
parent    5bee0b3e68ef600fea9fe8cbaca0d512a8d21998 (diff)
reorganize
Diffstat (limited to 'code')
-rw-r--r--  code/analysisDataset.py               47
-rw-r--r--  code/buildGraph.py                   592
-rw-r--r--  code/buildGraph2.py                  106
-rw-r--r--  code/buildGraphviz.py                 24
-rw-r--r--  code/buildSubGraph.py                 72
-rw-r--r--  code/doStatistics.py                  25
-rw-r--r--  code/drawGraph.py                     81
-rw-r--r--  code/edgeClassification.py           268
-rw-r--r--  code/edgeClassificationEmailGraph.py 297
-rw-r--r--  code/emailPreparation.py              57
-rw-r--r--  code/graphsage.py                    390
-rw-r--r--  code/hunterGraph.py                  286
-rw-r--r--  code/parseEml.py                     324
13 files changed, 2569 insertions, 0 deletions
diff --git a/code/analysisDataset.py b/code/analysisDataset.py
new file mode 100644
index 0000000..34c7a06
--- /dev/null
+++ b/code/analysisDataset.py
@@ -0,0 +1,47 @@
+from parseEml import parseEml
+import os
+
+def read_spf(email_path):
+ mail = parseEml(email_path)
+ auth_result = mail.get_auth_results()
+ if auth_result and "spf=" in auth_result:
+ tmp_list = auth_result.split("spf=")
+ spf_result=tmp_list[1].split(" ")[0]
+ else:
+ spf_result=None
+ return spf_result
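+# e.g. given an Authentication-Results value like "mx.example.com; spf=pass (sender IP is 203.0.113.7) ...",
+# read_spf returns "pass" (hypothetical header value).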
+
+#Input: path to a folder of email files
+#Output: prints a dict counting the SPF authentication results
+def spf_count(email_folder):
+ spf_count_dict={}
+ files = os.listdir(email_folder)
+ print(email_folder+":\ntotal: "+str(len(files)))
+    for file in files: # iterate over the email folder
+ if file == "duplicate":
+ continue
+ spf_result=read_spf(email_folder+"/"+file)
+ if spf_result in spf_count_dict:
+ spf_count_dict[spf_result]+=1
+ else:
+ spf_count_dict[spf_result]=1
+ print(spf_count_dict)
+
+#Input: file listing intermediate domains (one per line)
+#Output: prints the occurrence count of each domain, sorted by frequency
+def inter_domain_count(inter_domain_file):
+ count_dict={}
+ with open(inter_domain_file,"r",encoding="utf-8") as f:
+ line = f.readline()
+ while line:
+ line=line.strip()
+ if line in count_dict:
+ count_dict[line]+=1
+ else:
+ count_dict[line]=1
+ line = f.readline()
+ a = sorted(count_dict.items(), key=lambda x: x[1], reverse=True)
+ print(a)
+
+if __name__ == "__main__":
+    inter_domain_count("inter_domain.txt")
\ No newline at end of file
diff --git a/code/buildGraph.py b/code/buildGraph.py
new file mode 100644
index 0000000..8cb8802
--- /dev/null
+++ b/code/buildGraph.py
@@ -0,0 +1,592 @@
+from parseEml import parseEml
+import os
+import re
+from shutil import copyfile
+
+def extract_node_edge(email_path):
+    # Node types: 0: sender domain; 1: intermediate domain; 2: IP; 3: client (x-mailer)
+    # Edge types: 0: sender domain - intermediate domain; 1: intermediate domain - IP or IP - intermediate domain;
+    # Edge types: 2: sender domain - message id domain; 3: sender domain - x_mailer; 4: sender domain - dkim domain
+    # inter_node_list: one set per Received hop, holding that hop's domains and IPs
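+    # For illustration (hypothetical values): a node_list entry looks like "example.com,0" and an
+    # edge_list entry like "example.com,mx.example.net,0", matching the formats built below.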
+ node_list=set()
+ edge_list=[]
+ mail=parseEml(email_path)
+ raw_node_list=mail.get_from_host_list()
+ raw_node_list.insert(0,mail.get_from())
+ # print(raw_node_list)
+ inter_node_list=[]
+ sender_domain=None
+ for node in raw_node_list:
+ if '@' in node:
+ node=node.split('@')[-1]
+ if '>' in node:
+ node=node.split(">")[0]
+ if ')' in node:
+ node = node.split(")")[0]
+ if ',' in node:
+ node=node.replace(","," ")
+ sender_domain=node
+ # if "kennadi" in sender_domain:
+ # print(email_path)
+ node_list.add(node+",0")
+ else:
+ inter_domain_ip=set()
+ inter_nodes=re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+',node)
+ # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node)
+ if len(inter_nodes)!=0:
+ for inter_node in inter_nodes:
+ if len(re.findall(r'[-a-zA-Z]',inter_node)):
+ domain_sets = inter_node.split(".")
+ if len(domain_sets) > 2:
+ inter_node = ".".join(domain_sets[1:])
+ if sender_domain:
+                            edge_list.append(sender_domain+","+inter_node+",0")# edge from the sender domain to an intermediate domain
+ inter_domain_ip.add(inter_node)
+ node_list.add(inter_node + ",1")
+ else:
+ inter_domain_ip.add(inter_node)
+ node_list.add(inter_node+",2")
+ if len(inter_domain_ip):
+ inter_node_list.append(inter_domain_ip)
+ # print(node_list)
+ print(sender_domain)
+ print(inter_node_list)
+ for domain_ip_set in inter_node_list:
+ if len(domain_ip_set) > 1:
+ domain_ip_list=list(domain_ip_set)
+ for i in range(0,len(domain_ip_list)-1):
+ for j in range(i+1,len(domain_ip_list)):
+ edge_list.append(domain_ip_list[i]+","+domain_ip_list[j]+",1")
+ print(edge_list)
+ return node_list,edge_list
+
+def extract_sender_and_received(email_folder,node_file,edge_file):
+ with open(node_file, 'a+') as f:
+ f.write('index,name,type\n')
+ with open(edge_file,'a+') as edge_f:
+ edge_f.write('node1,node2,type\n')
+ node_list=set()
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+ if file == "duplicate":
+ continue
+ nodes,edges=extract_node_edge(email_folder + "/" + file)
+ node_list.update(nodes)
+ with open(edge_file,'a+') as edge_f:
+ for edge in edges:
+ edge_f.write(edge+"\n")
+ with open(node_file, 'a+',encoding="utf-8") as f:
+ i=0
+ for node in node_list:
+ node=str(i)+','+node
+ f.write(node+"\n")
+ i+=1
+
+def extract_domain_from_address(fromname):
+ sender_domain=None
+ if '@' in fromname:
+ sender_domain = fromname.split('@')[-1]
+ if '>' in sender_domain:
+ sender_domain = sender_domain.split(">")[0]
+ if ')' in sender_domain:
+ sender_domain = sender_domain.split(")")[0]
+ if ',' in sender_domain:
+ sender_domain = sender_domain.replace(",", " ")
+ return sender_domain
+
+def add_message_id_edge(email_folder,edge_file):
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+        if file == "duplicate":
+            continue
+        mail = parseEml(email_folder+"/"+file)
+        fromname = mail.get_from()
+        message_id=mail.get_message_id()
+        sender_domain = None  # avoid an unbound variable when the From address has no '@'
+ if '@' in fromname:
+ sender_domain = fromname.split('@')[-1]
+ if '>' in sender_domain:
+ sender_domain = sender_domain.split(">")[0]
+ if ')' in sender_domain:
+ sender_domain = sender_domain.split(")")[0]
+ if message_id != None:
+ message_id_domain=message_id.split('@')[-1]
+ message_id_domain=message_id_domain.split(">")[0]
+ if sender_domain != message_id_domain and sender_domain:
+ with open(edge_file, 'a+',encoding='utf-8') as edge_f:
+ edge_f.write(sender_domain+","+message_id_domain+",2\n")
+
+def add_x_mailer_edge(email_folder,edge_file):
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+        if file == "duplicate":
+            continue
+        mail = parseEml(email_folder+"/"+file)
+        fromname = mail.get_from()
+        x_mailer=mail.get_x_mailer()
+        sender_domain = None  # avoid an unbound variable when the From address has no '@'
+ if x_mailer:
+ x_mailer=x_mailer.replace("\n","")
+ x_mailer=x_mailer.replace(",","")
+ if '@' in fromname:
+ sender_domain = fromname.split('@')[-1]
+ if '>' in sender_domain:
+ sender_domain = sender_domain.split(">")[0]
+ if ')' in sender_domain:
+ sender_domain = sender_domain.split(")")[0]
+ if x_mailer != None and sender_domain:
+ with open(edge_file, 'a+',encoding="utf-8") as edge_f:
+ edge_f.write(sender_domain+","+x_mailer+",3\n")
+
+def add_dkim_edge(email_folder,edge_file):
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+        if file == "duplicate":
+            continue
+        mail = parseEml(email_folder+"/"+file)
+        fromname = mail.get_from()
+        dkim_signature=mail.get_dkim()
+        sender_domain = None  # avoid an unbound variable when the From address has no '@'
+ if dkim_signature:
+ dkim_signature=dkim_signature.replace("\n\t","")
+ dkim_domains=re.findall(r'd=(.+?);',dkim_signature)
+ if len(dkim_domains)==0:
+ # dkim_domain=dkim_domains[0]
+ print(dkim_signature)
+ else:
+ dkim_domain=dkim_domains[0]
+ if '@' in fromname:
+ sender_domain = fromname.split('@')[-1]
+ if '>' in sender_domain:
+ sender_domain = sender_domain.split(">")[0]
+ if ')' in sender_domain:
+ sender_domain = sender_domain.split(")")[0]
+ if sender_domain and sender_domain != dkim_domain:
+ with open(edge_file, 'a+', encoding="utf-8") as edge_f:
+ edge_f.write(sender_domain + "," + dkim_domain + ",4\n")
+
+
+import csv
+def add_nodes(node_file,edge_file,new_node_file):
+ nodes_set=set()
+    # read the csv file row by row
+ with open(node_file, 'r', encoding="utf-8") as csvfile:
+ nodes = csv.DictReader(csvfile)
+ for node in nodes:
+ nodes_set.add(node["name"]+","+node["type"])
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges=csv.DictReader(edgefile)
+ for edge in edges:
+ if edge["type"]=='2' or edge["type"]=='4':
+ nodes_set.add(edge["node2"]+","+str(1))
+ else:
+ nodes_set.add(edge["node2"]+","+str(3))
+ with open(new_node_file, 'a+',encoding="utf-8") as f:
+ f.write('index,name,type\n')
+ i = 0
+ for new_node in nodes_set:
+ new_node = str(i) + ',' + new_node
+ f.write(new_node + "\n")
+ i += 1
+
+def is_ip(str):
+ domain_and_ip=re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+',str)
+ domain=re.findall(r'[-a-zA-Z-]',str)
+ if len(domain_and_ip) and (len(domain)==0):
+ nums=str.split(".")
+ if len(nums)==4:
+ return True
+ return False
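+# Quick sanity check (hypothetical values): is_ip("203.0.113.7") -> True, is_ip("mail.example.com") -> False.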
+
+import pandas as pd
+def nodes_to_index(node_file,edge_file,new_edge_file):
+ nodes=pd.read_csv(node_file,encoding='utf-8')
+ edge_list=[]
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges = csv.DictReader(edgefile)
+ for edge in edges:
+ if edge['type']=='0':
+ print("hi:"+edge['node1'])
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==0)].index.tolist()[0]
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0]
+            elif edge['type']=='1':# note: distinguish domains from IPs
+ if is_ip(edge['node1']):
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==2)].index.tolist()[0]
+ else:
+ print(edge["node1"])
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==1)].index.tolist()[0]
+ if is_ip(edge['node2']):
+ print(edge["node2"])
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==2)].index.tolist()[0]
+ else:
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0]
+ elif edge['type']=='2' or edge['type'] == '4':
+ node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0]
+ node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 1)].index.tolist()[0]
+ elif edge['type']=='3':
+ node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0]
+ node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 3)].index.tolist()[0]
+ edge_list.append(str(node1_index)+","+str(node2_index)+","+str(edge['type']))
+ with open(new_edge_file, 'a+', encoding="utf-8") as f:
+ for new_edge in edge_list:
+ f.write(new_edge + "\n")
+
+def nodes_to_index_mes_id(node_file,edge_file,new_edge_file):
+ nodes=pd.read_csv(node_file,encoding='utf-8')
+ edge_list=[]
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges = csv.DictReader(edgefile)
+ for edge in edges:
+ print(edge["node1"])
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==0)].index.tolist()[0]
+ if edge['type']=='2' or edge['type'] == '4':
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0]
+ elif edge['type']=='3':
+ node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 3)].index.tolist()[0]
+ edge_list.append(str(node1_index)+","+str(node2_index)+","+str(edge['type']))
+ with open(new_edge_file, 'w', encoding="utf-8") as f:
+ for new_edge in edge_list:
+ f.write(new_edge + "\n")
+
+# Build a meta-graph with graphviz.
+import pygraphviz as pgv  # required by plot_graph below
+import json
+def plot_graph(node_file,edge_file_fraud,edge_file_legi):
+ ag = pgv.AGraph(strict=False, directed=False,rankdir="LR")
+ with open(edge_file_fraud, 'r', encoding="utf-8") as edgefile_fraud:
+ reader=csv.reader(edgefile_fraud)
+ edges_fraud=[" ".join(row) for row in reader]
+ edge_count_fraud=pd.value_counts(edges_fraud).to_dict()
+ with open(edge_file_legi, 'r', encoding="utf-8") as edgefile_legi:
+ reader1=csv.reader(edgefile_legi)
+ edges_legi=[" ".join(row) for row in reader1]
+ edge_count_legi=pd.value_counts(edges_legi).to_dict()
+ with open(node_file, 'r', encoding="utf-8") as nodefile:
+ nodes = csv.DictReader(nodefile)
+ for node in nodes:
+ if node["type"] == '0':
+ ag.add_node(node["index"], label=node["name"], shape="box", color="blue")
+ # ag.add_node(node["index"], shape="box",color="blue")
+ # ag.add_node(node["index"], shape="point", color="blue")
+ elif node["type"] == '1':
+ ag.add_node(node["index"], label=node["name"], shape="ellipse")
+ # ag.add_node(node["index"], shape="ellipse")
+ # ag.add_node(node["index"], shape="point",color="green")
+ elif node["type"] == '2':
+ ag.add_node(node["index"], shape="point")
+ else:
+ ag.add_node(node["index"], label=node["name"], shape="diamond")
+ # ag.add_node(node["index"], shape="diamond")
+ # ag.add_node(node["index"], shape="point", color="purple")
+ for key in edge_count_fraud:
+ edge_param=key.split(" ")
+ ag.add_edge(edge_param[0],edge_param[1],label=edge_count_fraud[key],color="red")
+ for key in edge_count_legi:
+ edge_param=key.split(" ")
+ ag.add_edge(edge_param[0], edge_param[1], label=edge_count_legi[key])
+ ag.layout('dot')
+ ag.draw('graph_dot.svg')
+
+def select_legi_emails(email_folder):
+ files = os.listdir(email_folder)
+ i=0
+    for file in files: # iterate over the email folder
+ if i<2483:
+ copyfile(email_folder + "/" + file,"datacon_1_legi_train/"+file)
+ if i>=2483 and i < 3725:
+ copyfile(email_folder + "/" + file,"datacon_1_legi_val/"+file)
+ if i>=3725:
+ copyfile(email_folder + "/" + file,"datacon_1_legi_test/"+file)
+ i += 1
+
+def merge_node(node_file1,node_file2,new_node_file):
+    # merge two node files and unify the indices
+    nodes_set = set()
+    # read the csv file row by row
+ with open(node_file1, 'r', encoding="utf-8") as csvfile:
+ nodes = csv.DictReader(csvfile)
+ for node in nodes:
+ nodes_set.add(node["name"] + "," + node["type"])
+ with open(node_file2, 'r', encoding="utf-8") as nodefile2:
+ nodes2 = csv.DictReader(nodefile2)
+ for node2 in nodes2:
+ nodes_set.add(node2["name"] + "," + node2["type"])
+ with open(new_node_file, 'a+', encoding="utf-8") as f:
+ f.write('index,name,type\n')
+ i = 0
+ for new_node in nodes_set:
+ new_node = str(i) + ',' + new_node
+ f.write(new_node + "\n")
+ i += 1
+
+import json
+
+def _str2tuple(key):
+    # Note: Python slices are half-open (the start index is included, the end index is excluded)
+ fore = int(key[1:2])
+ back = key[5: -2]
+ return tuple([fore, back])
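+# e.g. _str2tuple("(0, 'example.com')") returns (0, 'example.com'); keys are produced by str(tuple)
+# in one_email_to_graph below (the domain value here is a hypothetical placeholder).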
+
+
+def one_email_to_graph(email_path,node_file,edge_file):
+ with open(node_file, 'r',encoding='UTF-8') as node_f:
+ node_dict = json.load(node_f)
+ node_dict = json.loads(node_dict)
+ node_dict = {_str2tuple(k): node_dict[k] for k in node_dict}
+ edge_list = []
+ mail = parseEml(email_path)
+ raw_node_list = mail.get_from_host_list()
+ raw_node_list.insert(0, mail.get_from())
+ # print(raw_node_list)
+ inter_node_list = []
+ sender_domain = None
+ for node in raw_node_list:
+ if '@' in node:
+ node = node.split('@')[-1]
+ if '>' in node:
+ node = node.split(">")[0]
+ if ')' in node:
+ node = node.split(")")[0]
+ if ',' in node:
+ node = node.replace(",", " ")
+ sender_domain = node
+ if "monkey.org\n" in sender_domain:
+ print(email_path)
+ if (0,node) not in node_dict:
+ node_dict[(0,node)]=len(node_dict)
+ else:
+ inter_domain_ip = set()
+ inter_nodes = re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node)
+ # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node)
+ if len(inter_nodes) != 0:
+ for inter_node in inter_nodes:
+ if len(re.findall(r'[-a-zA-Z]', inter_node)):
+ domain_sets = inter_node.split(".")
+ if len(domain_sets) > 2:
+ inter_node = ".".join(domain_sets[1:])
+ if (1, inter_node) not in node_dict:
+ node_dict[(1, inter_node)] = len(node_dict)
+ if sender_domain:
+                            edge_list.append(str(node_dict[(0,sender_domain)]) + "," + str(node_dict[(1,inter_node)]) + ",0") # edge from the sender domain to an intermediate domain
+ inter_domain_ip.add((1,inter_node))
+ else:
+ inter_domain_ip.add((2,inter_node))
+ if (2, inter_node) not in node_dict:
+ node_dict[(2, inter_node)] = len(node_dict)
+ if len(inter_domain_ip):
+ inter_node_list.append(inter_domain_ip)
+ # print(node_list)
+ # print(sender_domain)
+ # print(inter_node_list)
+ for domain_ip_set in inter_node_list:
+ if len(domain_ip_set) > 1:
+ domain_ip_list = list(domain_ip_set)
+ for i in range(0, len(domain_ip_list) - 1):
+ for j in range(i + 1, len(domain_ip_list)):
+ edge_list.append(str(node_dict[domain_ip_list[i]]) + "," + str(node_dict[domain_ip_list[j]]) + ",1")
+ print(edge_list)
+
+ #message-id
+ message_id = mail.get_message_id()
+ if message_id != None:
+ message_id_domain = message_id.split('@')[-1]
+ message_id_domain = message_id_domain.split(">")[0]
+ if sender_domain != message_id_domain and sender_domain:
+ if (1,message_id_domain ) not in node_dict:
+ node_dict[(1, message_id_domain)] = len(node_dict)
+ edge_list.append(str(node_dict[(0,sender_domain)]) + "," + str(node_dict[(1,message_id_domain)]) + ",2")
+
+ #x-mailer
+ x_mailer = mail.get_x_mailer()
+ if x_mailer:
+ x_mailer = x_mailer.replace("\n", "")
+ x_mailer = x_mailer.replace(",", "")
+ if x_mailer != None and sender_domain:
+ if (3, x_mailer) not in node_dict:
+ node_dict[(3, x_mailer)] = len(node_dict)
+ edge_list.append(str(node_dict[(0, sender_domain)]) + "," + str(node_dict[(3, x_mailer)]) + ",3")
+
+ #dkim-domain
+ dkim_signature = mail.get_dkim()
+ if dkim_signature:
+ dkim_signature = dkim_signature.replace("\n\t", "")
+ dkim_domains = re.findall(r'd=(.+?);', dkim_signature)
+ if len(dkim_domains) == 0:
+ # dkim_domain=dkim_domains[0]
+ print(dkim_signature)
+ else:
+ dkim_domain = dkim_domains[0]
+ if sender_domain and sender_domain != dkim_domain:
+ if (1, dkim_domain) not in node_dict:
+ node_dict[(1, dkim_domain)] = len(node_dict)
+ edge_list.append(str(node_dict[(0, sender_domain)]) + "," + str(node_dict[(1, dkim_domain)]) + ",4")
+
+ with open(node_file, 'w', encoding="utf-8") as f:
+ node_dict=json.dumps({str(k):node_dict[k] for k in node_dict})
+ json.dump(node_dict,f)
+ with open(edge_file,'a+',encoding="utf-8") as edge_f:
+ for edge in edge_list:
+ edge_f.writelines(edge)
+ edge_f.writelines("\n")
+
+def email_batch_to_graph(email_folder,node_file,edge_file):
+ node_list = set()
+ with open(node_file, 'r', encoding="utf-8") as csvfile:
+ nodes = csv.DictReader(csvfile)
+ for node in nodes:
+ node_list.add(node["name"]+","+node["type"])
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+ if file == "duplicate":
+ continue
+ nodes, edges = one_email_to_edges(email_folder + "/" + file)
+ node_list.update(nodes)
+ with open(edge_file, 'a+', encoding="utf-8") as edge_f:
+ for edge in edges:
+ edge_f.write(edge + "\n")
+ with open(node_file, 'w', encoding="utf-8") as f:
+ f.write("index,name,type\n")
+ i = 0
+ for node in node_list:
+ node = str(i) + ',' +node
+ f.write(node + "\n")
+ i += 1
+
+def one_email_to_edges(email_path):
+ node_set=set()
+ edge_list = []
+ mail = parseEml(email_path)
+ raw_node_list = mail.get_from_host_list()
+ if raw_node_list == None:
+ raw_node_list=[]
+ if mail.get_from() != None:
+ # print(mail.get_from())
+ raw_node_list.insert(0, mail.get_from())
+ # print(raw_node_list)
+ inter_node_list = []
+ sender_domain = None
+ for node in raw_node_list:
+ if '@' in node:
+ node = node.split('@')[-1]
+ if '>' in node:
+ node = node.split(">")[0]
+ if ')' in node:
+ node = node.split(")")[0]
+ if ',' in node:
+ node = node.replace(",", " ")
+ if '\n' in node:
+ node = node.replace("\n"," ")
+ sender_domain = node
+ # if "\n" in sender_domain:
+ # print(email_path)
+ node_set.add(node+",0")
+ else:
+ inter_domain_ip = set()
+ inter_nodes = re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node)
+ # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node)
+ if len(inter_nodes) != 0:
+ for inter_node in inter_nodes:
+ if len(re.findall(r'[-a-zA-Z]', inter_node)):
+ domain_sets = inter_node.split(".")
+ if len(domain_sets) > 2:
+ inter_node = ".".join(domain_sets[1:])
+ node_set.add(inter_node+",1")
+ if sender_domain:
+                            edge_list.append(sender_domain + "," + inter_node + ",0") # edge from the sender domain to an intermediate domain
+ inter_domain_ip.add((1,inter_node))
+ else:
+ inter_domain_ip.add((2,inter_node))
+ node_set.add(inter_node+",2")
+ if len(inter_domain_ip):
+ inter_node_list.append(inter_domain_ip)
+ # print(node_list)
+ # print(sender_domain)
+ # print(inter_node_list)
+ for domain_ip_set in inter_node_list:
+ if len(domain_ip_set) > 1:
+ domain_ip_list = list(domain_ip_set)
+ for i in range(0, len(domain_ip_list) - 1):
+ for j in range(i + 1, len(domain_ip_list)):
+ edge_list.append(domain_ip_list[i][1] + "," + domain_ip_list[j][1] + ",1")
+ # print(edge_list)
+
+ #message-id
+ message_id = mail.get_message_id()
+ if message_id != None:
+ message_id_domain = message_id.split('@')[-1]
+ message_id_domain = message_id_domain.split(">")[0]
+ if sender_domain != message_id_domain and sender_domain:
+ node_set.add(message_id_domain+",1")
+ edge_list.append(sender_domain + "," + message_id_domain + ",2")
+
+ #x-mailer
+ x_mailer = mail.get_x_mailer()
+ if x_mailer:
+ x_mailer = x_mailer.replace("\n", "")
+ x_mailer = x_mailer.replace(",", "")
+ if x_mailer != None and sender_domain:
+ node_set.add(x_mailer+",3")
+ edge_list.append(sender_domain + "," + x_mailer + ",3")
+
+ #dkim-domain
+ dkim_signature = mail.get_dkim()
+ if dkim_signature:
+ dkim_signature = dkim_signature.replace("\n\t", "")
+ dkim_domains = re.findall(r'd=(.+?);', dkim_signature)
+ if len(dkim_domains) == 0:
+ # dkim_domain=dkim_domains[0]
+ print(dkim_signature)
+ else:
+ dkim_domain = dkim_domains[0]
+ if sender_domain and sender_domain != dkim_domain:
+ node_set.add(dkim_domain+",1")
+ edge_list.append(sender_domain + "," + dkim_domain + ",4")
+ return node_set,edge_list
+
+def split_training_nodes(node_file,edge_file):
+ node_dataframe_all=pd.read_csv(node_file,encoding="utf-8")
+ edge_dataframe_all=pd.read_csv(edge_file,encoding="utf-8")
+ nodes_list=edge_dataframe_all["node1"].tolist()
+ nodes_list+=edge_dataframe_all["node2"].tolist()
+ nodes_set=set(nodes_list)
+ print(len(nodes_set))
+ training_nodes=node_dataframe_all[node_dataframe_all["index"].isin(nodes_list)]
+ # training_nodes.to_csv("training_nodes.csv",index=False)
+
+def add_testing_nodes(node_file1,node_file2,added_nodes_file):
+ nodes_set = set()
+ new_node_dict={}
+    # read the csv file row by row
+ with open(node_file1, 'r', encoding="utf-8") as nodefile1:
+ nodes = csv.DictReader(nodefile1)
+ for node in nodes:
+ training_node=node["name"] + "," + node["type"]
+ # if training_node in nodes_set:
+ # print(training_node)
+ nodes_set.add(training_node)
+ with open(node_file2, 'r', encoding="utf-8") as nodefile2:
+ nodes2 = csv.DictReader(nodefile2)
+ for node2 in nodes2:
+ test_node=node2["name"]+","+node2["type"]
+ if test_node in nodes_set:
+ continue
+ new_node_dict[len(nodes_set)]=test_node
+ nodes_set.add(test_node)
+ with open(added_nodes_file, 'w', encoding="utf-8") as f:
+ f.write("index,name,type\n")
+ for key in new_node_dict:
+ node = str(key) + ',' +new_node_dict[key]
+ f.write(node + "\n")
+
+if __name__ == "__main__":
+ # select_legi_emails("datacon_1_legitimate")
+ # extract_sender_and_received("datacon_1_fraud","datacon_fraud_graph/nodes.csv","datacon_fraud_graph/edges.csv")
+ # add_message_id_edge("datacon_1_fraud","datacon_fraud_graph/edges_other.csv")
+ # add_x_mailer_edge("datacon_1_fraud","datacon_fraud_graph/edges_other.csv")
+ # add_dkim_edge("datacon_1_fraud","datacon_fraud_graph/edges_other.csv")
+ # add_nodes("datacon_fraud_graph/nodes.csv","datacon_fraud_graph/edges_other.csv","datacon_fraud_graph/nodes_all.csv")
+ # merge_node("datacon_legitimate_graph/nodes_all.csv","all_nodes.csv","all_nodes1.csv")
+ # nodes_to_index("all_nodes.csv","legi_edges_testing.csv","legi_edges_testing_index_only.csv")
+ # nodes_to_index_mes_id("all_nodes.csv","datacon_legitimate_graph/edges_other.csv","datacon_legitimate_graph/edges_index_only.csv")
+ # plot_graph("all_nodes.csv","fraud_edges_index_only.csv","datacon_legitimate_graph/legi_edges_index_only.csv")
+ # one_email_to_graph("nazario_phishing_2020/2.eml","all_nodes.json","all_edges.csv")
+ email_batch_to_graph("benign_emails","all_nodes1.csv","benign_edges.csv")
+ # split_training_nodes("all_nodes.csv","edges_training_index_only.csv")
+    # add_testing_nodes("training_nodes1.csv","testing_nodes.csv","indexed_testing_nodes.csv")
\ No newline at end of file
diff --git a/code/buildGraph2.py b/code/buildGraph2.py
new file mode 100644
index 0000000..b994ec8
--- /dev/null
+++ b/code/buildGraph2.py
@@ -0,0 +1,106 @@
+import re
+from parseEml import parseEml
+import csv,os
+
+def one_email_to_edges(email_path,email_num):
+ node_set=set()
+ edge_list = []
+ mail = parseEml(email_path)
+ raw_node_list = mail.get_from_host_list()
+ if raw_node_list == None:
+ raw_node_list=[]
+ raw_node_list.append("email"+str(email_num))
+ email_node="email"+str(email_num)
+ node_set.add(email_node+",0")
+ other_node_set=set()
+ inter_node_list = []
+ for node in raw_node_list:
+ inter_nodes = re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node)
+ # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node)
+ if len(inter_nodes) != 0:
+ for inter_node in inter_nodes:
+ if len(re.findall(r'[-a-zA-Z]', inter_node)):
+ domain_sets = inter_node.split(".")
+ if len(domain_sets) > 2:
+ inter_node = ".".join(domain_sets[1:])
+ node_set.add(inter_node+",1")
+ other_node_set.add(inter_node)
+ else:
+ node_set.add(inter_node+",2")
+ other_node_set.add(inter_node)
+
+ #message-id
+ message_id = mail.get_message_id()
+ if message_id != None:
+ message_id_domain = message_id.split('@')[-1]
+ message_id_domain = message_id_domain.split(">")[0]
+ node_set.add(message_id_domain+",1")
+ other_node_set.add(message_id_domain)
+
+ #x-mailer
+ x_mailer = mail.get_x_mailer()
+ if x_mailer:
+ x_mailer = x_mailer.replace("\n", "")
+ x_mailer = x_mailer.replace(",", "")
+ if x_mailer != None:
+ node_set.add(x_mailer+",3")
+ other_node_set.add(x_mailer)
+
+ #dkim-domain
+ dkim_signature = mail.get_dkim()
+ if dkim_signature:
+ dkim_signature = dkim_signature.replace("\n\t", "")
+ dkim_domains = re.findall(r'd=(.+?);', dkim_signature)
+ if len(dkim_domains) == 0:
+ # dkim_domain=dkim_domains[0]
+ print(dkim_signature)
+ else:
+ dkim_domain = dkim_domains[0]
+ node_set.add(dkim_domain+",1")
+ other_node_set.add(dkim_domain)
+ for other_node in other_node_set:
+ edge_list.append(email_node + "," + other_node)
+ return node_set,edge_list
+
+def email_batch_to_graph(email_folder,node_file,edge_file):
+ node_list = set()
+ with open(node_file, 'r', encoding="utf-8") as csvfile:
+ nodes = csv.DictReader(csvfile)
+ for node in nodes:
+ node_list.add(node["name"]+","+node["type"])
+ files = os.listdir(email_folder)
+ email_num=2010
+    for file in files: # iterate over the email folder
+ if file == "duplicate":
+ continue
+ nodes, edges = one_email_to_edges(email_folder + "/" + file,email_num)
+ node_list.update(nodes)
+ with open(edge_file, 'a+', encoding="utf-8") as edge_f:
+ for edge in edges:
+ edge_f.write(edge + "\n")
+ email_num+=1
+ with open(node_file, 'w', encoding="utf-8") as f:
+ f.write("index,name,type\n")
+ i = 0
+ for node in node_list:
+ node = str(i) + ',' +node
+ f.write(node + "\n")
+ i += 1
+
+import pandas as pd
+def nodes_to_index(node_file,edge_file,new_edge_file):
+ nodes=pd.read_csv(node_file,encoding='utf-8')
+ edge_list=[]
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges = csv.DictReader(edgefile)
+ for edge in edges:
+ node1_index=nodes[(nodes['name']==edge['node1'])].index.tolist()[0]
+ node2_index = nodes[(nodes['name'] == edge['node2'])].index.tolist()[0]
+ edge_list.append(str(node1_index)+","+str(node2_index))
+ with open(new_edge_file, 'a+', encoding="utf-8") as f:
+ for new_edge in edge_list:
+ f.write(new_edge + "\n")
+
+if __name__=="__main__":
+ # email_batch_to_graph("datacon_1_legitimate","hunter_node.csv","hunter_edge.csv")
+    nodes_to_index("hunter_node.csv","hunter_edge.csv","hunter_edge_index_only.csv")
\ No newline at end of file
diff --git a/code/buildGraphviz.py b/code/buildGraphviz.py
new file mode 100644
index 0000000..cf43159
--- /dev/null
+++ b/code/buildGraphviz.py
@@ -0,0 +1,24 @@
+from graphviz import Digraph
+
+# Instantiate a Digraph object (directed graph); name: name of the generated image, format: image format
+dot = Digraph(name="MyPicture", comment="the test", format="png")
+
+# Create graph nodes; name: identifier of the node object, label: displayed node name, color: outline color of the node
+dot.node(name='a', label='Ming', color='green')
+dot.node(name='b', label='Hong', color='yellow')
+dot.node(name='c', label='Dong')
+
+# Draw an edge between nodes; label: text shown on the edge, color: edge color
+dot.edge('a', 'b', label="ab\na-b", color='red')
+# Draw several edges at once: c to b, and a to c
+dot.edges(['cb', 'ac'])
+
+# Print the generated DOT source code
+print(dot.source)
+
+# Render and open the graph; filename: name of the output file (defaults to the Digraph's name, with a .gv suffix)
+# directory: where to save the image, defaults to the current directory
+dot.view(filename="mypicture")
+
+# render() works like view() (use one or the other); render is usually used to generate the image, while view=True is for debugging
+# dot.render(filename='MyPicture')
\ No newline at end of file
diff --git a/code/buildSubGraph.py b/code/buildSubGraph.py
new file mode 100644
index 0000000..e68cc82
--- /dev/null
+++ b/code/buildSubGraph.py
@@ -0,0 +1,72 @@
+from buildGraph import one_email_to_edges,is_ip
+import csv
+import os
+import pandas as pd
+
+def nodes_to_index(node_file,edge_file,new_edge_file):
+ print(str(new_edge_file))
+ nodes=pd.read_csv(node_file,encoding='utf-8')
+ edge_list=[]
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges = csv.DictReader(edgefile)
+ for edge in edges:
+ if edge['type']=='0':
+ print("hi:"+edge['node1'])
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==0)].index.tolist()[0]
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0]
+            elif edge['type']=='1':# note: distinguish domains from IPs
+ if is_ip(edge['node1']):
+ print(edge['node1'])
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==2)].index.tolist()[0]
+ else:
+ print(edge["node1"])
+ node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==1)].index.tolist()[0]
+ if is_ip(edge['node2']):
+ print(edge["node2"])
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==2)].index.tolist()[0]
+ else:
+ node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0]
+ elif edge['type']=='2' or edge['type'] == '4':
+ node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0]
+ node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 1)].index.tolist()[0]
+ elif edge['type']=='3':
+ node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0]
+ node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 3)].index.tolist()[0]
+ edge_list.append(str(node1_index)+"\t"+str(node2_index))
+ with open(new_edge_file, 'w', encoding="utf-8") as f:
+ for new_edge in edge_list:
+ f.writelines(new_edge + "\n")
+
+def email_batch_to_subgraph(email_folder,node_file,graph_folder):
+ # node_dict = {}
+ # with open(node_file, 'r', encoding="utf-8") as csvfile:
+ # nodes = csv.DictReader(csvfile)
+ # for node in nodes:
+ # node_dict[node["name"]+","+node["type"]]=node["index"]
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+ if file == "duplicate":
+ continue
+ nodes, edges = one_email_to_edges(email_folder + "/" + file)
+ with open(graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')+".csv", 'w', encoding="utf-8") as edge_f:
+ edge_f.write("node1,node2,type\n")
+ for edge in edges:
+ edge_f.write(edge + "\n")
+ nodes_to_index(node_file,graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')+".csv",graph_folder+"/"+email_folder+"_"+file.replace(".eml",''))
+ os.remove(graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')+".csv")
+
+def find_differ_inter_domain(email_folder,inter_domain_file):
+    files = os.listdir(email_folder)
+    for file in files: # iterate over the email folder
+        if file == "duplicate":
+            continue
+        nodes, edges = one_email_to_edges(email_folder + "/" + file)
+        for edge in edges:
+            edge_part=edge.split(",")
+            # edge types 0, 2 and 4 all point from the sender domain to another domain
+            if edge_part[2] in ("0", "2", "4"):
+                if edge_part[0]!=edge_part[1]:
+                    with open(inter_domain_file,'a+',encoding="utf-8") as f:
+                        f.write(edge_part[1]+"\n")
+
+if __name__=="__main__":
+    find_differ_inter_domain("nazario_phishing_2021","inter_domain.txt")
\ No newline at end of file
diff --git a/code/doStatistics.py b/code/doStatistics.py
new file mode 100644
index 0000000..ef455f5
--- /dev/null
+++ b/code/doStatistics.py
@@ -0,0 +1,25 @@
+import csv
+import pandas as pd
+
+def edge_count(edge_file):
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ reader=csv.reader(edgefile)
+ edges_fraud=[" ".join(row) for row in reader]
+ edge_count_fraud=pd.value_counts(edges_fraud).to_dict()
+
+def count_multi_set_nodes(node_file1,node_file2):
+    # merge two node files and unify the indices
+    nodes_set = set()
+    # read the csv file row by row
+ with open(node_file1, 'r', encoding="utf-8") as csvfile:
+ nodes = csv.DictReader(csvfile)
+ for node in nodes:
+ nodes_set.add(node["name"] + "," + node["type"])
+ with open(node_file2, 'r', encoding="utf-8") as nodefile2:
+ nodes2 = csv.DictReader(nodefile2)
+ for node2 in nodes2:
+ nodes_set.add(node2["name"] + "," + node2["type"])
+ print(len(nodes_set))
+
+if __name__ == "__main__":
+    count_multi_set_nodes("datacon_fraud_graph/nodes_all.csv","nazario_2021_graph/nodes_all.csv")
\ No newline at end of file
diff --git a/code/drawGraph.py b/code/drawGraph.py
new file mode 100644
index 0000000..5650145
--- /dev/null
+++ b/code/drawGraph.py
@@ -0,0 +1,81 @@
+import networkx as nx
+import csv
+import pandas as pd
+import dgl
+
+def read_graph_networkx(node_file,edge_file):
+ G = nx.Graph()
+ node_list=[]
+ edge_list=[]
+ with open(node_file, 'r', encoding="utf-8") as csvfile:
+ nodes = csv.DictReader(csvfile)
+ for node in nodes:
+ node_list.append((node["index"],{"label":node["name"]}))
+ G.add_nodes_from(node_list)
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ reader1 = csv.reader(edgefile)
+ edges = [" ".join(row) for row in reader1]
+ edge_count = pd.value_counts(edges).to_dict()
+ for key in edge_count:
+ edge_param=key.split(" ")
+ edge_list.append((edge_param[0],edge_param[1]))
+ G.add_edges_from(edge_list)
+ print(G)
+
+ # nx.draw(G,font_size=20)
+ # plt.show()
+
+ subgraphs=nx.connected_components(G)
+ for c in sorted(subgraphs,key=len,reverse=True):
+ if len(c)<=10:
+ break
+ print(G.subgraph(c))
+
+# nx.draw(largest_connected_subgraph,with_labels=True)
+
+def read_graph_dgl(node_file,edge_file_fraud,edge_file_legi):
+ graph_dict={}
+ node_dict={}
+ with open(node_file, 'r', encoding="utf-8") as nodefile:
+ nodes = csv.DictReader(nodefile)
+ for node in nodes:
+ if node["type"] == '0':
+ node_dict[node["index"]]="sender_domain"
+ elif node["type"] == '1':
+ node_dict[node["index"]]="inter_domain"
+ elif node["type"] == '2':
+ node_dict[node["index"]]="IP"
+ else:
+ node_dict[node["index"]]="client"
+
+ with open(edge_file_fraud, 'r', encoding="utf-8") as edgefile:
+ reader1 = csv.reader(edgefile)
+ edges = [" ".join(row) for row in reader1]
+ edge_count = pd.value_counts(edges).to_dict()
+ for key in edge_count:
+ edge_param=key.split(" ")
+            if (node_dict[edge_param[0]], "fraud", node_dict[edge_param[1]]) in graph_dict:
+                graph_dict[(node_dict[edge_param[0]], "fraud", node_dict[edge_param[1]])].append((edge_param[0],edge_param[1]))
+            else:
+                graph_dict[(node_dict[edge_param[0]], "fraud", node_dict[edge_param[1]])]=[(edge_param[0],edge_param[1])]
+    # Each value of the dict is a list of (src, dst) tuples.
+    # Nodes are zero-based integer IDs; node IDs of different types are counted separately.
+ ratings = dgl.heterograph(graph_dict)
+
+import torch as th
+def test():
+    # Toy graph over three nodes: "qq.com" -> 0, "113.108.11.234" -> 1, "127.0.0.1" -> 2.
+    # DGL expects integer node IDs, so map the domain/IP names to indices first.
+    name_to_id = {"qq.com": 0, "113.108.11.234": 1, "127.0.0.1": 2}
+    u = th.tensor([name_to_id["qq.com"], name_to_id["113.108.11.234"], name_to_id["qq.com"]])
+    v = th.tensor([name_to_id["qq.com"], name_to_id["qq.com"], name_to_id["127.0.0.1"]])
+    g = dgl.graph((u, v))
+    print(g) # DGL infers the number of nodes from the largest node ID in the edge list
+    # node IDs
+    print(g.nodes())
+    # endpoints of each edge
+    print(g.edges())
+    # endpoints of each edge plus the edge IDs
+    print(g.edges(form='all'))
+
+if __name__=="__main__":
+ # read_graph_dgl("all_nodes.csv","fraud_edges_index_only.csv","")
+ # read_graph_networkx("all_nodes.csv","all_edges_index_only.csv")
+    test()
\ No newline at end of file
diff --git a/code/edgeClassification.py b/code/edgeClassification.py
new file mode 100644
index 0000000..abc7af3
--- /dev/null
+++ b/code/edgeClassification.py
@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+"""edgeClassification.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+ https://colab.research.google.com/drive/1knU85gMIEeId8DL4gw9VMij_niMggRn6
+"""
+
+import dgl
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import dgl.function as fn
+import dgl.nn.pytorch as dglnn
+import time
+import argparse
+from dgl.data import *
+import tqdm
+from sklearn import metrics
+
+import warnings
+warnings.filterwarnings('ignore')
+
+src = np.random.randint(0, 100, 500)
+dst = np.random.randint(0, 100, 500)
+# add the reverse edges at the same time
+edge_pred_graph = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src])))
+# set up node and edge features, plus the edge labels
+edge_pred_graph.ndata['feature'] = th.randn(100, 10)
+edge_pred_graph.edata['feature'] = th.randn(1000, 10)
+edge_pred_graph.edata['label'] = th.randn(1000)
+# train/validation/test split (here only a training mask is drawn)
+edge_pred_graph.edata['train_mask'] = th.zeros(1000, dtype=th.bool).bernoulli(0.6)
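+# Edges whose train_mask is False (roughly 40% of them) are simply left out of the toy regression loss below.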
+
+import dgl.function as fn
+class DotProductPredictor(nn.Module):
+ def forward(self, graph, h):
+        # h is the node representation computed upstream
+ with graph.local_scope():
+ graph.ndata['h'] = h
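+            # fn.u_dot_v('h', 'h', 'score') is a DGL built-in message function: for each edge it takes the
+            # dot product of the source and destination 'h' features and stores it as the edge feature 'score'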
+ graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
+ return graph.edata['score']
+
+class TwoLayerGCN(nn.Module):
+ def __init__(self, in_dim, hid_dim, out_dim):
+        """Two-layer GCN model"""
+ super().__init__()
+ self.conv1 = dglnn.GraphConv(in_dim, hid_dim, allow_zero_in_degree=True)
+ self.conv2 = dglnn.GraphConv(hid_dim, out_dim, allow_zero_in_degree=True)
+
+ def forward(self, blocks, x):
+ x = F.relu(self.conv1(blocks[0], x))
+ x = F.relu(self.conv2(blocks[1], x))
+ return x
+
+class SAGE(nn.Module):
+ def __init__(self, in_feats, hid_feats, out_feats):
+ super().__init__()
+        # instantiate SAGEConv: in_feats is the input feature dimension, out_feats the output feature dimension, aggregator_type the aggregation function type
+ self.conv1 = dglnn.SAGEConv(
+ in_feats=in_feats, out_feats=hid_feats, aggregator_type='mean')
+ self.conv2 = dglnn.SAGEConv(
+ in_feats=hid_feats, out_feats=out_feats, aggregator_type='mean')
+
+ def forward(self, graph, inputs):
+        # the inputs are the node features
+ h = self.conv1(graph, inputs)
+ h = F.relu(h)
+ h = self.conv2(graph, h)
+ return h
+
+class Model(nn.Module):
+ def __init__(self, in_features, hidden_features, out_features):
+ super().__init__()
+ self.sage=SAGE(in_features,hidden_features,out_features)
+ self.pred = DotProductPredictor()
+ def forward(self, g, x):
+ h = self.sage(g,x)
+ return self.pred(g, h)
+
+node_features = edge_pred_graph.ndata['feature']
+edge_label = edge_pred_graph.edata['label']
+train_mask = edge_pred_graph.edata['train_mask']
+model = Model(10, 20, 5)
+opt = th.optim.Adam(model.parameters())
+for epoch in range(10):
+ pred = model(edge_pred_graph, node_features)
+ loss = ((pred[train_mask] - edge_label[train_mask]) ** 2).mean()
+ opt.zero_grad()
+ loss.backward()
+ opt.step()
+ print(loss.item())
+
+argparser = argparse.ArgumentParser("edge classification training")
+argparser.add_argument('--num-epochs', type=int, default=30)
+argparser.add_argument('--num-hidden', type=int, default=64)
+argparser.add_argument('--num-layers', type=int, default=2)
+argparser.add_argument('--fan-out', type=str, default='10,25') ## sample 10 neighbours in the first layer, 25 in the second
+argparser.add_argument('--batch-size', type=int, default=2048)
+argparser.add_argument('--lr', type=float, default=0.005)
+argparser.add_argument('--dropout', type=float, default=0.5)
+argparser.add_argument('--num-workers', type=int, default=0,
+ help="Number of sampling processes. Use 0 for no extra process.")
+args = argparser.parse_args([])
+
+# load wordnet data
+data = WN18Dataset()
+n_classes = data.num_rels ## number of relation types, i.e. the number of edge classes
+g = data[0]
+## train, validation and test edge masks
+train_edge_mask = g.edata.pop('train_edge_mask')
+val_edge_mask = g.edata.pop('valid_edge_mask')
+test_edge_mask = g.edata.pop('test_edge_mask')
+
+# Pack data
+data = train_edge_mask, val_edge_mask, test_edge_mask, n_classes, g
+print('\n', g)
+n_edges = g.num_edges() # number of edges in the graph
+labels = g.edata['etype'] # labels of all edges in the graph
+
+
+train_edge_mask, val_edge_mask, test_edge_mask, n_classes, g = data
+print('\n', train_edge_mask.sum(), val_edge_mask.sum(), test_edge_mask.sum())
+
+## edge-id lists for train, valid and test
+train_eid = th.LongTensor(np.nonzero(train_edge_mask)).squeeze()
+val_eid = th.LongTensor(np.nonzero(val_edge_mask)).squeeze()
+test_eid = th.LongTensor(np.nonzero(test_edge_mask)).squeeze()
+print(train_eid.shape, val_eid.shape, test_eid.shape)
+
+# Create sampler
+sampler = dgl.dataloading.MultiLayerNeighborSampler(
+ [int(fanout) for fanout in args.fan_out.split(',')])
+dataloader = dgl.dataloading.EdgeDataLoader(
+ g, train_eid, sampler,
+    exclude='reverse_id', # exclude reverse edges; otherwise the model can see that the edge exists and "cheat"
+ # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
+ reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
+ batch_size=args.batch_size,
+ shuffle=True,
+ drop_last=False,
+ num_workers=args.num_workers)
+
+## For debug
+for dd in dataloader:
+ """分别是input_nodes, edge_subgraph,blocks"""
+ for xx in dd:
+ print(xx)
+ print()
+ break
+
+class Predictor(nn.Module):
+ """边预测模块,将边两端节点表示拼接,然后接一个线性变换,得到最后的分类表示输出"""
+
+ def __init__(self, in_dim, num_classes):
+ super().__init__()
+ self.W = nn.Linear(2 * in_dim, num_classes)
+
+ def apply_edges(self, edges):
+ data = th.cat([edges.src['x'], edges.dst['x']], dim=-1)
+ return {'score': self.W(data)}
+
+ def forward(self, edge_subgraph, x):
+ with edge_subgraph.local_scope():
+ edge_subgraph.ndata['x'] = x
+ edge_subgraph.apply_edges(self.apply_edges)
+ return edge_subgraph.edata['score']
+
+
+class MyModel(nn.Module):
+ """主模型:结构比较清晰"""
+
+ def __init__(self, emb_dim, hid_dim, out_dim, num_classes, num_nodes):
+ super().__init__()
+ self.node_emb = nn.Embedding(num_nodes, emb_dim)
+ self.gcn = TwoLayerGCN(emb_dim, hid_dim, out_dim)
+ self.predictor = Predictor(out_dim, num_classes)
+
+ def forward(self, edge_subgraph, blocks, input_nodes):
+ x = self.node_emb(input_nodes)
+ x = self.gcn(blocks, x)
+ return self.predictor(edge_subgraph, x)
+
+# instantiate the model and define the prediction function
+device = th.device("cuda" if th.cuda.is_available() else "cpu")
+print("device: {}".format(device))
+
+model = MyModel(64, args.num_hidden, args.num_hidden, 18, g.num_nodes())
+model = model.to(device)
+loss_fcn = nn.CrossEntropyLoss() # cross-entropy loss
+optimizer = optim.Adam(model.parameters(), lr=args.lr)
+
+
+def predict(model, g, valid_eid, device):
+    # Create sampler (full neighbourhood sampling)
+ sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
+ dataloader = dgl.dataloading.EdgeDataLoader(
+ g, valid_eid, sampler, exclude='reverse_id',
+ # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
+ reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
+ batch_size=args.batch_size,
+ shuffle=False,
+ drop_last=False,
+ num_workers=args.num_workers)
+
+ valid_preds = []
+ model.eval()
+ with th.no_grad():
+ for input_nodes, edges_subgraph, blocks in dataloader:
+ edges_subgraph = edges_subgraph.to(device)
+ blocks = [block.int().to(device) for block in blocks]
+ pred = model(edges_subgraph, blocks, input_nodes)
+ pred = pred.cpu().argmax(-1).numpy().tolist()
+ valid_preds.extend(pred)
+ return valid_preds
+
+# train the model
+best_val_acc = 0 # best accuracy seen on the validation set
+patience = 0 # For early stopping
+
+# Training loop
+for epoch in range(args.num_epochs):
+ # Loop over the dataloader to sample the computation dependency graph as a list of
+ # blocks.
+ start_time = time.time()
+ all_loss = []
+ trn_label, trn_pred = [], []
+ n_batches = len(dataloader)
+
+ for step, (input_nodes, edges_subgraph, blocks) in enumerate(dataloader):
+ edges_subgraph = edges_subgraph.to(device)
+ blocks = [block.to(device) for block in blocks]
+
+ # Compute loss and prediction
+ edge_preds = model(edges_subgraph, blocks, input_nodes)
+ loss = loss_fcn(edge_preds, edges_subgraph.edata['etype']) # or labels[edges_subgraph.edata['_ID']]
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ all_loss.append(loss.item())
+ trn_label.extend(edges_subgraph.edata['etype'].cpu().numpy().tolist())
+ trn_pred.extend(edge_preds.argmax(-1).detach().cpu().numpy().tolist())
+
+ if (step + 1) % (n_batches // 10) == 0:
+ cur_acc = metrics.accuracy_score(trn_label, trn_pred)
+ print('Epoch {:2d} | Step {:04d}/{} | Loss {:.4f} | Avg Loss {:.4f} | Acc {:.4f} | Time {:.2f}s'.format(
+ epoch + 1, step, n_batches, loss.item(), np.mean(all_loss), cur_acc, time.time() - start_time))
+
+    ## predict on the validation set
+ val_preds = predict(model, g, val_eid, device)
+ val_acc = metrics.accuracy_score(labels[val_eid], val_preds)
+
+ if val_acc > best_val_acc:
+ best_val_acc = val_acc
+ patience = 0
+ th.save(model.state_dict(), "edge_cls_best_model.bin")
+ else:
+ patience += 1
+ print('Cur Val Acc {:.4f}, Best Val Acc {:.4f}, Time {:.2f}s'.format(
+ val_acc, best_val_acc, time.time() - start_time))
+
+    ## early stopping: stop training once validation accuracy has not improved for more than three consecutive epochs
+    if patience > 3:
+        break
\ No newline at end of file
diff --git a/code/edgeClassificationEmailGraph.py b/code/edgeClassificationEmailGraph.py
new file mode 100644
index 0000000..d5b4c80
--- /dev/null
+++ b/code/edgeClassificationEmailGraph.py
@@ -0,0 +1,297 @@
+# -*- coding: utf-8 -*-
+"""edgeClassification.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+ https://colab.research.google.com/drive/1knU85gMIEeId8DL4gw9VMij_niMggRn6
+"""
+
+# from google.colab import drive
+# drive.mount('/content/drive')
+#
+# !pip install dgl==0.6.1
+
+import dgl
+import numpy as np
+import pandas as pd
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import dgl.function as fn
+import dgl.nn.pytorch as dglnn
+import time
+import argparse
+from dgl.data import *
+import tqdm
+from sklearn import metrics
+
+import warnings
+warnings.filterwarnings('ignore')
+
+argparser = argparse.ArgumentParser("edge classification training")
+argparser.add_argument('--num-epochs', type=int, default=30)
+argparser.add_argument('--num-hidden', type=int, default=64)
+argparser.add_argument('--num-layers', type=int, default=2)
+argparser.add_argument('--fan-out', type=str, default='10,25') ## sample 10 neighbours in the first layer, 25 in the second
+argparser.add_argument('--batch-size', type=int, default=2048)
+argparser.add_argument('--lr', type=float, default=0.005)
+argparser.add_argument('--dropout', type=float, default=0.5)
+argparser.add_argument('--num-workers', type=int, default=0,
+ help="Number of sampling processes. Use 0 for no extra process.")
+args = argparser.parse_args([])
+
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+
+nodes=pd.read_csv("training_nodes1.csv")
+node_names=np.array(nodes)[:,1]
+print(len(node_names))
+# Tokenization of each name
+tokenized_node_names = []
+for node_name in node_names:
+ tokenized_node_names.append(word_tokenize(node_name.lower()))
+print(tokenized_node_names)
+
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_node_names)]
+model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)
+'''
+vector_size = Dimensionality of the feature vectors.
+window = The maximum distance between the current and predicted word within a sentence.
+min_count = Ignores all words with total frequency lower than this.
+alpha = The initial learning rate.
+'''
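+# Each tokenized node name is then embedded into a 20-dimensional vector with infer_vector and used as that node's text feature.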
+
+node_text_feature_list=[]
+for text in tokenized_node_names:
+ node_text_feature_list.append(model.infer_vector(text))
+print(node_text_feature_list)
+
+# normalise the node type first, then append it to the node features
+node_types=np.array(nodes)[:,2]
+type_feature_list=[]
+for type_value in node_types:
+ type_value=int(type_value)
+    type_value=type_value/3 #min-max normalisation (node types range from 0 to 3)
+ type_feature_list.append(type_value)
+print(type_feature_list)
+print(len(type_feature_list))
+
+for i in range(0,len(node_text_feature_list)):
+ node_text_feature_list[i]=np.append(node_text_feature_list[i],type_feature_list[i])
+node_feature_array=node_text_feature_list[0]
+for i in range(1,len(node_text_feature_list)):
+ node_feature_array=np.vstack((node_feature_array,node_text_feature_list[i]))
+print(node_feature_array)
+
+edge_classes = 2 ## number of edge classes (fraud vs. legitimate)
+fraud_edges=pd.read_csv("fraud_edges_training_index_only.csv")
+src=np.array(fraud_edges)[:,0]
+dst=np.array(fraud_edges)[:,1]
+legi_edges=pd.read_csv("legi_edges_training_index_only.csv")
+src=np.concatenate([src,np.array(legi_edges)[:,0]])
+dst=np.concatenate([dst,np.array(legi_edges)[:,1]])
+print(len(src))
+print(len(dst))
+# add the reverse edges at the same time
+email_graph = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src])))
+# set up node and edge features, plus the edge labels
+email_graph.ndata['feature'] = th.tensor(node_feature_array)
+print(email_graph.ndata['feature'])
+fraud_type_array=np.array(fraud_edges)[:,2]
+fraud_label_array=np.ones((len(fraud_type_array)))#fraud = 1
+legi_type_array=np.array(legi_edges)[:,2]
+type_array=np.concatenate([fraud_type_array,legi_type_array])
+legi_label_array=np.zeros((len(legi_type_array)))#legitimate = 0
+label_array=np.concatenate([fraud_label_array,legi_label_array])
+#also assign a type and label to the reverse edges
+type_array=np.concatenate([type_array,type_array])
+label_array=np.concatenate([label_array,label_array])
+print(len(label_array))
+edge_feature_array=[]
+#normalise the edge type (types range from 0 to 4)
+for edge_type in type_array:
+ edge_type=edge_type/4
+ edge_feature_array.append(edge_type)
+# print(len(edge_feature_array))
+email_graph.edata['feature'] = th.tensor(edge_feature_array)
+print(email_graph.edata['feature'])
+email_graph.edata['label'] = th.LongTensor(label_array)
+print(email_graph.edata['label'])
+print(email_graph)
+
+# random train/validation/test split, 6:2:2
+train_len=int(email_graph.number_of_edges() * 0.6)
+val_len=int(email_graph.number_of_edges() * 0.2)
+test_len=email_graph.number_of_edges()-train_len-val_len
+pre_train=np.zeros((train_len))
+pre_val=np.ones((val_len))
+pre_test=np.linspace(2,100,test_len)
+pre_split=np.concatenate([np.concatenate([pre_train,pre_val]),pre_test])
+print(pre_split)
+np.random.shuffle(pre_split)
+print(pre_split)
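+# After shuffling, a 0 marks a training edge, a 1 a validation edge, and any value >= 2 a test edge;
+# the loop below turns these markers back into edge-id lists.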
+train_id_list,val_id_list,test_id_list=[],[],[]
+for i in range(0,len(pre_split)):
+ if pre_split[i]==0:
+ train_id_list.append(i)
+ elif pre_split[i]==1:
+ val_id_list.append(i)
+ else:
+ test_id_list.append(i)
+train_id,val_id,test_id=th.tensor(train_id_list),th.tensor(val_id_list),th.tensor(test_id_list)
+print(train_id.shape,val_id.shape,test_id.shape)
+
+#neighbour sampling, then mini-batch training
+n_edges=email_graph.num_edges()
+# Create sampler
+sampler = dgl.dataloading.MultiLayerNeighborSampler(
+ [int(fanout) for fanout in args.fan_out.split(',')])
+dataloader = dgl.dataloading.EdgeDataLoader(
+ email_graph, train_id, sampler,
+    exclude='reverse_id', # exclude reverse edges; otherwise the model can see that the edge exists and "cheat"
+ # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
+ reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
+ batch_size=args.batch_size,
+ shuffle=True,
+ drop_last=False,
+ num_workers=args.num_workers)
+
+## For debug
+for dd in dataloader:
+ """分别是input_nodes, edge_subgraph,blocks"""
+ for xx in dd:
+ print(xx)
+ print()
+ break
+
+class TwoLayerGCN(nn.Module):
+ def __init__(self, in_dim, hid_dim, out_dim):
+        """Two-layer GCN model"""
+ super().__init__()
+ self.conv1 = dglnn.GraphConv(in_dim, hid_dim, allow_zero_in_degree=True)
+ self.conv2 = dglnn.GraphConv(hid_dim, out_dim, allow_zero_in_degree=True)
+
+ def forward(self, blocks, x):
+ x = F.relu(self.conv1(blocks[0], x))
+ x = F.relu(self.conv2(blocks[1], x))
+ return x
+
+class Predictor(nn.Module):
+ """边预测模块,将边两端节点表示拼接,然后接一个线性变换,得到最后的分类表示输出"""
+ def __init__(self, in_dim, num_classes):
+ super().__init__()
+ self.W = nn.Linear(2 * in_dim, num_classes)
+
+ def apply_edges(self, edges):
+ data = th.cat([edges.src['x'], edges.dst['x']], dim=-1)
+ return {'score': self.W(data)}
+
+ def forward(self, edge_subgraph, x):
+ with edge_subgraph.local_scope():
+ edge_subgraph.ndata['x'] = x
+ edge_subgraph.apply_edges(self.apply_edges)
+ return edge_subgraph.edata['score']
+
+class MyModel(nn.Module):
+ """主模型:结构比较清晰"""
+ def __init__(self, emb_dim, hid_dim, out_dim, num_classes, num_nodes):
+ super().__init__()
+ self.node_emb = nn.Embedding(num_nodes, emb_dim)
+ self.gcn = TwoLayerGCN(emb_dim, hid_dim, out_dim)
+ self.predictor = Predictor(out_dim, num_classes)
+
+ def forward(self, edge_subgraph, blocks, input_nodes):
+ x = self.node_emb(input_nodes)
+ x = self.gcn(blocks, x)
+ return self.predictor(edge_subgraph, x)
+
+device = th.device("cuda" if th.cuda.is_available() else "cpu")
+print("device: {}".format(device))
+
+model = MyModel(64, args.num_hidden, args.num_hidden, edge_classes, email_graph.num_nodes())
+model = model.to(device)
+loss_fcn = nn.CrossEntropyLoss() # cross-entropy loss
+optimizer = optim.Adam(model.parameters(), lr=args.lr)
+
+
+def predict(model, email_graph, val_id, device):
+    # Create sampler (full neighbourhood sampling)
+ sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
+ dataloader = dgl.dataloading.EdgeDataLoader(
+ email_graph, val_id, sampler, exclude='reverse_id',
+ # For each edge with ID e in dataset, the reverse edge is e ± |E|/2.
+ reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]),
+ batch_size=args.batch_size,
+ shuffle=False,
+ drop_last=False,
+ num_workers=args.num_workers)
+
+ valid_preds = []
+ model.eval()
+ with th.no_grad():
+ for input_nodes, edges_subgraph, blocks in dataloader:
+ edges_subgraph = edges_subgraph.to(device)
+ blocks = [block.int().to(device) for block in blocks]
+ pred = model(edges_subgraph, blocks, input_nodes)
+ pred = pred.cpu().argmax(-1).numpy().tolist()
+ valid_preds.extend(pred)
+ return valid_preds
+
+labels = email_graph.edata['label'] # labels of all edges in the graph
+best_val_acc = 0 # best accuracy seen on the validation set
+patience = 0 # For early stopping
+
+# Training loop
+for epoch in range(args.num_epochs):
+ # Loop over the dataloader to sample the computation dependency graph as a list of
+ # blocks.
+ start_time = time.time()
+ all_loss = []
+ trn_label, trn_pred = [], []
+ n_batches = len(dataloader)
+
+ for step, (input_nodes, edges_subgraph, blocks) in enumerate(dataloader):
+ edges_subgraph = edges_subgraph.to(device)
+ blocks = [block.to(device) for block in blocks]
+
+ # Compute loss and prediction
+ edge_preds = model(edges_subgraph, blocks, input_nodes)
+ loss = loss_fcn(edge_preds, edges_subgraph.edata['label'])
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ all_loss.append(loss.item())
+ trn_label.extend(edges_subgraph.edata['label'].cpu().numpy().tolist())
+ trn_pred.extend(edge_preds.argmax(-1).detach().cpu().numpy().tolist())
+
+ if (step+1) % (n_batches//10) == 0:
+ cur_acc = metrics.accuracy_score(trn_label, trn_pred)
+ print('Epoch {:2d} | Step {:04d}/{} | Loss {:.4f} | Avg Loss {:.4f} | Acc {:.4f} | Time {:.2f}s'.format(
+ epoch+1, step, n_batches, loss.item(), np.mean(all_loss), cur_acc, time.time() - start_time))
+
+    ## predict on the validation set
+ val_preds = predict(model, email_graph, val_id, device)
+ val_acc = metrics.accuracy_score(labels[val_id], val_preds)
+
+ if val_acc > best_val_acc:
+ best_val_acc = val_acc
+ patience = 0
+ th.save(model.state_dict(), "edge_cls_best_model.bin")
+ else:
+ patience += 1
+ print('Cur Val Acc {:.4f}, Best Val Acc {:.4f}, Time {:.2f}s'.format(
+ val_acc, best_val_acc, time.time() - start_time))
+
+    ## early stopping: stop training once validation accuracy has not improved for more than three consecutive epochs
+ if patience > 3:
+ break
+
+model.load_state_dict(th.load("edge_cls_best_model.bin"))
+
+test_preds = predict(model, email_graph, test_id, device)
+print("test acc: {:.4f}".format(metrics.accuracy_score(labels[test_id], test_preds))) \ No newline at end of file
diff --git a/code/emailPreparation.py b/code/emailPreparation.py
new file mode 100644
index 0000000..82f308d
--- /dev/null
+++ b/code/emailPreparation.py
@@ -0,0 +1,57 @@
+from shutil import copyfile
+from sys import exit
+import os
+import sys
+import re
+
+def split_datacon_eml():
+ f = open("C:/Users/gisel/Desktop/datacon 2021/datacon_coremail/1_data/answer.txt") # 返回一个文件对象
+ line = f.readline() # 调用文件的 readline()方法
+ fraud_num_list=[]
+ while line:
+ fraud_num_list.append(line.strip())
+ # print(line, end = '')# 在 Python 3 中使用
+ line = f.readline()
+
+ emails = os.listdir("C:/Users/gisel/Desktop/datacon 2021/datacon_coremail/1_data/data/")
+ for email in emails:
+ email_num=email.replace(".eml", "")
+ if email_num not in fraud_num_list:
+ continue
+ source="C:/Users/gisel/Desktop/datacon 2021/datacon_coremail/1_data/data/"+email_num+".eml"
+ target="C:/Users/gisel/Desktop/毕设/postPath/datacon_1_fraud/"+email_num+".eml"
+
+ try:
+ copyfile(source,target)
+ except IOError as e:
+ print("Unable to copy file. %s" % e)
+ exit(1)
+ except:
+ print("Unexpected error:", sys.exc_info())
+ exit(1)
+
+ print("\nFile copy done!\n")
+
+ f.close()
+
+def nazario_file_to_eml(text_file,email_folder):
+    f = open(text_file,'r',encoding='UTF-8',errors='ignore') # returns a file object
+    line = f.readline() # read the first line
+ num = 0
+ while line:
+ line_list = []
+ while re.match("^From ", line) is None and line:
+ line_list.append(line)
+ # print(line)
+ line = f.readline()
+ # print(line_list)
+ with open(email_folder + "/" + str(num) + ".eml", 'a+',encoding='utf-8') as f_new:
+ for i in line_list:
+ f_new.write(i)
+ num += 1
+ line = f.readline()
+
+ f.close()
+
+if __name__ == "__main__":
+ nazario_file_to_eml("nazario_phishing_2015.txt","nazario_phishing_2015") \ No newline at end of file
diff --git a/code/graphsage.py b/code/graphsage.py
new file mode 100644
index 0000000..1ba5d5a
--- /dev/null
+++ b/code/graphsage.py
@@ -0,0 +1,390 @@
+# -*- coding: utf-8 -*-
+"""GraphSAGE.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+ https://colab.research.google.com/drive/1oLF3FCe1CYDohND_VJ6unSXqLRaJ5mi3
+"""
+
+# from google.colab import drive
+# drive.mount('/content/drive')
+
+# Sampling
+import numpy as np
+
+
+def sampling(src_nodes, sample_num, neighbor_table):
+ """根据源节点采样指定数量的邻居节点,注意使用的是有放回的采样;
+ 某个节点的邻居节点数量少于采样数量时,采样结果出现重复的节点
+
+ Arguments:
+ src_nodes {list, ndarray} -- 源节点列表
+ sample_num {int} -- 需要采样的节点数
+ neighbor_table {dict} -- 节点到其邻居节点的映射表
+
+ Returns:
+ np.ndarray -- 采样结果构成的列表
+ """
+ results = []
+ for sid in src_nodes:
+        # sample with replacement from the node's neighbors
+ res = np.random.choice(neighbor_table[sid], size=(sample_num, ))
+ results.append(res)
+ return np.asarray(results).flatten()
+
+def multihop_sampling(src_nodes, sample_nums, neighbor_table):
+ """根据源节点进行多阶采样
+
+ Arguments:
+ src_nodes {list, np.ndarray} -- 源节点id
+ sample_nums {list of int} -- 每一阶需要采样的个数
+ neighbor_table {dict} -- 节点到其邻居节点的映射
+
+ Returns:
+ [list of ndarray] -- 每一阶采样的结果
+ """
+ sampling_result = [src_nodes]
+ for k, hopk_num in enumerate(sample_nums):
+ hopk_result = sampling(sampling_result[k], hopk_num, neighbor_table)
+ sampling_result.append(hopk_result)
+ return sampling_result
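+
+# A hedged usage sketch (toy neighbor table assumed, not part of the original pipeline):
+# each hop multiplies the number of sampled ids by the corresponding sample_num.
+def _multihop_sampling_demo():
+    toy_table = {0: [1, 2], 1: [0, 2], 2: [0, 1]}
+    hops = multihop_sampling(np.array([0]), [2, 2], toy_table)
+    assert [len(h) for h in hops] == [1, 2, 4]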
+
+# Aggregation
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.init as init
+
+class NeighborAggregator(nn.Module):
+ def __init__(self, input_dim, output_dim,
+ use_bias=False, aggr_method="mean"):
+ """聚合节点邻居
+ Args:
+ input_dim: 输入特征的维度
+ output_dim: 输出特征的维度
+ use_bias: 是否使用偏置 (default: {False})
+ aggr_method: 邻居聚合方式 (default: {mean})
+ """
+ super(NeighborAggregator, self).__init__()
+ self.input_dim = input_dim
+ self.output_dim = output_dim
+ self.use_bias = use_bias
+ self.aggr_method = aggr_method
+ self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim))
+ if self.use_bias:
+ self.bias = nn.Parameter(torch.Tensor(self.output_dim))
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ init.kaiming_uniform_(self.weight)
+ if self.use_bias:
+ init.zeros_(self.bias)
+
+ def forward(self, neighbor_feature):
+ if self.aggr_method == "mean":
+ aggr_neighbor = neighbor_feature.mean(dim=1)
+ elif self.aggr_method == "sum":
+ aggr_neighbor = neighbor_feature.sum(dim=1)
+ elif self.aggr_method == "max":
+            aggr_neighbor = neighbor_feature.max(dim=1)[0]  # max(dim=1) returns (values, indices)
+ else:
+ raise ValueError("Unknown aggr type, expected sum, max, or mean, but got {}"
+ .format(self.aggr_method))
+
+ neighbor_hidden = torch.matmul(aggr_neighbor, self.weight)
+ if self.use_bias:
+ neighbor_hidden += self.bias
+
+ return neighbor_hidden
+
+ def extra_repr(self):
+ return 'in_features={}, out_features={}, aggr_method={}'.format(
+ self.input_dim, self.output_dim, self.aggr_method)
+
+class SageGCN(nn.Module):
+ def __init__(self, input_dim, hidden_dim,
+ activation=F.relu,
+ aggr_neighbor_method="mean",
+ aggr_hidden_method="sum"):
+ """SageGCN层定义
+ Args:
+ input_dim: 输入特征的维度
+ hidden_dim: 隐层特征的维度,
+ 当aggr_hidden_method=sum, 输出维度为hidden_dim
+ 当aggr_hidden_method=concat, 输出维度为hidden_dim*2
+ activation: 激活函数
+ aggr_neighbor_method: 邻居特征聚合方法,["mean", "sum", "max"]
+ aggr_hidden_method: 节点特征的更新方法,["sum", "concat"]
+ """
+ super(SageGCN, self).__init__()
+ assert aggr_neighbor_method in ["mean", "sum", "max"]
+ assert aggr_hidden_method in ["sum", "concat"]
+ self.input_dim = input_dim
+ self.hidden_dim = hidden_dim
+ self.aggr_neighbor_method = aggr_neighbor_method
+ self.aggr_hidden_method = aggr_hidden_method
+ self.activation = activation
+ self.aggregator = NeighborAggregator(input_dim, hidden_dim,
+ aggr_method=aggr_neighbor_method)
+ self.b = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ init.kaiming_uniform_(self.b)
+
+ def forward(self, src_node_features, neighbor_node_features):
+ neighbor_hidden = self.aggregator(neighbor_node_features)
+ self_hidden = torch.matmul(src_node_features, self.b)
+
+ if self.aggr_hidden_method == "sum":
+ hidden = self_hidden + neighbor_hidden
+ elif self.aggr_hidden_method == "concat":
+ hidden = torch.cat([self_hidden, neighbor_hidden], dim=1)
+ else:
+ raise ValueError("Expected sum or concat, got {}"
+ .format(self.aggr_hidden))
+ if self.activation:
+ return self.activation(hidden)
+ else:
+ return hidden
+
+ def extra_repr(self):
+ output_dim = self.hidden_dim if self.aggr_hidden_method == "sum" else self.hidden_dim * 2
+ return 'in_features={}, out_features={}, aggr_hidden_method={}'.format(
+ self.input_dim, output_dim, self.aggr_hidden_method)
+
+class GraphSage(nn.Module):
+ def __init__(self, input_dim, hidden_dim,
+ num_neighbors_list):
+ super(GraphSage, self).__init__()
+ self.input_dim = input_dim
+ self.hidden_dim = hidden_dim
+ self.num_neighbors_list = num_neighbors_list
+ self.num_layers = len(num_neighbors_list)
+ self.gcn = nn.ModuleList()
+ self.gcn.append(SageGCN(input_dim, hidden_dim[0]))
+ for index in range(0, len(hidden_dim) - 2):
+ self.gcn.append(SageGCN(hidden_dim[index], hidden_dim[index+1]))
+ self.gcn.append(SageGCN(hidden_dim[-2], hidden_dim[-1], activation=None))
+
+ def forward(self, node_features_list):
+ hidden = node_features_list
+ for l in range(self.num_layers):
+ next_hidden = []
+ gcn = self.gcn[l]
+ for hop in range(self.num_layers - l):
+ src_node_features = hidden[hop]
+ src_node_num = len(src_node_features)
+ neighbor_node_features = hidden[hop + 1] \
+ .view((src_node_num, self.num_neighbors_list[hop], -1))
+ h = gcn(src_node_features, neighbor_node_features)
+ next_hidden.append(h)
+ hidden = next_hidden
+ return hidden[0]
+
+ def extra_repr(self):
+ return 'in_features={}, num_neighbors_list={}'.format(
+ self.input_dim, self.num_neighbors_list
+ )
+
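+# A hedged shape sketch (random tensors assumed, for illustration only): a two-layer
+# GraphSage with hidden_dim=[16, 7] and num_neighbors_list=[3, 2] expects one feature
+# tensor per hop, of sizes [B, B*3, B*3*2], and returns class scores of shape [B, 7].
+def _graphsage_shape_demo(batch_size=4, input_dim=8):
+    demo_model = GraphSage(input_dim, [16, 7], [3, 2])
+    feats = [torch.randn(batch_size, input_dim),
+             torch.randn(batch_size * 3, input_dim),
+             torch.randn(batch_size * 3 * 2, input_dim)]
+    out = demo_model(feats)
+    assert out.shape == (batch_size, 7)
+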
+# Data handling
+import os
+import os.path as osp
+import pickle
+import itertools
+import scipy.sparse as sp
+import urllib
+from collections import namedtuple
+import numpy as np
+
+Data = namedtuple('Data',['x','y','adjacency_dict','train_mask','val_mask','test_mask'])
+
+class CoraData(object):
+ download_url = "https://github.com/kimiyoung/planetoid/raw/master/data"
+ filenames = ["ind.cora.{}".format(name) for name in ['x','tx','allx','y','ty','ally','graph','test.index']]
+
+ def __init__(self,data_root="cora",rebuild=False):
+ """Cora数据,包括数据下载,处理,加载等功能
+ 当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘
+ 处理之后的数据可以通过属性.data获得,它将返回一个数据对象,包括如下几部分:
+ * x:节点的特征,维度为2708*1433,类型为np.ndarray
+ * y:节点的标签,总共包括7个类别,类型为np.ndarray
+ * adjacency_dict:邻接信息,类型为dict
+ * train_mask:训练集掩码向量,维度为2708,当节点属于训练集时,相应位置为True,否则False
+ * val_mask:验证集掩码向量,维度为2708,当节点属于验证集时,相应位置为True,否则False
+ * test_mask: 测试集掩码向量,维度为2708,当节点属于测试集时,相应位置为True,否则False
+
+ Args:
+ ------
+ data_root:string, optional
+ 存放数据的目录,原始数据路径:{data_root}/raw
+ 缓存数据路径:{data_root}/processed_cora.pkl
+ rebuild:boolean,optional
+ 是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据
+ """
+ self.data_root=data_root
+ save_file=osp.join(self.data_root,"processed_cora.pkl")
+ if osp.exists(save_file) and not rebuild:
+ print("Using Cached file: {}".format(save_file))
+ self._data = pickle.load(open(save_file,"rb"))
+ else:
+ self.maybe_download()
+ self._data=self.process_data()
+ with open(save_file,"wb") as f:
+ pickle.dump(self.data,f)
+ print("Cached file: {}".format(save_file))
+
+ @property
+ def data(self):
+ """返回Data数据对象,包括x, y, adjacency, train_mask, val_mask, test_mask"""
+ return self._data
+
+    def process_data(self):
+        """
+        Process the raw files into node features, labels, the adjacency dict and the
+        train/validation/test masks.
+        Adapted from: https://github.com/rusty1s/pytorch_geometric
+        """
+ print("Process data ...")
+ _,tx,allx,y,ty,ally,graph,test_index=[self.read_data(
+ osp.join(self.data_root,"raw",name)) for name in self.filenames]
+ train_index = np.arange(y.shape[0])
+ val_index = np.arange(y.shape[0],y.shape[0]+500)
+ sorted_test_index = sorted(test_index)
+
+ x=np.concatenate((allx,tx),axis=0)
+ y=np.concatenate((ally,ty),axis=0).argmax(axis=1)
+
+ x[test_index] = x[sorted_test_index]
+ y[test_index]= y[sorted_test_index]
+ num_nodes=x.shape[0]
+
+        train_mask=np.zeros(num_nodes,dtype=bool)
+        val_mask=np.zeros(num_nodes,dtype=bool)
+        test_mask = np.zeros(num_nodes, dtype=bool)
+ train_mask[train_index] = True
+ val_mask[val_index] = True
+ test_mask[test_index] = True
+ adjacency_dict=graph
+ print("Node's feature shape:",x.shape)
+ print("Node's label shape: ", y.shape)
+ print("Adjacency's shape: ", len(adjacency_dict))
+ print("Number of training nodes: ", train_mask.sum())
+ print("Number of validation nodes: ", val_mask.sum())
+ print("Number of test nodes: ", test_mask.sum())
+ return Data(x=x, y=y, adjacency_dict=adjacency_dict,
+ train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
+
+ def maybe_download(self):
+ save_path = os.path.join(self.data_root, "raw")
+ for name in self.filenames:
+ if not osp.exists(osp.join(save_path, name)):
+ self.download_data("{}/{}".format(self.download_url, name), save_path)
+
+ @staticmethod
+ def build_adjacency(adj_dict):
+ """根据邻接表创建邻接矩阵"""
+ edge_index = []
+ num_nodes = len(adj_dict)
+ for src, dst in adj_dict.items():
+ edge_index.extend([src, v] for v in dst)
+ edge_index.extend([v, src] for v in dst)
+        # remove duplicate edges
+ edge_index = list(k for k, _ in itertools.groupby(sorted(edge_index)))
+ edge_index = np.asarray(edge_index)
+ adjacency = sp.coo_matrix((np.ones(len(edge_index)),(edge_index[:, 0], edge_index[:, 1])),
+ shape=(num_nodes, num_nodes), dtype="float32")
+ return adjacency
+
+ @staticmethod
+ def read_data(path):
+ """使用不同的方式读取原始数据以进一步处理"""
+ name = osp.basename(path)
+ if name == "ind.cora.test.index":
+ out = np.genfromtxt(path, dtype="int64")
+ return out
+ else:
+ out = pickle.load(open(path, "rb"), encoding="latin1")
+ out = out.toarray() if hasattr(out, "toarray") else out
+ return out
+
+ @staticmethod
+ def download_data(url, save_path):
+ """数据下载工具,当原始数据不存在时将会进行下载"""
+ # print(save_path)
+ if not os.path.exists(save_path):
+ os.makedirs(save_path)
+ data = urllib.request.urlopen(url)
+ filename = os.path.split(url)[-1]
+
+ with open(os.path.join(save_path, filename), 'wb') as f:
+ f.write(data.read())
+ # print(os.path.join(save_path, filename))
+ return True
+
+# data = CoraData().data
+
+# Main
+import torch
+
+import numpy as np
+import torch.nn as nn
+import torch.optim as optim
+from collections import namedtuple
+
+# Data preparation
+Data = namedtuple('Data', ['x', 'y', 'adjacency_dict','train_mask', 'val_mask', 'test_mask'])
+
+data = CoraData().data
+x = data.x / data.x.sum(1, keepdims=True) # normalize so that each row of the feature matrix sums to 1
+
+train_index = np.where(data.train_mask)[0]
+train_label = data.y[train_index]
+test_index = np.where(data.test_mask)[0]
+
+# Model initialization
+INPUT_DIM = 1433  # input feature dimension
+# Note: the number of sampling hops must match the number of GCN layers
+HIDDEN_DIM = [128, 7]  # hidden layer sizes
+NUM_NEIGHBORS_LIST = [10, 10]  # number of neighbors sampled at each hop
+assert len(HIDDEN_DIM) == len(NUM_NEIGHBORS_LIST)
+BATCH_SIZE = 16  # batch size
+EPOCHS = 20
+NUM_BATCH_PER_EPOCH = 20  # number of batches per epoch
+LEARNING_RATE = 0.01  # learning rate
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = GraphSage(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM,
+ num_neighbors_list=NUM_NEIGHBORS_LIST).to(DEVICE)
+print(model)
+criterion = nn.CrossEntropyLoss().to(DEVICE)
+optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=5e-4)
+
+# Training and testing
+def train():
+ model.train()
+ for e in range(EPOCHS):
+ for batch in range(NUM_BATCH_PER_EPOCH):
+            batch_src_index = np.random.choice(train_index, size=(BATCH_SIZE,))
+ batch_src_label = torch.from_numpy(train_label[batch_src_index]).long().to(DEVICE)
+ batch_sampling_result = multihop_sampling(batch_src_index, NUM_NEIGHBORS_LIST, data.adjacency_dict)
+ batch_sampling_x = [torch.from_numpy(x[idx]).float().to(DEVICE) for idx in batch_sampling_result]
+ batch_train_logits = model(batch_sampling_x)
+ loss = criterion(batch_train_logits, batch_src_label)
+ optimizer.zero_grad()
+            loss.backward() # backpropagate to compute parameter gradients
+            optimizer.step() # update the parameters with the optimizer
+ print("Epoch {:03d} Batch {:03d} Loss: {:.4f}".format(e, batch, loss.item()))
+ test()
+
+
+def test():
+ model.eval()
+ with torch.no_grad():
+ test_sampling_result = multihop_sampling(test_index, NUM_NEIGHBORS_LIST, data.adjacency_dict)
+ test_x = [torch.from_numpy(x[idx]).float().to(DEVICE) for idx in test_sampling_result]
+ test_logits = model(test_x)
+ test_label = torch.from_numpy(data.y[test_index]).long().to(DEVICE)
+ predict_y = test_logits.max(1)[1]
+        accuracy = torch.eq(predict_y, test_label).float().mean().item()
+        print("Test Accuracy: ", accuracy)
+
+train() \ No newline at end of file
diff --git a/code/hunterGraph.py b/code/hunterGraph.py
new file mode 100644
index 0000000..6332552
--- /dev/null
+++ b/code/hunterGraph.py
@@ -0,0 +1,286 @@
+import networkx as nx
+import csv
+
+def show_connected_subgraphs(edge_file):
+ G = nx.Graph()
+ edge_list=[]
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges = csv.DictReader(edgefile)
+ for edge in edges:
+ edge_list.append((edge['node1'],edge['node2']))
+ G.add_edges_from(edge_list)
+
+ largest = max(nx.connected_components(G),key=len)
+ largest_connected_subgraph = G.subgraph(largest)
+ node_num_list=[]
+ edge_num_list=[]
+
+ for c in sorted(nx.connected_components(G),key=len,reverse=True):
+ subgraph=G.subgraph(c)
+ node_num_list.append(nx.number_of_nodes(subgraph))
+ edge_num_list.append(nx.number_of_edges(subgraph))
+ # with open("subgraph_edges.txt", 'a+', encoding="utf-8") as f:
+ # f.write(str(subgraph.edges)+"\n")
+
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ x = np.array(node_num_list)
+ y = np.array(edge_num_list)
+
+ plt.xlabel("nodes")
+ plt.ylabel("edges")
+ plt.scatter(x, y)
+ plt.show()
+
+def node_type_count(node_file):
+ node_count_dict={}
+ with open(node_file, 'r', encoding="utf-8") as nodefile:
+ nodes = csv.DictReader(nodefile)
+ for node in nodes:
+ if node["type"] in node_count_dict:
+ node_count_dict[node["type"]]+=1
+ else:
+ node_count_dict[node["type"]]=1
+ print(node_count_dict)
+
+import pandas as pd
+def benign_fraud_count(node_file,subgraph_node_file):
+ nodes = pd.read_csv(node_file, encoding='utf-8')
+ fraud_count=0
+ benign_count=0
+ domain_count=0
+ IP_count=0
+ x_mailer_count=0
+ with open(subgraph_node_file, 'r', encoding="utf-8") as f:
+ line=f.readline().strip()
+ line=line.replace("['","")
+ line=line.replace("']","")
+ sub_nodes=line.split("', '")
+ # print(nodes['index'].dtypes)
+ for sub_node in sub_nodes:
+ # print(type(sub_node))
+ node_index=nodes[(nodes['index']==int(sub_node))].index.tolist()[0]
+ node_name=nodes.at[node_index,'name']
+ node_type = nodes.at[node_index, 'type']
+ if node_type==0:
+ node_num=node_name[5:]
+ if int(node_num) <= 6550 and int(node_num) >=6264:
+ fraud_count+=1
+ else:
+ benign_count+=1
+ else:
+
+ if node_type==1 :
+ domain_count+=1
+ elif node_type==2:
+ IP_count+=1
+ else:
+ x_mailer_count+=1
+
+ print("fraud: "+str(fraud_count))
+ print("benign: "+str(benign_count))
+ print("domain: "+str(domain_count))
+ print("IP: "+str(IP_count))
+ print("x-mailer: "+ str(x_mailer_count))
+ # node2_index = nodes[(nodes['name'] == edge['node2'])].index.tolist()[0]
+ # edge_list.append(str(node1_index)+","+str(node2_index))
+
+def merge_meta_path(edge_file,meta_path_file):
+ G = nx.Graph()
+ edge_list=[]
+ with open(edge_file, 'r', encoding="utf-8") as edgefile:
+ edges = csv.DictReader(edgefile)
+ for edge in edges:
+ edge_list.append((edge['node1'],edge['node2']))
+ G.add_edges_from(edge_list)
+
+ # largest = max(nx.connected_components(G),key=len)
+ # largest_connected_subgraph = G.subgraph(largest)
+ subgraph_edges=list(G.edges)
+ # print(subgraph_edges)
+ meta_path_list=[]
+ for i in range(0,len(subgraph_edges)):
+ for j in range(0,len(subgraph_edges)):
+ if subgraph_edges[i][1] == subgraph_edges[j][0]:
+ meta_path_list.append((subgraph_edges[i][0],subgraph_edges[i][1],subgraph_edges[j][1]))
+ elif subgraph_edges[i][1] == subgraph_edges[j][1]:
+ meta_path_list.append((subgraph_edges[i][0],subgraph_edges[i][1],subgraph_edges[j][0]))
+ print(meta_path_list)
+ with open(meta_path_file, 'w', encoding="utf-8") as f:
+ f.write("node1,path,node2\n")
+ for meta_path in meta_path_list:
+ f.write(meta_path[0]+","+meta_path[1]+","+meta_path[2]+"\n")
+
+def new_index_to_subgraph(edge_file,subgraph_index_file,node_file):
+    nodes = pd.read_csv(node_file, encoding='utf-8')
+    G = nx.Graph()
+    edge_list=[]
+    with open(edge_file, 'r', encoding="utf-8") as edgefile:
+        edges = csv.DictReader(edgefile)
+        for edge in edges:
+            edge_list.append((edge['node1'],edge['node2']))
+    G.add_edges_from(edge_list)
+
+    # largest = max(nx.connected_components(G),key=len)
+    # largest_connected_subgraph = G.subgraph(largest)
+    subgraph_edges=list(G.edges)
+    with open(subgraph_index_file,'w',encoding='utf-8') as new_index_file:
+        new_index_file.write("oldIndex,newIndex,label\n")
+        index=1
+        new_node_dict={}
+        for edge in subgraph_edges:
+            # assign a new index to every email node (type 0) that has not been seen yet;
+            # both endpoints of the edge are handled the same way
+            for old_index in edge:
+                node_index = nodes[(nodes['index'] == int(old_index))].index.tolist()[0]
+                node_name = nodes.at[node_index, 'name']
+                node_type = nodes.at[node_index, 'type']
+                if node_type != 0 or node_name[5:] in new_node_dict:
+                    continue
+                new_node_dict[node_name[5:]] = str(index)
+                # emails numbered below 2010 are fraud (label 1), the rest benign (label 0)
+                label = "1" if int(node_name[5:]) < 2010 else "0"
+                new_index_file.write(old_index + "," + str(index) + "," + label + "\n")
+                index += 1
+
+
+def split_meta_path(node_file,meta_path_file,index_file):
+ nodes = pd.read_csv(node_file, encoding='utf-8')
+ indexes = pd.read_csv(index_file,encoding='utf-8')
+ EDE_list=[]
+ EIE_list=[]
+ EXE_list=[]
+ with open(meta_path_file, 'r', encoding="utf-8") as f:
+ paths=csv.DictReader(f)
+ for path in paths:
+ node_index=nodes[(nodes['index']==int(path['path']))].index.tolist()[0]
+ node_type = nodes.at[node_index, 'type']
+ if node_type == 1:
+ EDE_list.append((path['node1'],path['node2']))
+ elif node_type == 2:
+ EIE_list.append((path['node1'],path['node2']))
+ elif node_type == 3:
+ EXE_list.append((path['node1'],path['node2']))
+ with open("EDE_list.csv",'w',encoding="utf-8") as f:
+ f.write("eml1,eml2\n")
+ for ede in EDE_list:
+ # node1_name = nodes.at[int(ede[0]),'name']
+ # node1_num = node1_name[5:]
+ new_node_index = indexes[(indexes['oldIndex'] == int(ede[0]))].index.tolist()[0]
+ node1_num = indexes.at[new_node_index,'newIndex']
+ # node2_name = nodes.at[int(ede[1]),'name']
+ # node2_num = node2_name[5:]
+ new_node_index = indexes[(indexes['oldIndex'] == int(ede[1]))].index.tolist()[0]
+ node2_num = indexes.at[new_node_index, 'newIndex']
+ f.write(str(node1_num)+","+str(node2_num)+"\n")
+ with open("EIE_list.csv",'w',encoding="utf-8") as f:
+ f.write("eml1,eml2\n")
+ for eie in EIE_list:
+ # node1_name = nodes.at[int(eie[0]), 'name']
+ # node1_num = node1_name[5:]
+ new_node_index = indexes[(indexes['oldIndex'] == int(eie[0]))].index.tolist()[0]
+ node1_num = indexes.at[new_node_index, 'newIndex']
+ # node2_name = nodes.at[int(eie[1]), 'name']
+ # node2_num = node2_name[5:]
+ new_node_index = indexes[(indexes['oldIndex'] == int(eie[1]))].index.tolist()[0]
+ node2_num = indexes.at[new_node_index, 'newIndex']
+ f.write(str(node1_num)+","+str(node2_num)+"\n")
+ with open("EXE_list.csv",'w',encoding="utf-8") as f:
+ f.write("eml1,eml2\n")
+ for exe in EXE_list:
+ # node1_name = nodes.at[int(exe[0]), 'name']
+ # node1_num = node1_name[5:]
+ new_node_index = indexes[(indexes['oldIndex'] == int(exe[0]))].index.tolist()[0]
+ node1_num = indexes.at[new_node_index, 'newIndex']
+ # node2_name = nodes.at[int(exe[1]), 'name']
+ # node2_num = node2_name[5:]
+ new_node_index = indexes[(indexes['oldIndex'] == int(exe[1]))].index.tolist()[0]
+ node2_num = indexes.at[new_node_index, 'newIndex']
+ f.write(str(node1_num) + "," + str(node2_num) + "\n")
+
+import numpy as np
+def meta_path_to_matrix(meta_path_file):
+ num = [[0 for i in range(0, 6975)] for j in range(0, 6975)]
+ with open(meta_path_file, 'r') as f:
+ cols = csv.DictReader(f)
+ for col in cols:
+ num[int(col["eml1"])-1][int(col["eml2"])-1]=1
+ num[int(col["eml2"])-1][int(col["eml1"])-1] = 1
+ for i in range(0,6975):
+ num[i][i] = 1
+ arr = np.array(num)
+ return arr
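+
+# A hedged sanity-check sketch (assumes the EDE_list.csv produced by split_meta_path exists):
+# the meta-path adjacency built above is symmetric and carries self-loops on its diagonal.
+def _check_meta_path_matrix(meta_path_file="EDE_list.csv"):
+    arr = meta_path_to_matrix(meta_path_file)
+    assert (arr == arr.T).all() and (np.diag(arr) == 1).all()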
+
+def extract_label(label_file):
+ num = [[0 for i in range(0, 2)] for j in range(0, 6975)]
+ with open(label_file, 'r') as f:
+ cols = csv.DictReader(f)
+ for col in cols:
+ if int(col["label"]) == 1:
+ num[int(col["newIndex"])-1][0] = 1
+ elif int(col["label"]) ==0:
+ num[int(col["newIndex"])-1][1] =1
+ arr = np.array(num)
+ return arr
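+
+# Note: extract_label returns one-hot labels of shape (6975, 2); column 0 marks fraud
+# emails (label 1 in subgraph_index.csv) and column 1 marks benign emails (label 0).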
+
+import random
+def generate_features():
+ features = [[0 for i in range(0, 8)] for j in range(0, 6975)]
+ for i in range(0,6975):
+ length=random.randint(1,8)
+ for j in range(0,length):
+ loc = random.randint(0,7)
+ features[i][loc]=1
+ features = np.array(features)
+ return features
+
+from scipy.io import savemat
+def save_data(EDE_file,EIE_file,EXE_file,label_file,mat_file):
+ shuffled_index = np.random.permutation(6975)
+ split_index1 = int(6975 * 0.6)
+ split_index2 = int(6975*0.8)
+ train_index = shuffled_index[:split_index1]
+ train_idx = np.array([train_index])
+ val_index = shuffled_index[split_index1:split_index2]
+ val_idx = np.array([val_index])
+ test_index = shuffled_index[split_index2:]
+ test_idx = np.array([test_index])
+ label = extract_label(label_file)
+ EDE = meta_path_to_matrix(EDE_file)
+ EIE = meta_path_to_matrix(EIE_file)
+ EXE = meta_path_to_matrix(EXE_file)
+ features = generate_features()
+ savemat(mat_file,{'EIE':EIE,'EDE':EDE,'EXE':EXE,'features':features,'label':label,'train_idx':train_idx,'val_idx':val_idx,'test_idx':test_idx})
+
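+# A hedged loading sketch (assumes the .mat produced above; loadmat returns plain numpy arrays):
+def _load_sg_dataset(mat_file="SG_dataset.mat"):
+    from scipy.io import loadmat
+    data = loadmat(mat_file)
+    return data["EDE"], data["EIE"], data["EXE"], data["features"], data["label"]
+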
+
+
+if __name__ =="__main__":
+ # benign_fraud_count("hunter_node.csv","first_subgraph_nodes1.txt")
+ # node_type_count("hunter_node.csv")
+ # show_connected_subgraphs("hunter_edge_index_only.csv")
+ # merge_meta_path("hunter_edge_index_only.csv","meta_path_original.csv")
+ # split_meta_path("hunter_node.csv","meta_path_original.csv","subgraph_index.csv")
+ # meta_path_to_matrix("EDE_list.csv")
+ # new_index_to_subgraph("hunter_edge_index_only.csv","subgraph_index.csv","hunter_node.csv")
+ # extract_label("subgraph_index.csv")
+ save_data("EDE_list.csv","EIE_list.csv","EXE_list.csv","subgraph_index.csv","SG_dataset.mat")
+ # generate_features() \ No newline at end of file
diff --git a/code/parseEml.py b/code/parseEml.py
new file mode 100644
index 0000000..a5bd50e
--- /dev/null
+++ b/code/parseEml.py
@@ -0,0 +1,324 @@
+# -*- coding: utf-8 -*-
+import email
+import os
+import re
+import csv
+import json
+
+# def sort_key(received_header):
+# received_date = email.utils.parsedate_tz(received_header)
+# return received_date
+
+class parseEml:
+
+ def __init__(self, email_path):
+ self.email_path=email_path
+ self.msg=self.get_message()
+
+    # read the email file and print it line by line
+ def read_mail(self):
+ if os.path.exists(self.email_path):
+ with open(self.email_path) as fp:
+ for line in fp:
+ print(line)
+ else:
+            print('File does not exist!')
+
+    # parse the email into a message object
+ def get_message(self):
+ if os.path.exists(self.email_path):
+ fp = open(self.email_path, 'r', encoding='UTF-8',errors='ignore')
+ # print(self.email_path)
+ return email.message_from_file(fp)
+ else:
+            print('File does not exist!')
+
+    # get the Received headers of the email
+ def get_received(self):
+ if self.msg != None:
+ received_array=self.msg.get_all('Received')
+ return received_array
+ else:
+ print('msg is empty!')
+
+ def get_message_id(self):
+ if self.msg != None:
+ return self.msg.get('Message-ID')
+ else:
+ print('msg is empty!')
+
+ def get_from(self):
+ if self.msg != None:
+ return self.msg.get('From')
+ else:
+ print('msg is empty!')
+
+ def get_to(self):
+ if self.msg != None:
+ return self.msg.get('To')
+ else:
+ print('msg is empty!')
+
+ def get_x_mailer(self):
+ if self.msg != None:
+ return self.msg.get('X-Mailer')
+ else:
+ print('msg is empty!')
+
+ def get_from_host_list(self):
+ if self.msg != None:
+ from_host_list=[]
+ received_array=self.msg.get_all('Received')
+ if received_array == None:
+ return None
+ for received in received_array:
+ received = re.sub(r'\n\s*', " ", received)
+ # print(received)
+ if "from" in received and "by" in received:
+ from_host = received[received.find("from") + 5:received.find("by")].strip()
+ from_host_list.append(from_host)
+ # print(from_host)
+ return from_host_list
+ else:
+ print('msg is empty!')
+
+ def get_dkim(self):
+ if self.msg != None:
+ return self.msg.get('DKIM-Signature')
+ else:
+ print('msg is empty!')
+
+ def get_auth_results(self):
+ if self.msg != None:
+ return self.msg.get('Authentication-Results')
+ else:
+ print('msg is empty!')
+
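+# A hedged usage sketch ("sample.eml" is a placeholder path, not part of the repository):
+# parse one message and print its sender plus every "from ... by ..." Received hop.
+def _parse_eml_demo(email_path="sample.eml"):
+    mail = parseEml(email_path)
+    print(mail.get_from())
+    for hop in mail.get_from_host_list() or []:
+        print(hop)
+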
+def list_to_file(alist,file_name):
+ with open(file_name, 'a+') as f:
+ for i in alist:
+ f.write(i + '\n')
+
+def list_to_csv(alist,file_name,file_num):
+ with open(file_name, 'a+', newline='') as f:
+ writer = csv.writer(f)
+ for i in alist:
+ writer.writerow([i,file_num])
+
+def extract_from_host(folder_path,file_name):
+ from_host_list=[]
+    files = os.listdir(folder_path) # list all file names under the folder
+    for file in files: # iterate over the folder
+ received_array = parseEml(folder_path+"/"+ file).get_received()
+ for received in received_array:
+ received = re.sub(r'\n\s*', " ", received)
+ # print(received)
+ if "from" in received and "by" in received:
+ from_host = received[received.find("from") + 5:received.find("by")].strip()
+ # if from_host=="":
+ # print(received)
+ from_host_list.append(from_host)
+ # print(from_host)
+ list_to_file(from_host_list,file_name)
+
+def extract_message_id(folder_path,file_name):
+    files = os.listdir(folder_path) # list all file names under the folder
+    record_list=[]
+    for file in files: # iterate over the folder
+ if file == "duplicate":
+ continue
+ file_num = file.replace(".eml", "")
+ mail=parseEml(folder_path+"/"+ file)
+ message_id=mail.get_message_id()
+ sender=mail.get_from()
+ receiver=mail.get_to()
+ record={"Message-ID":message_id,"From":sender,"To":receiver,"num:":file_num}
+ record_list.append(record)
+ with open(file_name, 'a+', newline='',encoding='utf-8') as f:
+ json.dump(record_list,f)
+
+def extract_x_mailer(folder_path,file_name):
+    files = os.listdir(folder_path) # list all file names under the folder
+    record_list = []
+    for file in files: # iterate over the folder
+ if file == "duplicate":
+ continue
+ file_num = file.replace(".eml", "")
+ mail = parseEml(folder_path + "/" + file)
+ x_mailer=mail.get_x_mailer()
+ if x_mailer != None:
+ message_id=mail.get_message_id()
+ sender = mail.get_from()
+ receiver = mail.get_to()
+ record = {"Message-ID":message_id,"X-Mailer": x_mailer, "From": sender, "To": receiver, "num:": file_num}
+ record_list.append(record)
+ with open(file_name, 'a+', newline='', encoding='utf-8') as f:
+ json.dump(record_list, f)
+
+def extract_received_x_mailer(folder_path,file_name):
+ files=os.listdir(folder_path)
+    for file in files: # iterate over the folder
+ record = ""
+ if file == "duplicate":
+ continue
+ file_num = file.replace(".eml", "")
+ mail = parseEml(folder_path + "/" + file)
+ received_array=mail.get_received()
+ x_mailer = mail.get_x_mailer()
+ for received in received_array:
+ received = re.sub(r'\n\s*', " ", received)
+ received=received[:received.find(";")]
+ record+="Received: "+received+"; "
+ if x_mailer != None:
+ record+="X-Mailer: "+x_mailer
+ with open(file_name, 'a+', newline='', encoding='utf-8') as f:
+ f.writelines(record)
+ f.writelines("\n")
+
+def to_count(folder_path):
+ files=os.listdir(folder_path)
+ receiver_list=[]
+    for file in files: # iterate over the folder
+ if file == "duplicate":
+ continue
+ mail=parseEml(folder_path + "/" + file)
+ receiver=mail.get_to()
+ receiver_list.append(receiver)
+    count_dict = {}  # avoid shadowing the built-in dict
+    for key in receiver_list:
+        count_dict[key] = count_dict.get(key, 0) + 1
+    print(count_dict)
+ print(max(set(receiver_list), key=receiver_list.count))
+
+def extract_to(folder_path,file_name):
+ files = os.listdir(folder_path)
+ receiver_list = []
+    for file in files: # iterate over the folder
+ if file == "duplicate":
+ continue
+ mail = parseEml(folder_path + "/" + file)
+ receiver = mail.get_to()
+ if receiver != None:
+ receiver_list.append(receiver)
+ list_to_file(receiver_list,file_name)
+
+def read_template(template_file):
+    f = open(template_file) # returns a file object
+    line = f.readline() # read the file line by line with readline()
+ line_list = []
+ while line:
+ line1 = line.strip()
+ if line1:
+ line_list.append(line1)
+ line = f.readline()
+ f.close()
+ return line_list
+
+def extract_edge(email_path,template_list):
+ mail=parseEml(email_path)
+ received_array = mail.get_received()
+ receiver=mail.get_to()
+ if receiver:
+ receiver=receiver.replace("\"","\\\"")
+ else:
+ receiver=""
+ # print(received_array)
+ edge_list = []
+ from_host_list=[]
+ for received in received_array:
+ received = re.sub(r'\n\s*', " ", received)
+ # print(received)
+ if "from" in received and "by" in received:
+ from_host = received[received.find("from") + 5:received.find("by")].strip()
+ for template in template_list:
+ if "<:*:>" in template:
+ template = template.replace("<:*:>", ".*")
+ if "(" in template:
+ template = template.replace("(", "\(")
+ if ")" in template:
+ template=template.replace(")","\)")
+ if "[" in template:
+ template=template.replace("[","\[")
+ if re.match(template, from_host):
+ from_host=template
+ break
+ from_host=from_host.replace("\"","\\\"")
+ from_host_list.append(from_host)
+ # print(from_host)
+ num=len(from_host_list)
+ if num > 0:
+ last_received=received_array[0]
+ last_node=last_received[last_received.find("by")+3:last_received.find("with")].strip()
+ edge_list.append("\""+last_node+"\" -> \""+receiver+"\"")
+ edge_list.append("\""+from_host_list[0]+"\" -> \""+last_node+"\"")
+ if num >=2:
+ for i in range(0,num-2):
+ edge_list.append("\""+from_host_list[i+1]+"\" -> \""+from_host_list[i]+"\"")
+ return edge_list
+
+def extract_path(email_path,email_num):
+ mail = parseEml(email_path)
+ received_array = mail.get_received()
+ sender=mail.get_from()
+ receiver = mail.get_to()
+ from_host_list = []
+ path=""
+ for received in received_array:
+ received = re.sub(r'\n\s*', " ", received)
+ # print(received)
+ if "from" in received and "by" in received:
+ from_host = received[received.find("from") + 5:received.find("by")].strip()
+ from_host_list.append(from_host)
+ # print(from_host)
+ num = len(from_host_list)
+ if num > 0:
+ last_received = received_array[0]
+ last_node = last_received[last_received.find("by") + 3:last_received.find("with")].strip()
+ if receiver:
+ path= last_node + "," + receiver
+ else:
+ path=last_node+","
+ path= from_host_list[0] + " -> " + path
+ if num >= 2:
+ for i in range(1, num):
+ path=from_host_list[i] + " -> " + path
+ path=sender+","+path
+ path=email_num+ ","+ path
+ return path
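+
+# Example of a produced row (illustrative values only, not taken from the dataset):
+# "42,[email protected],mail.example.org -> mx.example.net,[email protected]"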
+
+def extract_all_edges(email_folder,template_path,file_name):
+ template_list = read_template(template_path)
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the folder
+ if file == "duplicate":
+ continue
+ mail_path = extract_edge(email_folder + "/" + file,template_list)
+ with open(file_name, 'a+') as f:
+ for i in mail_path:
+ f.write(i + ' [color=red];\n')
+
+def extract_all_paths(email_folder,file_name):
+ with open(file_name, 'a+') as f:
+ f.write('email_num,from,path,receiver\n')
+ files = os.listdir(email_folder)
+    for file in files: # iterate over the folder
+ if file == "duplicate":
+ continue
+ mail_path = extract_path(email_folder + "/" + file,file.replace(".eml", ""))
+ with open(file_name, 'a+') as f:
+ f.write(mail_path + '\n')
+
+def delete_once_node(edge_file):
+    f = open(edge_file) # returns a file object
+    line = f.readline() # read the file line by line with readline()
+ line_list = []
+ while line:
+ line1 = line.strip()
+ if line1:
+ line_list.append(line1)
+ line = f.readline()
+ f.close()
+ return line_list
+
+if __name__=="__main__":
+ extract_all_paths("nazario_phishing_2021","nazario_paths.csv") \ No newline at end of file