| author | unknown <[email protected]> | 2023-07-29 11:20:27 +0800 |
|---|---|---|
| committer | unknown <[email protected]> | 2023-07-29 11:20:27 +0800 |
| commit | 7592577acc00163e98b45bba86ef76bd37f93854 (patch) | |
| tree | 671fdf3b4715241c84d5b428a4b5dcaa37ac9d6b /code | |
| parent | 5bee0b3e68ef600fea9fe8cbaca0d512a8d21998 (diff) | |
reorganize
Diffstat (limited to 'code')
| -rw-r--r-- | code/analysisDataset.py | 47 |
| -rw-r--r-- | code/buildGraph.py | 592 |
| -rw-r--r-- | code/buildGraph2.py | 106 |
| -rw-r--r-- | code/buildGraphviz.py | 24 |
| -rw-r--r-- | code/buildSubGraph.py | 72 |
| -rw-r--r-- | code/doStatistics.py | 25 |
| -rw-r--r-- | code/drawGraph.py | 81 |
| -rw-r--r-- | code/edgeClassification.py | 268 |
| -rw-r--r-- | code/edgeClassificationEmailGraph.py | 297 |
| -rw-r--r-- | code/emailPreparation.py | 57 |
| -rw-r--r-- | code/graphsage.py | 390 |
| -rw-r--r-- | code/hunterGraph.py | 286 |
| -rw-r--r-- | code/parseEml.py | 324 |
13 files changed, 2569 insertions, 0 deletions
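Every script in this commit imports `parseEml`; the module is listed in the diffstat (code/parseEml.py, 324 lines) but its hunk is not reproduced in this excerpt. For orientation only, here is a minimal sketch of the accessor interface the call sites below rely on. The class and method names are taken from those call sites; the implementation (the standard `email` parser, the exact header names) is an assumption, not the committed code:

```python
# Hypothetical stand-in for code/parseEml.py, inferred from how the other scripts call it.
from email import policy
from email.parser import BytesParser

class parseEml:
    def __init__(self, eml_path):
        with open(eml_path, "rb") as f:
            self.msg = BytesParser(policy=policy.default).parse(f)

    def get_from(self):            # raw From: header
        return self.msg.get("From")

    def get_from_host_list(self):  # all Received: headers (one per relay hop)
        return self.msg.get_all("Received")

    def get_message_id(self):
        return self.msg.get("Message-ID")

    def get_x_mailer(self):
        return self.msg.get("X-Mailer")

    def get_dkim(self):
        return self.msg.get("DKIM-Signature")

    def get_auth_results(self):
        return self.msg.get("Authentication-Results")
```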
diff --git a/code/analysisDataset.py b/code/analysisDataset.py
new file mode 100644
index 0000000..34c7a06
--- /dev/null
+++ b/code/analysisDataset.py
@@ -0,0 +1,47 @@
+from parseEml import parseEml
+import os
+
+def read_spf(email_path):
+    mail = parseEml(email_path)
+    auth_result = mail.get_auth_results()
+    if auth_result and "spf=" in auth_result:
+        tmp_list = auth_result.split("spf=")
+        spf_result=tmp_list[1].split(" ")[0]
+    else:
+        spf_result=None
+    return spf_result
+
+# Input: path to a folder of .eml files
+# Output: prints a dict counting the SPF authentication results
+def spf_count(email_folder):
+    spf_count_dict={}
+    files = os.listdir(email_folder)
+    print(email_folder+":\ntotal: "+str(len(files)))
+    for file in files:  # iterate over the folder
+        if file == "duplicate":
+            continue
+        spf_result=read_spf(email_folder+"/"+file)
+        if spf_result in spf_count_dict:
+            spf_count_dict[spf_result]+=1
+        else:
+            spf_count_dict[spf_result]=1
+    print(spf_count_dict)
+
+# Input: file listing the intermediate domains, one per line
+# Output: prints occurrence counts per domain, most frequent first
+def inter_domain_count(inter_domain_file):
+    count_dict={}
+    with open(inter_domain_file,"r",encoding="utf-8") as f:
+        line = f.readline()
+        while line:
+            line=line.strip()
+            if line in count_dict:
+                count_dict[line]+=1
+            else:
+                count_dict[line]=1
+            line = f.readline()
+    a = sorted(count_dict.items(), key=lambda x: x[1], reverse=True)
+    print(a)
+
+if __name__ == "__main__":
+    inter_domain_count("inter_domain.txt")
\ No newline at end of file diff --git a/code/buildGraph.py b/code/buildGraph.py new file mode 100644 index 0000000..8cb8802 --- /dev/null +++ b/code/buildGraph.py @@ -0,0 +1,592 @@ +from parseEml import parseEml +import os +import re +from shutil import copyfile + +def extract_node_edge(email_path): + # 节点类型:0:sender domain;1:inter domain;2:IP;3:client + # 边类型:0:sender domain-inter domain;1:inter domain-IP或者IP-inter domain; + # 边类型:2:sender domain-message id domain; 3:sender domain-x_mailer; 4:sender domain-dkim domain + # inter_node_list[每跳域名、IP为一个set] + node_list=set() + edge_list=[] + mail=parseEml(email_path) + raw_node_list=mail.get_from_host_list() + raw_node_list.insert(0,mail.get_from()) + # print(raw_node_list) + inter_node_list=[] + sender_domain=None + for node in raw_node_list: + if '@' in node: + node=node.split('@')[-1] + if '>' in node: + node=node.split(">")[0] + if ')' in node: + node = node.split(")")[0] + if ',' in node: + node=node.replace(","," ") + sender_domain=node + # if "kennadi" in sender_domain: + # print(email_path) + node_list.add(node+",0") + else: + inter_domain_ip=set() + inter_nodes=re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+',node) + # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node) + if len(inter_nodes)!=0: + for inter_node in inter_nodes: + if len(re.findall(r'[-a-zA-Z]',inter_node)): + domain_sets = inter_node.split(".") + if len(domain_sets) > 2: + inter_node = ".".join(domain_sets[1:]) + if sender_domain: + edge_list.append(sender_domain+","+inter_node+",0")#发件域到中间域的边 + inter_domain_ip.add(inter_node) + node_list.add(inter_node + ",1") + else: + inter_domain_ip.add(inter_node) + node_list.add(inter_node+",2") + if len(inter_domain_ip): + inter_node_list.append(inter_domain_ip) + # print(node_list) + print(sender_domain) + print(inter_node_list) + for domain_ip_set in inter_node_list: + if len(domain_ip_set) > 1: + domain_ip_list=list(domain_ip_set) + for i in range(0,len(domain_ip_list)-1): + for j in range(i+1,len(domain_ip_list)): + edge_list.append(domain_ip_list[i]+","+domain_ip_list[j]+",1") + print(edge_list) + return node_list,edge_list + +def extract_sender_and_received(email_folder,node_file,edge_file): + with open(node_file, 'a+') as f: + f.write('index,name,type\n') + with open(edge_file,'a+') as edge_f: + edge_f.write('node1,node2,type\n') + node_list=set() + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + nodes,edges=extract_node_edge(email_folder + "/" + file) + node_list.update(nodes) + with open(edge_file,'a+') as edge_f: + for edge in edges: + edge_f.write(edge+"\n") + with open(node_file, 'a+',encoding="utf-8") as f: + i=0 + for node in node_list: + node=str(i)+','+node + f.write(node+"\n") + i+=1 + +def extract_domain_from_address(fromname): + sender_domain=None + if '@' in fromname: + sender_domain = fromname.split('@')[-1] + if '>' in sender_domain: + sender_domain = sender_domain.split(">")[0] + if ')' in sender_domain: + sender_domain = sender_domain.split(")")[0] + if ',' in sender_domain: + sender_domain = sender_domain.replace(",", " ") + return sender_domain + +def add_message_id_edge(email_folder,edge_file): + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + mail = parseEml(email_folder+"/"+file) + fromname = mail.get_from() + message_id=mail.get_message_id() + if '@' in fromname: + sender_domain = fromname.split('@')[-1] + if '>' in sender_domain: + sender_domain = sender_domain.split(">")[0] + if ')' in sender_domain: + 
sender_domain = sender_domain.split(")")[0] + if message_id != None: + message_id_domain=message_id.split('@')[-1] + message_id_domain=message_id_domain.split(">")[0] + if sender_domain != message_id_domain and sender_domain: + with open(edge_file, 'a+',encoding='utf-8') as edge_f: + edge_f.write(sender_domain+","+message_id_domain+",2\n") + +def add_x_mailer_edge(email_folder,edge_file): + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + mail = parseEml(email_folder+"/"+file) + fromname = mail.get_from() + x_mailer=mail.get_x_mailer() + if x_mailer: + x_mailer=x_mailer.replace("\n","") + x_mailer=x_mailer.replace(",","") + if '@' in fromname: + sender_domain = fromname.split('@')[-1] + if '>' in sender_domain: + sender_domain = sender_domain.split(">")[0] + if ')' in sender_domain: + sender_domain = sender_domain.split(")")[0] + if x_mailer != None and sender_domain: + with open(edge_file, 'a+',encoding="utf-8") as edge_f: + edge_f.write(sender_domain+","+x_mailer+",3\n") + +def add_dkim_edge(email_folder,edge_file): + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + mail = parseEml(email_folder+"/"+file) + fromname = mail.get_from() + dkim_signature=mail.get_dkim() + if dkim_signature: + dkim_signature=dkim_signature.replace("\n\t","") + dkim_domains=re.findall(r'd=(.+?);',dkim_signature) + if len(dkim_domains)==0: + # dkim_domain=dkim_domains[0] + print(dkim_signature) + else: + dkim_domain=dkim_domains[0] + if '@' in fromname: + sender_domain = fromname.split('@')[-1] + if '>' in sender_domain: + sender_domain = sender_domain.split(">")[0] + if ')' in sender_domain: + sender_domain = sender_domain.split(")")[0] + if sender_domain and sender_domain != dkim_domain: + with open(edge_file, 'a+', encoding="utf-8") as edge_f: + edge_f.write(sender_domain + "," + dkim_domain + ",4\n") + + +import csv +def add_nodes(node_file,edge_file,new_node_file): + nodes_set=set() + # 逐行读取csv文件 + with open(node_file, 'r', encoding="utf-8") as csvfile: + nodes = csv.DictReader(csvfile) + for node in nodes: + nodes_set.add(node["name"]+","+node["type"]) + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges=csv.DictReader(edgefile) + for edge in edges: + if edge["type"]=='2' or edge["type"]=='4': + nodes_set.add(edge["node2"]+","+str(1)) + else: + nodes_set.add(edge["node2"]+","+str(3)) + with open(new_node_file, 'a+',encoding="utf-8") as f: + f.write('index,name,type\n') + i = 0 + for new_node in nodes_set: + new_node = str(i) + ',' + new_node + f.write(new_node + "\n") + i += 1 + +def is_ip(str): + domain_and_ip=re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+',str) + domain=re.findall(r'[-a-zA-Z-]',str) + if len(domain_and_ip) and (len(domain)==0): + nums=str.split(".") + if len(nums)==4: + return True + return False + +import pandas as pd +def nodes_to_index(node_file,edge_file,new_edge_file): + nodes=pd.read_csv(node_file,encoding='utf-8') + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + if edge['type']=='0': + print("hi:"+edge['node1']) + node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==0)].index.tolist()[0] + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0] + elif edge['type']=='1':#注意区分域名和IP + if is_ip(edge['node1']): + node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==2)].index.tolist()[0] + else: + print(edge["node1"]) + 
node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==1)].index.tolist()[0] + if is_ip(edge['node2']): + print(edge["node2"]) + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==2)].index.tolist()[0] + else: + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0] + elif edge['type']=='2' or edge['type'] == '4': + node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0] + node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 1)].index.tolist()[0] + elif edge['type']=='3': + node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0] + node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 3)].index.tolist()[0] + edge_list.append(str(node1_index)+","+str(node2_index)+","+str(edge['type'])) + with open(new_edge_file, 'a+', encoding="utf-8") as f: + for new_edge in edge_list: + f.write(new_edge + "\n") + +def nodes_to_index_mes_id(node_file,edge_file,new_edge_file): + nodes=pd.read_csv(node_file,encoding='utf-8') + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + print(edge["node1"]) + node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==0)].index.tolist()[0] + if edge['type']=='2' or edge['type'] == '4': + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0] + elif edge['type']=='3': + node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 3)].index.tolist()[0] + edge_list.append(str(node1_index)+","+str(node2_index)+","+str(edge['type'])) + with open(new_edge_file, 'w', encoding="utf-8") as f: + for new_edge in edge_list: + f.write(new_edge + "\n") + +# 使用 graphviz创建元图. 
+# import pygraphviz as pgv +import json +def plot_graph(node_file,edge_file_fraud,edge_file_legi): + ag = pgv.AGraph(strict=False, directed=False,rankdir="LR") + with open(edge_file_fraud, 'r', encoding="utf-8") as edgefile_fraud: + reader=csv.reader(edgefile_fraud) + edges_fraud=[" ".join(row) for row in reader] + edge_count_fraud=pd.value_counts(edges_fraud).to_dict() + with open(edge_file_legi, 'r', encoding="utf-8") as edgefile_legi: + reader1=csv.reader(edgefile_legi) + edges_legi=[" ".join(row) for row in reader1] + edge_count_legi=pd.value_counts(edges_legi).to_dict() + with open(node_file, 'r', encoding="utf-8") as nodefile: + nodes = csv.DictReader(nodefile) + for node in nodes: + if node["type"] == '0': + ag.add_node(node["index"], label=node["name"], shape="box", color="blue") + # ag.add_node(node["index"], shape="box",color="blue") + # ag.add_node(node["index"], shape="point", color="blue") + elif node["type"] == '1': + ag.add_node(node["index"], label=node["name"], shape="ellipse") + # ag.add_node(node["index"], shape="ellipse") + # ag.add_node(node["index"], shape="point",color="green") + elif node["type"] == '2': + ag.add_node(node["index"], shape="point") + else: + ag.add_node(node["index"], label=node["name"], shape="diamond") + # ag.add_node(node["index"], shape="diamond") + # ag.add_node(node["index"], shape="point", color="purple") + for key in edge_count_fraud: + edge_param=key.split(" ") + ag.add_edge(edge_param[0],edge_param[1],label=edge_count_fraud[key],color="red") + for key in edge_count_legi: + edge_param=key.split(" ") + ag.add_edge(edge_param[0], edge_param[1], label=edge_count_legi[key]) + ag.layout('dot') + ag.draw('graph_dot.svg') + +def select_legi_emails(email_folder): + files = os.listdir(email_folder) + i=0 + for file in files: # 遍历文件夹 + if i<2483: + copyfile(email_folder + "/" + file,"datacon_1_legi_train/"+file) + if i>=2483 and i < 3725: + copyfile(email_folder + "/" + file,"datacon_1_legi_val/"+file) + if i>=3725: + copyfile(email_folder + "/" + file,"datacon_1_legi_test/"+file) + i += 1 + +def merge_node(node_file1,node_file2,new_node_file): + #合并两个node文件,统一索引 + nodes_set = set() + # 逐行读取csv文件 + with open(node_file1, 'r', encoding="utf-8") as csvfile: + nodes = csv.DictReader(csvfile) + for node in nodes: + nodes_set.add(node["name"] + "," + node["type"]) + with open(node_file2, 'r', encoding="utf-8") as nodefile2: + nodes2 = csv.DictReader(nodefile2) + for node2 in nodes2: + nodes_set.add(node2["name"] + "," + node2["type"]) + with open(new_node_file, 'a+', encoding="utf-8") as f: + f.write('index,name,type\n') + i = 0 + for new_node in nodes_set: + new_node = str(i) + ',' + new_node + f.write(new_node + "\n") + i += 1 + +import json + +def _str2tuple(key): + # 注意python切片 左开右闭 的性质 + fore = int(key[1:2]) + back = key[5: -2] + return tuple([fore, back]) + + +def one_email_to_graph(email_path,node_file,edge_file): + with open(node_file, 'r',encoding='UTF-8') as node_f: + node_dict = json.load(node_f) + node_dict = json.loads(node_dict) + node_dict = {_str2tuple(k): node_dict[k] for k in node_dict} + edge_list = [] + mail = parseEml(email_path) + raw_node_list = mail.get_from_host_list() + raw_node_list.insert(0, mail.get_from()) + # print(raw_node_list) + inter_node_list = [] + sender_domain = None + for node in raw_node_list: + if '@' in node: + node = node.split('@')[-1] + if '>' in node: + node = node.split(">")[0] + if ')' in node: + node = node.split(")")[0] + if ',' in node: + node = node.replace(",", " ") + sender_domain = node + if 
"monkey.org\n" in sender_domain: + print(email_path) + if (0,node) not in node_dict: + node_dict[(0,node)]=len(node_dict) + else: + inter_domain_ip = set() + inter_nodes = re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node) + # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node) + if len(inter_nodes) != 0: + for inter_node in inter_nodes: + if len(re.findall(r'[-a-zA-Z]', inter_node)): + domain_sets = inter_node.split(".") + if len(domain_sets) > 2: + inter_node = ".".join(domain_sets[1:]) + if (1, inter_node) not in node_dict: + node_dict[(1, inter_node)] = len(node_dict) + if sender_domain: + edge_list.append(str(node_dict[(0,sender_domain)]) + "," + str(node_dict[(1,inter_node)]) + ",0") # 发件域到中间域的边 + inter_domain_ip.add((1,inter_node)) + else: + inter_domain_ip.add((2,inter_node)) + if (2, inter_node) not in node_dict: + node_dict[(2, inter_node)] = len(node_dict) + if len(inter_domain_ip): + inter_node_list.append(inter_domain_ip) + # print(node_list) + # print(sender_domain) + # print(inter_node_list) + for domain_ip_set in inter_node_list: + if len(domain_ip_set) > 1: + domain_ip_list = list(domain_ip_set) + for i in range(0, len(domain_ip_list) - 1): + for j in range(i + 1, len(domain_ip_list)): + edge_list.append(str(node_dict[domain_ip_list[i]]) + "," + str(node_dict[domain_ip_list[j]]) + ",1") + print(edge_list) + + #message-id + message_id = mail.get_message_id() + if message_id != None: + message_id_domain = message_id.split('@')[-1] + message_id_domain = message_id_domain.split(">")[0] + if sender_domain != message_id_domain and sender_domain: + if (1,message_id_domain ) not in node_dict: + node_dict[(1, message_id_domain)] = len(node_dict) + edge_list.append(str(node_dict[(0,sender_domain)]) + "," + str(node_dict[(1,message_id_domain)]) + ",2") + + #x-mailer + x_mailer = mail.get_x_mailer() + if x_mailer: + x_mailer = x_mailer.replace("\n", "") + x_mailer = x_mailer.replace(",", "") + if x_mailer != None and sender_domain: + if (3, x_mailer) not in node_dict: + node_dict[(3, x_mailer)] = len(node_dict) + edge_list.append(str(node_dict[(0, sender_domain)]) + "," + str(node_dict[(3, x_mailer)]) + ",3") + + #dkim-domain + dkim_signature = mail.get_dkim() + if dkim_signature: + dkim_signature = dkim_signature.replace("\n\t", "") + dkim_domains = re.findall(r'd=(.+?);', dkim_signature) + if len(dkim_domains) == 0: + # dkim_domain=dkim_domains[0] + print(dkim_signature) + else: + dkim_domain = dkim_domains[0] + if sender_domain and sender_domain != dkim_domain: + if (1, dkim_domain) not in node_dict: + node_dict[(1, dkim_domain)] = len(node_dict) + edge_list.append(str(node_dict[(0, sender_domain)]) + "," + str(node_dict[(1, dkim_domain)]) + ",4") + + with open(node_file, 'w', encoding="utf-8") as f: + node_dict=json.dumps({str(k):node_dict[k] for k in node_dict}) + json.dump(node_dict,f) + with open(edge_file,'a+',encoding="utf-8") as edge_f: + for edge in edge_list: + edge_f.writelines(edge) + edge_f.writelines("\n") + +def email_batch_to_graph(email_folder,node_file,edge_file): + node_list = set() + with open(node_file, 'r', encoding="utf-8") as csvfile: + nodes = csv.DictReader(csvfile) + for node in nodes: + node_list.add(node["name"]+","+node["type"]) + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + nodes, edges = one_email_to_edges(email_folder + "/" + file) + node_list.update(nodes) + with open(edge_file, 'a+', encoding="utf-8") as edge_f: + for edge in edges: + edge_f.write(edge + "\n") + with open(node_file, 'w', 
encoding="utf-8") as f: + f.write("index,name,type\n") + i = 0 + for node in node_list: + node = str(i) + ',' +node + f.write(node + "\n") + i += 1 + +def one_email_to_edges(email_path): + node_set=set() + edge_list = [] + mail = parseEml(email_path) + raw_node_list = mail.get_from_host_list() + if raw_node_list == None: + raw_node_list=[] + if mail.get_from() != None: + # print(mail.get_from()) + raw_node_list.insert(0, mail.get_from()) + # print(raw_node_list) + inter_node_list = [] + sender_domain = None + for node in raw_node_list: + if '@' in node: + node = node.split('@')[-1] + if '>' in node: + node = node.split(">")[0] + if ')' in node: + node = node.split(")")[0] + if ',' in node: + node = node.replace(",", " ") + if '\n' in node: + node = node.replace("\n"," ") + sender_domain = node + # if "\n" in sender_domain: + # print(email_path) + node_set.add(node+",0") + else: + inter_domain_ip = set() + inter_nodes = re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node) + # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node) + if len(inter_nodes) != 0: + for inter_node in inter_nodes: + if len(re.findall(r'[-a-zA-Z]', inter_node)): + domain_sets = inter_node.split(".") + if len(domain_sets) > 2: + inter_node = ".".join(domain_sets[1:]) + node_set.add(inter_node+",1") + if sender_domain: + edge_list.append(sender_domain + "," + inter_node + ",0") # 发件域到中间域的边 + inter_domain_ip.add((1,inter_node)) + else: + inter_domain_ip.add((2,inter_node)) + node_set.add(inter_node+",2") + if len(inter_domain_ip): + inter_node_list.append(inter_domain_ip) + # print(node_list) + # print(sender_domain) + # print(inter_node_list) + for domain_ip_set in inter_node_list: + if len(domain_ip_set) > 1: + domain_ip_list = list(domain_ip_set) + for i in range(0, len(domain_ip_list) - 1): + for j in range(i + 1, len(domain_ip_list)): + edge_list.append(domain_ip_list[i][1] + "," + domain_ip_list[j][1] + ",1") + # print(edge_list) + + #message-id + message_id = mail.get_message_id() + if message_id != None: + message_id_domain = message_id.split('@')[-1] + message_id_domain = message_id_domain.split(">")[0] + if sender_domain != message_id_domain and sender_domain: + node_set.add(message_id_domain+",1") + edge_list.append(sender_domain + "," + message_id_domain + ",2") + + #x-mailer + x_mailer = mail.get_x_mailer() + if x_mailer: + x_mailer = x_mailer.replace("\n", "") + x_mailer = x_mailer.replace(",", "") + if x_mailer != None and sender_domain: + node_set.add(x_mailer+",3") + edge_list.append(sender_domain + "," + x_mailer + ",3") + + #dkim-domain + dkim_signature = mail.get_dkim() + if dkim_signature: + dkim_signature = dkim_signature.replace("\n\t", "") + dkim_domains = re.findall(r'd=(.+?);', dkim_signature) + if len(dkim_domains) == 0: + # dkim_domain=dkim_domains[0] + print(dkim_signature) + else: + dkim_domain = dkim_domains[0] + if sender_domain and sender_domain != dkim_domain: + node_set.add(dkim_domain+",1") + edge_list.append(sender_domain + "," + dkim_domain + ",4") + return node_set,edge_list + +def split_training_nodes(node_file,edge_file): + node_dataframe_all=pd.read_csv(node_file,encoding="utf-8") + edge_dataframe_all=pd.read_csv(edge_file,encoding="utf-8") + nodes_list=edge_dataframe_all["node1"].tolist() + nodes_list+=edge_dataframe_all["node2"].tolist() + nodes_set=set(nodes_list) + print(len(nodes_set)) + training_nodes=node_dataframe_all[node_dataframe_all["index"].isin(nodes_list)] + # training_nodes.to_csv("training_nodes.csv",index=False) + +def 
add_testing_nodes(node_file1,node_file2,added_nodes_file): + nodes_set = set() + new_node_dict={} + # 逐行读取csv文件 + with open(node_file1, 'r', encoding="utf-8") as nodefile1: + nodes = csv.DictReader(nodefile1) + for node in nodes: + training_node=node["name"] + "," + node["type"] + # if training_node in nodes_set: + # print(training_node) + nodes_set.add(training_node) + with open(node_file2, 'r', encoding="utf-8") as nodefile2: + nodes2 = csv.DictReader(nodefile2) + for node2 in nodes2: + test_node=node2["name"]+","+node2["type"] + if test_node in nodes_set: + continue + new_node_dict[len(nodes_set)]=test_node + nodes_set.add(test_node) + with open(added_nodes_file, 'w', encoding="utf-8") as f: + f.write("index,name,type\n") + for key in new_node_dict: + node = str(key) + ',' +new_node_dict[key] + f.write(node + "\n") + +if __name__ == "__main__": + # select_legi_emails("datacon_1_legitimate") + # extract_sender_and_received("datacon_1_fraud","datacon_fraud_graph/nodes.csv","datacon_fraud_graph/edges.csv") + # add_message_id_edge("datacon_1_fraud","datacon_fraud_graph/edges_other.csv") + # add_x_mailer_edge("datacon_1_fraud","datacon_fraud_graph/edges_other.csv") + # add_dkim_edge("datacon_1_fraud","datacon_fraud_graph/edges_other.csv") + # add_nodes("datacon_fraud_graph/nodes.csv","datacon_fraud_graph/edges_other.csv","datacon_fraud_graph/nodes_all.csv") + # merge_node("datacon_legitimate_graph/nodes_all.csv","all_nodes.csv","all_nodes1.csv") + # nodes_to_index("all_nodes.csv","legi_edges_testing.csv","legi_edges_testing_index_only.csv") + # nodes_to_index_mes_id("all_nodes.csv","datacon_legitimate_graph/edges_other.csv","datacon_legitimate_graph/edges_index_only.csv") + # plot_graph("all_nodes.csv","fraud_edges_index_only.csv","datacon_legitimate_graph/legi_edges_index_only.csv") + # one_email_to_graph("nazario_phishing_2020/2.eml","all_nodes.json","all_edges.csv") + email_batch_to_graph("benign_emails","all_nodes1.csv","benign_edges.csv") + # split_training_nodes("all_nodes.csv","edges_training_index_only.csv") + # add_testing_nodes("training_nodes1.csv","testing_nodes.csv","indexed_testing_nodes.csv")
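For reference, buildGraph.py's encoding (declared at the top of extract_node_edge) is: node types 0 = sender domain, 1 = intermediate domain, 2 = IP, 3 = client/X-Mailer, and edge types 0-4 for the different header relations. A purely illustrative trace with made-up header values (nothing from the datasets), ignoring the Message-ID/X-Mailer/DKIM edges:

```python
# Illustrative only: rows extract_node_edge would emit for a message whose
# From: domain is example.com and whose single Received: hop mentions
# mail.example.net [192.0.2.1].  "mail.example.net" is shortened to its
# registered domain because it has more than two labels.
nodes = {"example.com,0",        # sender domain
         "example.net,1",        # intermediate domain
         "192.0.2.1,2"}          # intermediate IP
edges = ["example.com,example.net,0",   # sender domain -> intermediate domain
         "example.net,192.0.2.1,1"]     # domain <-> IP seen in the same hop (pair order is arbitrary)
```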
\ No newline at end of file diff --git a/code/buildGraph2.py b/code/buildGraph2.py new file mode 100644 index 0000000..b994ec8 --- /dev/null +++ b/code/buildGraph2.py @@ -0,0 +1,106 @@ +import re +from parseEml import parseEml +import csv,os + +def one_email_to_edges(email_path,email_num): + node_set=set() + edge_list = [] + mail = parseEml(email_path) + raw_node_list = mail.get_from_host_list() + if raw_node_list == None: + raw_node_list=[] + raw_node_list.append("email"+str(email_num)) + email_node="email"+str(email_num) + node_set.add(email_node+",0") + other_node_set=set() + inter_node_list = [] + for node in raw_node_list: + inter_nodes = re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node) + # inter_nodes=re.findall(r'\d+(\.\d+){1,}',node) + if len(inter_nodes) != 0: + for inter_node in inter_nodes: + if len(re.findall(r'[-a-zA-Z]', inter_node)): + domain_sets = inter_node.split(".") + if len(domain_sets) > 2: + inter_node = ".".join(domain_sets[1:]) + node_set.add(inter_node+",1") + other_node_set.add(inter_node) + else: + node_set.add(inter_node+",2") + other_node_set.add(inter_node) + + #message-id + message_id = mail.get_message_id() + if message_id != None: + message_id_domain = message_id.split('@')[-1] + message_id_domain = message_id_domain.split(">")[0] + node_set.add(message_id_domain+",1") + other_node_set.add(message_id_domain) + + #x-mailer + x_mailer = mail.get_x_mailer() + if x_mailer: + x_mailer = x_mailer.replace("\n", "") + x_mailer = x_mailer.replace(",", "") + if x_mailer != None: + node_set.add(x_mailer+",3") + other_node_set.add(x_mailer) + + #dkim-domain + dkim_signature = mail.get_dkim() + if dkim_signature: + dkim_signature = dkim_signature.replace("\n\t", "") + dkim_domains = re.findall(r'd=(.+?);', dkim_signature) + if len(dkim_domains) == 0: + # dkim_domain=dkim_domains[0] + print(dkim_signature) + else: + dkim_domain = dkim_domains[0] + node_set.add(dkim_domain+",1") + other_node_set.add(dkim_domain) + for other_node in other_node_set: + edge_list.append(email_node + "," + other_node) + return node_set,edge_list + +def email_batch_to_graph(email_folder,node_file,edge_file): + node_list = set() + with open(node_file, 'r', encoding="utf-8") as csvfile: + nodes = csv.DictReader(csvfile) + for node in nodes: + node_list.add(node["name"]+","+node["type"]) + files = os.listdir(email_folder) + email_num=2010 + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + nodes, edges = one_email_to_edges(email_folder + "/" + file,email_num) + node_list.update(nodes) + with open(edge_file, 'a+', encoding="utf-8") as edge_f: + for edge in edges: + edge_f.write(edge + "\n") + email_num+=1 + with open(node_file, 'w', encoding="utf-8") as f: + f.write("index,name,type\n") + i = 0 + for node in node_list: + node = str(i) + ',' +node + f.write(node + "\n") + i += 1 + +import pandas as pd +def nodes_to_index(node_file,edge_file,new_edge_file): + nodes=pd.read_csv(node_file,encoding='utf-8') + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + node1_index=nodes[(nodes['name']==edge['node1'])].index.tolist()[0] + node2_index = nodes[(nodes['name'] == edge['node2'])].index.tolist()[0] + edge_list.append(str(node1_index)+","+str(node2_index)) + with open(new_edge_file, 'a+', encoding="utf-8") as f: + for new_edge in edge_list: + f.write(new_edge + "\n") + +if __name__=="__main__": + # email_batch_to_graph("datacon_1_legitimate","hunter_node.csv","hunter_edge.csv") + 
nodes_to_index("hunter_node.csv","hunter_edge.csv","hunter_edge_index_only.csv")
\ No newline at end of file
diff --git a/code/buildGraphviz.py b/code/buildGraphviz.py
new file mode 100644
index 0000000..cf43159
--- /dev/null
+++ b/code/buildGraphviz.py
@@ -0,0 +1,24 @@
+from graphviz import Digraph
+
+# Instantiate a Digraph (directed graph); name: name of the generated image, format: output image format
+dot = Digraph(name="MyPicture", comment="the test", format="png")
+
+# Create nodes; name: identifier of the node object, label: text shown on the node, color: outline color
+dot.node(name='a', label='Ming', color='green')
+dot.node(name='b', label='Hong', color='yellow')
+dot.node(name='c', label='Dong')
+
+# Draw an edge between two nodes; label: text shown on the edge, color: edge color
+dot.edge('a', 'b', label="ab\na-b", color='red')
+# Draw several edges at once: c->b and a->c
+dot.edges(['cb', 'ac'])
+
+# Print the generated DOT source
+print(dot.source)
+
+# Render and open the graph; filename: output name (defaults to the Digraph's name, saved with a .gv suffix)
+# directory: where to save the output, defaults to the current directory
+dot.view(filename="mypicture")
+
+# render() works like view() (use one or the other); render() is the usual way to just generate the image, view/view=True is handy for debugging
+# dot.render(filename='MyPicture')
\ No newline at end of file diff --git a/code/buildSubGraph.py b/code/buildSubGraph.py new file mode 100644 index 0000000..e68cc82 --- /dev/null +++ b/code/buildSubGraph.py @@ -0,0 +1,72 @@ +from buildGraph import one_email_to_edges,is_ip +import csv +import os +import pandas as pd + +def nodes_to_index(node_file,edge_file,new_edge_file): + print(str(new_edge_file)) + nodes=pd.read_csv(node_file,encoding='utf-8') + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + if edge['type']=='0': + print("hi:"+edge['node1']) + node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==0)].index.tolist()[0] + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0] + elif edge['type']=='1':#注意区分域名和IP + if is_ip(edge['node1']): + print(edge['node1']) + node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==2)].index.tolist()[0] + else: + print(edge["node1"]) + node1_index=nodes[(nodes['name']==edge['node1']) & (nodes['type']==1)].index.tolist()[0] + if is_ip(edge['node2']): + print(edge["node2"]) + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==2)].index.tolist()[0] + else: + node2_index=nodes[(nodes['name']==edge['node2']) & (nodes['type']==1)].index.tolist()[0] + elif edge['type']=='2' or edge['type'] == '4': + node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0] + node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 1)].index.tolist()[0] + elif edge['type']=='3': + node1_index = nodes[(nodes['name'] == edge['node1']) & (nodes['type'] == 0)].index.tolist()[0] + node2_index = nodes[(nodes['name'] == edge['node2']) & (nodes['type'] == 3)].index.tolist()[0] + edge_list.append(str(node1_index)+"\t"+str(node2_index)) + with open(new_edge_file, 'w', encoding="utf-8") as f: + for new_edge in edge_list: + f.writelines(new_edge + "\n") + +def email_batch_to_subgraph(email_folder,node_file,graph_folder): + # node_dict = {} + # with open(node_file, 'r', encoding="utf-8") as csvfile: + # nodes = csv.DictReader(csvfile) + # for node in nodes: + # node_dict[node["name"]+","+node["type"]]=node["index"] + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + nodes, edges = one_email_to_edges(email_folder + "/" + file) + with open(graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')+".csv", 'w', encoding="utf-8") as edge_f: + edge_f.write("node1,node2,type\n") + for edge in edges: + edge_f.write(edge + "\n") + nodes_to_index(node_file,graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')+".csv",graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')) + os.remove(graph_folder+"/"+email_folder+"_"+file.replace(".eml",'')+".csv") + +def find_differ_inter_domain(email_folder,inter_domain_file): + files = os.listdir(email_folder) + for file in files: # 遍历文件夹 + if file == "duplicate": + continue + nodes, edges = one_email_to_edges(email_folder + "/" + file) + for edge in edges: + edge_part=edge.split(",") + if(edge_part[2]==("0" or "2" or "4")): + if(edge_part[0]!=edge_part[1]): + with open(inter_domain_file,'a+',encoding="utf-8") as f: + f.write(edge_part[1]+"\n"); + +if __name__=="__main__": + find_differ_inter_domain("nazario_phishing_2021","inter_domain.txt")
\ No newline at end of file
diff --git a/code/doStatistics.py b/code/doStatistics.py
new file mode 100644
index 0000000..ef455f5
--- /dev/null
+++ b/code/doStatistics.py
@@ -0,0 +1,25 @@
+import csv
+import pandas as pd
+
+def edge_count(edge_file):
+    with open(edge_file, 'r', encoding="utf-8") as edgefile:
+        reader=csv.reader(edgefile)
+        edges_fraud=[" ".join(row) for row in reader]
+        edge_count_fraud=pd.value_counts(edges_fraud).to_dict()
+
+def count_multi_set_nodes(node_file1,node_file2):
+    # Merge two node files and unify the index
+    nodes_set = set()
+    # Read the csv files row by row
+    with open(node_file1, 'r', encoding="utf-8") as csvfile:
+        nodes = csv.DictReader(csvfile)
+        for node in nodes:
+            nodes_set.add(node["name"] + "," + node["type"])
+    with open(node_file2, 'r', encoding="utf-8") as nodefile2:
+        nodes2 = csv.DictReader(nodefile2)
+        for node2 in nodes2:
+            nodes_set.add(node2["name"] + "," + node2["type"])
+    print(len(nodes_set))
+
+if __name__ == "__main__":
+    count_multi_set_nodes("datacon_fraud_graph/nodes_all.csv","nazario_2021_graph/nodes_all.csv")
\ No newline at end of file diff --git a/code/drawGraph.py b/code/drawGraph.py new file mode 100644 index 0000000..5650145 --- /dev/null +++ b/code/drawGraph.py @@ -0,0 +1,81 @@ +import networkx as nx +import csv +import pandas as pd +import dgl + +def read_graph_networkx(node_file,edge_file): + G = nx.Graph() + node_list=[] + edge_list=[] + with open(node_file, 'r', encoding="utf-8") as csvfile: + nodes = csv.DictReader(csvfile) + for node in nodes: + node_list.append((node["index"],{"label":node["name"]})) + G.add_nodes_from(node_list) + with open(edge_file, 'r', encoding="utf-8") as edgefile: + reader1 = csv.reader(edgefile) + edges = [" ".join(row) for row in reader1] + edge_count = pd.value_counts(edges).to_dict() + for key in edge_count: + edge_param=key.split(" ") + edge_list.append((edge_param[0],edge_param[1])) + G.add_edges_from(edge_list) + print(G) + + # nx.draw(G,font_size=20) + # plt.show() + + subgraphs=nx.connected_components(G) + for c in sorted(subgraphs,key=len,reverse=True): + if len(c)<=10: + break + print(G.subgraph(c)) + +# nx.draw(largest_connected_subgraph,with_labels=True) + +def read_graph_dgl(node_file,edge_file_fraud,edge_file_legi): + graph_dict={} + node_dict={} + with open(node_file, 'r', encoding="utf-8") as nodefile: + nodes = csv.DictReader(nodefile) + for node in nodes: + if node["type"] == '0': + node_dict[node["index"]]="sender_domain" + elif node["type"] == '1': + node_dict[node["index"]]="inter_domain" + elif node["type"] == '2': + node_dict[node["index"]]="IP" + else: + node_dict[node["index"]]="client" + + with open(edge_file_fraud, 'r', encoding="utf-8") as edgefile: + reader1 = csv.reader(edgefile) + edges = [" ".join(row) for row in reader1] + edge_count = pd.value_counts(edges).to_dict() + for key in edge_count: + edge_param=key.split(" ") + if graph_dict[(node_dict[edge_param[0]],"fraud",node_dict[edge_param[1]])]: + graph_dict[(node_dict[edge_param[0]], "fraud", node_dict[edge_param[1]])].append((edge_param[0],edge_param[1])) + else: + graph_dict[(node_dict[edge_param[0]], "fraud", node_dict[edge_param[1]])]=[(edge_param[0],edge_param[1])] + # 字典的每个值都是一个元组的列表。 + # 节点是从零开始的整数ID。 不同类型的节点ID具有单独的计数。 + ratings = dgl.heterograph(graph_dict) + +import torch as th +def test(): + # 边 0->1, 0->2, 0->3, 1->3 + u, v = th.tensor(["qq.com","113.108.11.234","qq.com"]), th.tensor(["qq.com", "qq.com","127.0.0.1"]) + g = dgl.graph((u, v)) + print(g) # 图中节点的数量是DGL通过给定的图的边列表中最大的点ID推断所得出的 + # 获取节点的ID + print(g.nodes()) + # 获取边的对应端点 + print(g.edges()) + # 获取边的对应端点和边ID + print(g.edges(form='all')) + +if __name__=="__main__": + # read_graph_dgl("all_nodes.csv","fraud_edges_index_only.csv","") + # read_graph_networkx("all_nodes.csv","all_edges_index_only.csv") + test()
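Note that the `test()` helper at the bottom of drawGraph.py passes domain strings straight to `th.tensor`, which only accepts numeric data, and `dgl.graph` expects integer node IDs, so that snippet raises as written. A minimal corrected sketch (the ID mapping is illustrative):

```python
import dgl
import torch as th

# Map each domain/IP string to an integer ID first; DGL graphs are built from integer IDs.
name_to_id = {"qq.com": 0, "113.108.11.234": 1, "127.0.0.1": 2}
u = th.tensor([name_to_id["qq.com"], name_to_id["113.108.11.234"], name_to_id["qq.com"]])
v = th.tensor([name_to_id["qq.com"], name_to_id["qq.com"], name_to_id["127.0.0.1"]])
g = dgl.graph((u, v))
print(g.nodes())            # node IDs 0..2, inferred from the largest ID used
print(g.edges(form='all'))  # endpoints plus edge IDs
```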
\ No newline at end of file diff --git a/code/edgeClassification.py b/code/edgeClassification.py new file mode 100644 index 0000000..abc7af3 --- /dev/null +++ b/code/edgeClassification.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- +"""edgeClassification.ipynb + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1knU85gMIEeId8DL4gw9VMij_niMggRn6 +""" + +import dgl +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import dgl.function as fn +import dgl.nn.pytorch as dglnn +import time +import argparse +from dgl.data import * +import tqdm +from sklearn import metrics + +import warnings +warnings.filterwarnings('ignore') + +src = np.random.randint(0, 100, 500) +dst = np.random.randint(0, 100, 500) +# 同时建立反向边 +edge_pred_graph = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src]))) +# 建立点和边特征,以及边的标签 +edge_pred_graph.ndata['feature'] = th.randn(100, 10) +edge_pred_graph.edata['feature'] = th.randn(1000, 10) +edge_pred_graph.edata['label'] = th.randn(1000) +# 进行训练、验证和测试集划分 +edge_pred_graph.edata['train_mask'] = th.zeros(1000, dtype=th.bool).bernoulli(0.6) + +import dgl.function as fn +class DotProductPredictor(nn.Module): + def forward(self, graph, h): + # h是计算出的节点表示 + with graph.local_scope(): + graph.ndata['h'] = h + graph.apply_edges(fn.u_dot_v('h', 'h', 'score')) + return graph.edata['score'] + +class TwoLayerGCN(nn.Module): + def __init__(self, in_dim, hid_dim, out_dim): + """两层的GCN模型""" + super().__init__() + self.conv1 = dglnn.GraphConv(in_dim, hid_dim, allow_zero_in_degree=True) + self.conv2 = dglnn.GraphConv(hid_dim, out_dim, allow_zero_in_degree=True) + + def forward(self, blocks, x): + x = F.relu(self.conv1(blocks[0], x)) + x = F.relu(self.conv2(blocks[1], x)) + return x + +class SAGE(nn.Module): + def __init__(self, in_feats, hid_feats, out_feats): + super().__init__() + # 实例化SAGEConve,in_feats是输入特征的维度,out_feats是输出特征的维度,aggregator_type是聚合函数的类型 + self.conv1 = dglnn.SAGEConv( + in_feats=in_feats, out_feats=hid_feats, aggregator_type='mean') + self.conv2 = dglnn.SAGEConv( + in_feats=hid_feats, out_feats=out_feats, aggregator_type='mean') + + def forward(self, graph, inputs): + # 输入是节点的特征 + h = self.conv1(graph, inputs) + h = F.relu(h) + h = self.conv2(graph, h) + return h + +class Model(nn.Module): + def __init__(self, in_features, hidden_features, out_features): + super().__init__() + self.sage=SAGE(in_features,hidden_features,out_features) + self.pred = DotProductPredictor() + def forward(self, g, x): + h = self.sage(g,x) + return self.pred(g, h) + +node_features = edge_pred_graph.ndata['feature'] +edge_label = edge_pred_graph.edata['label'] +train_mask = edge_pred_graph.edata['train_mask'] +model = Model(10, 20, 5) +opt = th.optim.Adam(model.parameters()) +for epoch in range(10): + pred = model(edge_pred_graph, node_features) + loss = ((pred[train_mask] - edge_label[train_mask]) ** 2).mean() + opt.zero_grad() + loss.backward() + opt.step() + print(loss.item()) + +argparser = argparse.ArgumentParser("edge classification training") +argparser.add_argument('--num-epochs', type=int, default=30) +argparser.add_argument('--num-hidden', type=int, default=64) +argparser.add_argument('--num-layers', type=int, default=2) +argparser.add_argument('--fan-out', type=str, default='10,25') ## 第一层随机采样10个,第二层采样25个 +argparser.add_argument('--batch-size', type=int, default=2048) +argparser.add_argument('--lr', type=float, default=0.005) 
+argparser.add_argument('--dropout', type=float, default=0.5) +argparser.add_argument('--num-workers', type=int, default=0, + help="Number of sampling processes. Use 0 for no extra process.") +args = argparser.parse_args([]) + +# load wordnet data +data = WN18Dataset() +n_classes = data.num_rels ## 关系数量,也就是边分类的分类数 +g = data[0] +## 训练集、验证集、测试集 +train_edge_mask = g.edata.pop('train_edge_mask') +val_edge_mask = g.edata.pop('valid_edge_mask') +test_edge_mask = g.edata.pop('test_edge_mask') + +# Pack data +data = train_edge_mask, val_edge_mask, test_edge_mask, n_classes, g +print('\n', g) +n_edges = g.num_edges() # 图中边的数量 +labels = g.edata['etype'] # 图中所有边的标签 + + +train_edge_mask, val_edge_mask, test_edge_mask, n_classes, g = data +print('\n', train_edge_mask.sum(), val_edge_mask.sum(), test_edge_mask.sum()) + +## train, valid, test 边的id列表 +train_eid = th.LongTensor(np.nonzero(train_edge_mask)).squeeze() +val_eid = th.LongTensor(np.nonzero(val_edge_mask)).squeeze() +test_eid = th.LongTensor(np.nonzero(test_edge_mask)).squeeze() +print(train_eid.shape, val_eid.shape, test_eid.shape) + +# Create sampler +sampler = dgl.dataloading.MultiLayerNeighborSampler( + [int(fanout) for fanout in args.fan_out.split(',')]) +dataloader = dgl.dataloading.EdgeDataLoader( + g, train_eid, sampler, + exclude='reverse_id', # 去除反向边,否则模型可能知道存在边的联系,导致模型“作弊” + # For each edge with ID e in dataset, the reverse edge is e ± |E|/2. + reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]), + batch_size=args.batch_size, + shuffle=True, + drop_last=False, + num_workers=args.num_workers) + +## For debug +for dd in dataloader: + """分别是input_nodes, edge_subgraph,blocks""" + for xx in dd: + print(xx) + print() + break + +class Predictor(nn.Module): + """边预测模块,将边两端节点表示拼接,然后接一个线性变换,得到最后的分类表示输出""" + + def __init__(self, in_dim, num_classes): + super().__init__() + self.W = nn.Linear(2 * in_dim, num_classes) + + def apply_edges(self, edges): + data = th.cat([edges.src['x'], edges.dst['x']], dim=-1) + return {'score': self.W(data)} + + def forward(self, edge_subgraph, x): + with edge_subgraph.local_scope(): + edge_subgraph.ndata['x'] = x + edge_subgraph.apply_edges(self.apply_edges) + return edge_subgraph.edata['score'] + + +class MyModel(nn.Module): + """主模型:结构比较清晰""" + + def __init__(self, emb_dim, hid_dim, out_dim, num_classes, num_nodes): + super().__init__() + self.node_emb = nn.Embedding(num_nodes, emb_dim) + self.gcn = TwoLayerGCN(emb_dim, hid_dim, out_dim) + self.predictor = Predictor(out_dim, num_classes) + + def forward(self, edge_subgraph, blocks, input_nodes): + x = self.node_emb(input_nodes) + x = self.gcn(blocks, x) + return self.predictor(edge_subgraph, x) + +#实例化模型,并定义预测函数 +device = th.device("cuda" if th.cuda.is_available() else "cpu") +print("device: {}".format(device)) + +model = MyModel(64, args.num_hidden, args.num_hidden, 18, g.num_nodes()) +model = model.to(device) +loss_fcn = nn.CrossEntropyLoss() # 交叉熵损失 +optimizer = optim.Adam(model.parameters(), lr=args.lr) + + +def predict(model, g, valid_eid, device): + # Create sampler(全采样) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2) + dataloader = dgl.dataloading.EdgeDataLoader( + g, valid_eid, sampler, exclude='reverse_id', + # For each edge with ID e in dataset, the reverse edge is e ± |E|/2. 
+ reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]), + batch_size=args.batch_size, + shuffle=False, + drop_last=False, + num_workers=args.num_workers) + + valid_preds = [] + model.eval() + with th.no_grad(): + for input_nodes, edges_subgraph, blocks in dataloader: + edges_subgraph = edges_subgraph.to(device) + blocks = [block.int().to(device) for block in blocks] + pred = model(edges_subgraph, blocks, input_nodes) + pred = pred.cpu().argmax(-1).numpy().tolist() + valid_preds.extend(pred) + return valid_preds + +#训练模型 +best_val_acc = 0 # 记录验证集上的最好效果 +patience = 0 # For early stopping + +# Training loop +for epoch in range(args.num_epochs): + # Loop over the dataloader to sample the computation dependency graph as a list of + # blocks. + start_time = time.time() + all_loss = [] + trn_label, trn_pred = [], [] + n_batches = len(dataloader) + + for step, (input_nodes, edges_subgraph, blocks) in enumerate(dataloader): + edges_subgraph = edges_subgraph.to(device) + blocks = [block.to(device) for block in blocks] + + # Compute loss and prediction + edge_preds = model(edges_subgraph, blocks, input_nodes) + loss = loss_fcn(edge_preds, edges_subgraph.edata['etype']) # or labels[edges_subgraph.edata['_ID']] + optimizer.zero_grad() + loss.backward() + optimizer.step() + + all_loss.append(loss.item()) + trn_label.extend(edges_subgraph.edata['etype'].cpu().numpy().tolist()) + trn_pred.extend(edge_preds.argmax(-1).detach().cpu().numpy().tolist()) + + if (step + 1) % (n_batches // 10) == 0: + cur_acc = metrics.accuracy_score(trn_label, trn_pred) + print('Epoch {:2d} | Step {:04d}/{} | Loss {:.4f} | Avg Loss {:.4f} | Acc {:.4f} | Time {:.2f}s'.format( + epoch + 1, step, n_batches, loss.item(), np.mean(all_loss), cur_acc, time.time() - start_time)) + + ## 验证集预测 + val_preds = predict(model, g, val_eid, device) + val_acc = metrics.accuracy_score(labels[val_eid], val_preds) + + if val_acc > best_val_acc: + best_val_acc = val_acc + patience = 0 + th.save(model.state_dict(), "edge_cls_best_model.bin") + else: + patience += 1 + print('Cur Val Acc {:.4f}, Best Val Acc {:.4f}, Time {:.2f}s'.format( + val_acc, best_val_acc, time.time() - start_time)) + + ## earlystopping,如果验证集效果连续三次以上都没上升,直接停止训练 + if patience > 3: + break
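Both training scripts exclude reverse edges from the sampled blocks via `exclude='reverse_id'`. This works because the edge list is built by concatenating the forward edges with their reverses, so edge e and its reverse always sit exactly |E|/2 apart. A small self-contained check of that invariant (toy numbers, not the email graph):

```python
import numpy as np
import torch as th
import dgl

src = np.array([0, 1, 2])
dst = np.array([1, 2, 0])
# Forward edges first, then all of their reverses: edge e and edge e +/- |E|/2 form a pair.
g = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src])))
n_edges = g.num_edges()  # 6
reverse_eids = th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)])
u, v = g.edges()
for e in range(n_edges):
    r = reverse_eids[e]
    assert u[e] == v[r] and v[e] == u[r]  # the mapped edge really is the flipped pair
```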
\ No newline at end of file diff --git a/code/edgeClassificationEmailGraph.py b/code/edgeClassificationEmailGraph.py new file mode 100644 index 0000000..d5b4c80 --- /dev/null +++ b/code/edgeClassificationEmailGraph.py @@ -0,0 +1,297 @@ +# -*- coding: utf-8 -*- +"""edgeClassification.ipynb + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1knU85gMIEeId8DL4gw9VMij_niMggRn6 +""" + +# from google.colab import drive +# drive.mount('/content/drive') +# +# !pip install dgl==0.6.1 + +import dgl +import numpy as np +import pandas as pd +import torch as th +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import dgl.function as fn +import dgl.nn.pytorch as dglnn +import time +import argparse +from dgl.data import * +import tqdm +from sklearn import metrics + +import warnings +warnings.filterwarnings('ignore') + +argparser = argparse.ArgumentParser("edge classification training") +argparser.add_argument('--num-epochs', type=int, default=30) +argparser.add_argument('--num-hidden', type=int, default=64) +argparser.add_argument('--num-layers', type=int, default=2) +argparser.add_argument('--fan-out', type=str, default='10,25') ## 第一层随机采样10个,第二层采样25个 +argparser.add_argument('--batch-size', type=int, default=2048) +argparser.add_argument('--lr', type=float, default=0.005) +argparser.add_argument('--dropout', type=float, default=0.5) +argparser.add_argument('--num-workers', type=int, default=0, + help="Number of sampling processes. Use 0 for no extra process.") +args = argparser.parse_args([]) + +import nltk +nltk.download('punkt') +from nltk.tokenize import word_tokenize + +nodes=pd.read_csv("training_nodes1.csv") +node_names=np.array(nodes)[:,1] +print(len(node_names)) +# Tokenization of each name +tokenized_node_names = [] +for node_name in node_names: + tokenized_node_names.append(word_tokenize(node_name.lower())) +print(tokenized_node_names) + +from gensim.models.doc2vec import Doc2Vec, TaggedDocument +tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_node_names)] +model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100) +''' +vector_size = Dimensionality of the feature vectors. +window = The maximum distance between the current and predicted word within a sentence. +min_count = Ignores all words with total frequency lower than this. +alpha = The initial learning rate. 
+''' + +node_text_feature_list=[] +for text in tokenized_node_names: + node_text_feature_list.append(model.infer_vector(text)) +print(node_text_feature_list) + +#将节点类型先归一化,然后添加到节点特征中 +node_types=np.array(nodes)[:,2] +type_feature_list=[] +for type_value in node_types: + type_value=int(type_value) + type_value=type_value/3 #min-max归一化 + type_feature_list.append(type_value) +print(type_feature_list) +print(len(type_feature_list)) + +for i in range(0,len(node_text_feature_list)): + node_text_feature_list[i]=np.append(node_text_feature_list[i],type_feature_list[i]) +node_feature_array=node_text_feature_list[0] +for i in range(1,len(node_text_feature_list)): + node_feature_array=np.vstack((node_feature_array,node_text_feature_list[i])) +print(node_feature_array) + +edge_classes = 2 ## 关系数量,也就是边分类的分类数 +fraud_edges=pd.read_csv("fraud_edges_training_index_only.csv") +src=np.array(fraud_edges)[:,0] +dst=np.array(fraud_edges)[:,1] +legi_edges=pd.read_csv("legi_edges_training_index_only.csv") +src=np.concatenate([src,np.array(legi_edges)[:,0]]) +dst=np.concatenate([dst,np.array(legi_edges)[:,1]]) +print(len(src)) +print(len(dst)) +# 同时建立反向边 +email_graph = dgl.graph((np.concatenate([src, dst]), np.concatenate([dst, src]))) +# 建立点和边特征,以及边的标签 +email_graph.ndata['feature'] = th.tensor(node_feature_array) +print(email_graph.ndata['feature']) +fraud_type_array=np.array(fraud_edges)[:,2] +fraud_label_array=np.ones((len(fraud_type_array)))#欺诈为1 +legi_type_array=np.array(legi_edges)[:,2] +type_array=np.concatenate([fraud_type_array,legi_type_array]) +legi_label_array=np.zeros((len(legi_type_array)))#良性为0 +label_array=np.concatenate([fraud_label_array,legi_label_array]) +#为反向边也加上type,label +type_array=np.concatenate([type_array,type_array]) +label_array=np.concatenate([label_array,label_array]) +print(len(label_array)) +edge_feature_array=[] +#边类型归一化 +for edge_type in type_array: + edge_type=edge_type/4 + edge_feature_array.append(edge_type) +# print(len(edge_feature_array)) +email_graph.edata['feature'] = th.tensor(edge_feature_array) +print(email_graph.edata['feature']) +email_graph.edata['label'] = th.LongTensor(label_array) +print(email_graph.edata['label']) +print(email_graph) + +# 随机进行训练、验证和测试集划分 6:2:2 +train_len=int(email_graph.number_of_edges() * 0.6) +val_len=int(email_graph.number_of_edges() * 0.2) +test_len=email_graph.number_of_edges()-train_len-val_len +pre_train=np.zeros((train_len)) +pre_val=np.ones((val_len)) +pre_test=np.linspace(2,100,test_len) +pre_split=np.concatenate([np.concatenate([pre_train,pre_val]),pre_test]) +print(pre_split) +np.random.shuffle(pre_split) +print(pre_split) +train_id_list,val_id_list,test_id_list=[],[],[] +for i in range(0,len(pre_split)): + if pre_split[i]==0: + train_id_list.append(i) + elif pre_split[i]==1: + val_id_list.append(i) + else: + test_id_list.append(i) +train_id,val_id,test_id=th.tensor(train_id_list),th.tensor(val_id_list),th.tensor(test_id_list) +print(train_id.shape,val_id.shape,test_id.shape) + +#采样,然后mini-batch训练 +n_edges=email_graph.num_edges() +# Create sampler +sampler = dgl.dataloading.MultiLayerNeighborSampler( + [int(fanout) for fanout in args.fan_out.split(',')]) +dataloader = dgl.dataloading.EdgeDataLoader( + email_graph, train_id, sampler, + exclude='reverse_id', # 去除反向边,否则模型可能知道存在边的联系,导致模型“作弊” + # For each edge with ID e in dataset, the reverse edge is e ± |E|/2. 
+ reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]), + batch_size=args.batch_size, + shuffle=True, + drop_last=False, + num_workers=args.num_workers) + +## For debug +for dd in dataloader: + """分别是input_nodes, edge_subgraph,blocks""" + for xx in dd: + print(xx) + print() + break + +class TwoLayerGCN(nn.Module): + def __init__(self, in_dim, hid_dim, out_dim): + """两层的GCN模型""" + super().__init__() + self.conv1 = dglnn.GraphConv(in_dim, hid_dim, allow_zero_in_degree=True) + self.conv2 = dglnn.GraphConv(hid_dim, out_dim, allow_zero_in_degree=True) + + def forward(self, blocks, x): + x = F.relu(self.conv1(blocks[0], x)) + x = F.relu(self.conv2(blocks[1], x)) + return x + +class Predictor(nn.Module): + """边预测模块,将边两端节点表示拼接,然后接一个线性变换,得到最后的分类表示输出""" + def __init__(self, in_dim, num_classes): + super().__init__() + self.W = nn.Linear(2 * in_dim, num_classes) + + def apply_edges(self, edges): + data = th.cat([edges.src['x'], edges.dst['x']], dim=-1) + return {'score': self.W(data)} + + def forward(self, edge_subgraph, x): + with edge_subgraph.local_scope(): + edge_subgraph.ndata['x'] = x + edge_subgraph.apply_edges(self.apply_edges) + return edge_subgraph.edata['score'] + +class MyModel(nn.Module): + """主模型:结构比较清晰""" + def __init__(self, emb_dim, hid_dim, out_dim, num_classes, num_nodes): + super().__init__() + self.node_emb = nn.Embedding(num_nodes, emb_dim) + self.gcn = TwoLayerGCN(emb_dim, hid_dim, out_dim) + self.predictor = Predictor(out_dim, num_classes) + + def forward(self, edge_subgraph, blocks, input_nodes): + x = self.node_emb(input_nodes) + x = self.gcn(blocks, x) + return self.predictor(edge_subgraph, x) + +device = th.device("cuda" if th.cuda.is_available() else "cpu") +print("device: {}".format(device)) + +model = MyModel(64, args.num_hidden, args.num_hidden, edge_classes, email_graph.num_nodes()) +model = model.to(device) +loss_fcn = nn.CrossEntropyLoss() # 交叉熵损失 +optimizer = optim.Adam(model.parameters(), lr=args.lr) + + +def predict(model, email_graph, val_id, device): + # Create sampler(全采样) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2) + dataloader = dgl.dataloading.EdgeDataLoader( + email_graph, val_id, sampler, exclude='reverse_id', + # For each edge with ID e in dataset, the reverse edge is e ± |E|/2. + reverse_eids=th.cat([th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]), + batch_size=args.batch_size, + shuffle=False, + drop_last=False, + num_workers=args.num_workers) + + valid_preds = [] + model.eval() + with th.no_grad(): + for input_nodes, edges_subgraph, blocks in dataloader: + edges_subgraph = edges_subgraph.to(device) + blocks = [block.int().to(device) for block in blocks] + pred = model(edges_subgraph, blocks, input_nodes) + pred = pred.cpu().argmax(-1).numpy().tolist() + valid_preds.extend(pred) + return valid_preds + +labels = email_graph.edata['label'] # 图中所有边的标签 +best_val_acc = 0 # 记录验证集上的最好效果 +patience = 0 # For early stopping + +# Training loop +for epoch in range(args.num_epochs): + # Loop over the dataloader to sample the computation dependency graph as a list of + # blocks. 
+ start_time = time.time() + all_loss = [] + trn_label, trn_pred = [], [] + n_batches = len(dataloader) + + for step, (input_nodes, edges_subgraph, blocks) in enumerate(dataloader): + edges_subgraph = edges_subgraph.to(device) + blocks = [block.to(device) for block in blocks] + + # Compute loss and prediction + edge_preds = model(edges_subgraph, blocks, input_nodes) + loss = loss_fcn(edge_preds, edges_subgraph.edata['label']) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + all_loss.append(loss.item()) + trn_label.extend(edges_subgraph.edata['label'].cpu().numpy().tolist()) + trn_pred.extend(edge_preds.argmax(-1).detach().cpu().numpy().tolist()) + + if (step+1) % (n_batches//10) == 0: + cur_acc = metrics.accuracy_score(trn_label, trn_pred) + print('Epoch {:2d} | Step {:04d}/{} | Loss {:.4f} | Avg Loss {:.4f} | Acc {:.4f} | Time {:.2f}s'.format( + epoch+1, step, n_batches, loss.item(), np.mean(all_loss), cur_acc, time.time() - start_time)) + + ## 验证集预测 + val_preds = predict(model, email_graph, val_id, device) + val_acc = metrics.accuracy_score(labels[val_id], val_preds) + + if val_acc > best_val_acc: + best_val_acc = val_acc + patience = 0 + th.save(model.state_dict(), "edge_cls_best_model.bin") + else: + patience += 1 + print('Cur Val Acc {:.4f}, Best Val Acc {:.4f}, Time {:.2f}s'.format( + val_acc, best_val_acc, time.time() - start_time)) + + ## earlystopping,如果验证集效果连续三次以上都没上升,直接停止训练 + if patience > 3: + break + +model.load_state_dict(th.load("edge_cls_best_model.bin")) + +test_preds = predict(model, email_graph, test_id, device) +print("test acc: {:.4f}".format(metrics.accuracy_score(labels[test_id], test_preds)))
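edgeClassificationEmailGraph.py builds its 6:2:2 split by shuffling an array of marker values (zeros, ones, and a linspace) and then scanning it back into index lists. A shorter equivalent, shown only as a sketch that assumes the `email_graph` built above is in scope, is to permute the edge IDs directly:

```python
import numpy as np
import torch as th

# Sketch only: assumes email_graph from the script above.
n = email_graph.number_of_edges()
perm = np.random.permutation(n)
train_len, val_len = int(n * 0.6), int(n * 0.2)
train_id = th.tensor(perm[:train_len])
val_id = th.tensor(perm[train_len:train_len + val_len])
test_id = th.tensor(perm[train_len + val_len:])
```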
\ No newline at end of file diff --git a/code/emailPreparation.py b/code/emailPreparation.py new file mode 100644 index 0000000..82f308d --- /dev/null +++ b/code/emailPreparation.py @@ -0,0 +1,57 @@ +from shutil import copyfile +from sys import exit +import os +import sys +import re + +def split_datacon_eml(): + f = open("C:/Users/gisel/Desktop/datacon 2021/datacon_coremail/1_data/answer.txt") # 返回一个文件对象 + line = f.readline() # 调用文件的 readline()方法 + fraud_num_list=[] + while line: + fraud_num_list.append(line.strip()) + # print(line, end = '')# 在 Python 3 中使用 + line = f.readline() + + emails = os.listdir("C:/Users/gisel/Desktop/datacon 2021/datacon_coremail/1_data/data/") + for email in emails: + email_num=email.replace(".eml", "") + if email_num not in fraud_num_list: + continue + source="C:/Users/gisel/Desktop/datacon 2021/datacon_coremail/1_data/data/"+email_num+".eml" + target="C:/Users/gisel/Desktop/毕设/postPath/datacon_1_fraud/"+email_num+".eml" + + try: + copyfile(source,target) + except IOError as e: + print("Unable to copy file. %s" % e) + exit(1) + except: + print("Unexpected error:", sys.exc_info()) + exit(1) + + print("\nFile copy done!\n") + + f.close() + +def nazario_file_to_eml(text_file,email_folder): + f = open(text_file,'r',encoding='UTF-8',errors='ignore') # 返回一个文件对象 + line = f.readline() # 调用文件的 readline()方法 + num = 0 + while line: + line_list = [] + while re.match("^From ", line) is None and line: + line_list.append(line) + # print(line) + line = f.readline() + # print(line_list) + with open(email_folder + "/" + str(num) + ".eml", 'a+',encoding='utf-8') as f_new: + for i in line_list: + f_new.write(i) + num += 1 + line = f.readline() + + f.close() + +if __name__ == "__main__": + nazario_file_to_eml("nazario_phishing_2015.txt","nazario_phishing_2015")
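nazario_file_to_eml splits the Nazario archive by scanning for lines beginning with "From ", i.e. it treats the input as an mbox file. The standard library's mailbox module performs the same split; a possible alternative (hypothetical helper name, not part of the commit):

```python
import mailbox
import os

def nazario_mbox_to_eml(mbox_path, email_folder):
    # Write each message of an mbox archive out as its own .eml file.
    os.makedirs(email_folder, exist_ok=True)
    for num, msg in enumerate(mailbox.mbox(mbox_path)):
        with open(os.path.join(email_folder, str(num) + ".eml"), "wb") as f_new:
            f_new.write(msg.as_bytes())
```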
\ No newline at end of file diff --git a/code/graphsage.py b/code/graphsage.py new file mode 100644 index 0000000..1ba5d5a --- /dev/null +++ b/code/graphsage.py @@ -0,0 +1,390 @@ +# -*- coding: utf-8 -*- +"""GraphSAGE.ipynb + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1oLF3FCe1CYDohND_VJ6unSXqLRaJ5mi3 +""" + +# from google.colab import drive +# drive.mount('/content/drive') + +#采样sampling +import numpy as np + + +def sampling(src_nodes, sample_num, neighbor_table): + """根据源节点采样指定数量的邻居节点,注意使用的是有放回的采样; + 某个节点的邻居节点数量少于采样数量时,采样结果出现重复的节点 + + Arguments: + src_nodes {list, ndarray} -- 源节点列表 + sample_num {int} -- 需要采样的节点数 + neighbor_table {dict} -- 节点到其邻居节点的映射表 + + Returns: + np.ndarray -- 采样结果构成的列表 + """ + results = [] + for sid in src_nodes: + # 从节点的邻居中进行有放回地进行采样 + res = np.random.choice(neighbor_table[sid], size=(sample_num, )) + results.append(res) + return np.asarray(results).flatten() + +def multihop_sampling(src_nodes, sample_nums, neighbor_table): + """根据源节点进行多阶采样 + + Arguments: + src_nodes {list, np.ndarray} -- 源节点id + sample_nums {list of int} -- 每一阶需要采样的个数 + neighbor_table {dict} -- 节点到其邻居节点的映射 + + Returns: + [list of ndarray] -- 每一阶采样的结果 + """ + sampling_result = [src_nodes] + for k, hopk_num in enumerate(sample_nums): + hopk_result = sampling(sampling_result[k], hopk_num, neighbor_table) + sampling_result.append(hopk_result) + return sampling_result + +#聚合 +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init + +class NeighborAggregator(nn.Module): + def __init__(self, input_dim, output_dim, + use_bias=False, aggr_method="mean"): + """聚合节点邻居 + Args: + input_dim: 输入特征的维度 + output_dim: 输出特征的维度 + use_bias: 是否使用偏置 (default: {False}) + aggr_method: 邻居聚合方式 (default: {mean}) + """ + super(NeighborAggregator, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.use_bias = use_bias + self.aggr_method = aggr_method + self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim)) + if self.use_bias: + self.bias = nn.Parameter(torch.Tensor(self.output_dim)) + self.reset_parameters() + + def reset_parameters(self): + init.kaiming_uniform_(self.weight) + if self.use_bias: + init.zeros_(self.bias) + + def forward(self, neighbor_feature): + if self.aggr_method == "mean": + aggr_neighbor = neighbor_feature.mean(dim=1) + elif self.aggr_method == "sum": + aggr_neighbor = neighbor_feature.sum(dim=1) + elif self.aggr_method == "max": + aggr_neighbor = neighbor_feature.max(dim=1) + else: + raise ValueError("Unknown aggr type, expected sum, max, or mean, but got {}" + .format(self.aggr_method)) + + neighbor_hidden = torch.matmul(aggr_neighbor, self.weight) + if self.use_bias: + neighbor_hidden += self.bias + + return neighbor_hidden + + def extra_repr(self): + return 'in_features={}, out_features={}, aggr_method={}'.format( + self.input_dim, self.output_dim, self.aggr_method) + +class SageGCN(nn.Module): + def __init__(self, input_dim, hidden_dim, + activation=F.relu, + aggr_neighbor_method="mean", + aggr_hidden_method="sum"): + """SageGCN层定义 + Args: + input_dim: 输入特征的维度 + hidden_dim: 隐层特征的维度, + 当aggr_hidden_method=sum, 输出维度为hidden_dim + 当aggr_hidden_method=concat, 输出维度为hidden_dim*2 + activation: 激活函数 + aggr_neighbor_method: 邻居特征聚合方法,["mean", "sum", "max"] + aggr_hidden_method: 节点特征的更新方法,["sum", "concat"] + """ + super(SageGCN, self).__init__() + assert aggr_neighbor_method in ["mean", "sum", "max"] + assert aggr_hidden_method in ["sum", "concat"] + self.input_dim = 
input_dim + self.hidden_dim = hidden_dim + self.aggr_neighbor_method = aggr_neighbor_method + self.aggr_hidden_method = aggr_hidden_method + self.activation = activation + self.aggregator = NeighborAggregator(input_dim, hidden_dim, + aggr_method=aggr_neighbor_method) + self.b = nn.Parameter(torch.Tensor(input_dim, hidden_dim)) + self.reset_parameters() + + def reset_parameters(self): + init.kaiming_uniform_(self.b) + + def forward(self, src_node_features, neighbor_node_features): + neighbor_hidden = self.aggregator(neighbor_node_features) + self_hidden = torch.matmul(src_node_features, self.b) + + if self.aggr_hidden_method == "sum": + hidden = self_hidden + neighbor_hidden + elif self.aggr_hidden_method == "concat": + hidden = torch.cat([self_hidden, neighbor_hidden], dim=1) + else: + raise ValueError("Expected sum or concat, got {}" + .format(self.aggr_hidden)) + if self.activation: + return self.activation(hidden) + else: + return hidden + + def extra_repr(self): + output_dim = self.hidden_dim if self.aggr_hidden_method == "sum" else self.hidden_dim * 2 + return 'in_features={}, out_features={}, aggr_hidden_method={}'.format( + self.input_dim, output_dim, self.aggr_hidden_method) + +class GraphSage(nn.Module): + def __init__(self, input_dim, hidden_dim, + num_neighbors_list): + super(GraphSage, self).__init__() + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.num_neighbors_list = num_neighbors_list + self.num_layers = len(num_neighbors_list) + self.gcn = nn.ModuleList() + self.gcn.append(SageGCN(input_dim, hidden_dim[0])) + for index in range(0, len(hidden_dim) - 2): + self.gcn.append(SageGCN(hidden_dim[index], hidden_dim[index+1])) + self.gcn.append(SageGCN(hidden_dim[-2], hidden_dim[-1], activation=None)) + + def forward(self, node_features_list): + hidden = node_features_list + for l in range(self.num_layers): + next_hidden = [] + gcn = self.gcn[l] + for hop in range(self.num_layers - l): + src_node_features = hidden[hop] + src_node_num = len(src_node_features) + neighbor_node_features = hidden[hop + 1] \ + .view((src_node_num, self.num_neighbors_list[hop], -1)) + h = gcn(src_node_features, neighbor_node_features) + next_hidden.append(h) + hidden = next_hidden + return hidden[0] + + def extra_repr(self): + return 'in_features={}, num_neighbors_list={}'.format( + self.input_dim, self.num_neighbors_list + ) + +#数据处理 +import os +import os.path as osp +import pickle +import itertools +import scipy.sparse as sp +import urllib +from collections import namedtuple +import numpy as np + +Data = namedtuple('Data',['x','y','adjacency_dict','train_mask','val_mask','test_mask']) + +class CoraData(object): + download_url = "https://github.com/kimiyoung/planetoid/raw/master/data" + filenames = ["ind.cora.{}".format(name) for name in ['x','tx','allx','y','ty','ally','graph','test.index']] + + def __init__(self,data_root="cora",rebuild=False): + """Cora数据,包括数据下载,处理,加载等功能 + 当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘 + 处理之后的数据可以通过属性.data获得,它将返回一个数据对象,包括如下几部分: + * x:节点的特征,维度为2708*1433,类型为np.ndarray + * y:节点的标签,总共包括7个类别,类型为np.ndarray + * adjacency_dict:邻接信息,类型为dict + * train_mask:训练集掩码向量,维度为2708,当节点属于训练集时,相应位置为True,否则False + * val_mask:验证集掩码向量,维度为2708,当节点属于验证集时,相应位置为True,否则False + * test_mask: 测试集掩码向量,维度为2708,当节点属于测试集时,相应位置为True,否则False + + Args: + ------ + data_root:string, optional + 存放数据的目录,原始数据路径:{data_root}/raw + 缓存数据路径:{data_root}/processed_cora.pkl + rebuild:boolean,optional + 是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据 + """ + self.data_root=data_root + 
save_file=osp.join(self.data_root,"processed_cora.pkl") + if osp.exists(save_file) and not rebuild: + print("Using Cached file: {}".format(save_file)) + self._data = pickle.load(open(save_file,"rb")) + else: + self.maybe_download() + self._data=self.process_data() + with open(save_file,"wb") as f: + pickle.dump(self.data,f) + print("Cached file: {}".format(save_file)) + + @property + def data(self): + """返回Data数据对象,包括x, y, adjacency, train_mask, val_mask, test_mask""" + return self._data + def process_data(self): + """ + 处理数据,得到节点特征和标签,邻接矩阵,训练集、验证集以及测试集 + 引用自:https://github.com/rusty1s/pytorch_geometric + """ + print("Process data ...") + _,tx,allx,y,ty,ally,graph,test_index=[self.read_data( + osp.join(self.data_root,"raw",name)) for name in self.filenames] + train_index = np.arange(y.shape[0]) + val_index = np.arange(y.shape[0],y.shape[0]+500) + sorted_test_index = sorted(test_index) + + x=np.concatenate((allx,tx),axis=0) + y=np.concatenate((ally,ty),axis=0).argmax(axis=1) + + x[test_index] = x[sorted_test_index] + y[test_index]= y[sorted_test_index] + num_nodes=x.shape[0] + + train_mask=np.zeros(num_nodes,dtype=np.bool) + val_mask=np.zeros(num_nodes,dtype=np.bool) + test_mask = np.zeros(num_nodes, dtype=np.bool) + train_mask[train_index] = True + val_mask[val_index] = True + test_mask[test_index] = True + adjacency_dict=graph + print("Node's feature shape:",x.shape) + print("Node's label shape: ", y.shape) + print("Adjacency's shape: ", len(adjacency_dict)) + print("Number of training nodes: ", train_mask.sum()) + print("Number of validation nodes: ", val_mask.sum()) + print("Number of test nodes: ", test_mask.sum()) + return Data(x=x, y=y, adjacency_dict=adjacency_dict, + train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) + + def maybe_download(self): + save_path = os.path.join(self.data_root, "raw") + for name in self.filenames: + if not osp.exists(osp.join(save_path, name)): + self.download_data("{}/{}".format(self.download_url, name), save_path) + + @staticmethod + def build_adjacency(adj_dict): + """根据邻接表创建邻接矩阵""" + edge_index = [] + num_nodes = len(adj_dict) + for src, dst in adj_dict.items(): + edge_index.extend([src, v] for v in dst) + edge_index.extend([v, src] for v in dst) + # 去除重复的边 + edge_index = list(k for k, _ in itertools.groupby(sorted(edge_index))) + edge_index = np.asarray(edge_index) + adjacency = sp.coo_matrix((np.ones(len(edge_index)),(edge_index[:, 0], edge_index[:, 1])), + shape=(num_nodes, num_nodes), dtype="float32") + return adjacency + + @staticmethod + def read_data(path): + """使用不同的方式读取原始数据以进一步处理""" + name = osp.basename(path) + if name == "ind.cora.test.index": + out = np.genfromtxt(path, dtype="int64") + return out + else: + out = pickle.load(open(path, "rb"), encoding="latin1") + out = out.toarray() if hasattr(out, "toarray") else out + return out + + @staticmethod + def download_data(url, save_path): + """数据下载工具,当原始数据不存在时将会进行下载""" + # print(save_path) + if not os.path.exists(save_path): + os.makedirs(save_path) + data = urllib.request.urlopen(url) + filename = os.path.split(url)[-1] + + with open(os.path.join(save_path, filename), 'wb') as f: + f.write(data.read()) + # print(os.path.join(save_path, filename)) + return True + +# data = CoraData().data + +#主函数 +import torch + +import numpy as np +import torch.nn as nn +import torch.optim as optim +from collections import namedtuple + +#数据准备 +Data = namedtuple('Data', ['x', 'y', 'adjacency_dict','train_mask', 'val_mask', 'test_mask']) + +data = CoraData().data +x = data.x / data.x.sum(1, 
keepdims=True)  # normalise the features so that each row sums to 1
+
+train_index = np.where(data.train_mask)[0]
+train_label = data.y[train_index]
+test_index = np.where(data.test_mask)[0]
+
+# Model initialisation
+INPUT_DIM = 1433                # input feature dimension
+# Note: the number of sampling hops must match the number of GCN layers
+HIDDEN_DIM = [128, 7]           # hidden units per layer (last entry = number of classes)
+NUM_NEIGHBORS_LIST = [10, 10]   # number of neighbours sampled at each hop
+assert len(HIDDEN_DIM) == len(NUM_NEIGHBORS_LIST)
+BATCH_SIZE = 16                 # batch size
+EPOCHS = 20
+NUM_BATCH_PER_EPOCH = 20        # number of batches per epoch
+LEARNING_RATE = 0.01            # learning rate
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = GraphSage(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM,
+                  num_neighbors_list=NUM_NEIGHBORS_LIST).to(DEVICE)
+print(model)
+criterion = nn.CrossEntropyLoss().to(DEVICE)
+optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=5e-4)
+
+# Model training and testing
+def train():
+    model.train()
+    for e in range(EPOCHS):
+        for batch in range(NUM_BATCH_PER_EPOCH):
+            batch_src_index = np.random.choice(train_index, size=(BATCH_SIZE,))
+            batch_src_label = torch.from_numpy(train_label[batch_src_index]).long().to(DEVICE)
+            batch_sampling_result = multihop_sampling(batch_src_index, NUM_NEIGHBORS_LIST, data.adjacency_dict)
+            batch_sampling_x = [torch.from_numpy(x[idx]).float().to(DEVICE) for idx in batch_sampling_result]
+            batch_train_logits = model(batch_sampling_x)
+            loss = criterion(batch_train_logits, batch_src_label)
+            optimizer.zero_grad()
+            loss.backward()    # back-propagate to compute the parameter gradients
+            optimizer.step()   # update the parameters with the optimiser
+            print("Epoch {:03d} Batch {:03d} Loss: {:.4f}".format(e, batch, loss.item()))
+        test()
+
+
+def test():
+    model.eval()
+    with torch.no_grad():
+        test_sampling_result = multihop_sampling(test_index, NUM_NEIGHBORS_LIST, data.adjacency_dict)
+        test_x = [torch.from_numpy(x[idx]).float().to(DEVICE) for idx in test_sampling_result]
+        test_logits = model(test_x)
+        test_label = torch.from_numpy(data.y[test_index]).long().to(DEVICE)
+        predict_y = test_logits.max(1)[1]
+        accuracy = torch.eq(predict_y, test_label).float().mean().item()
+        print("Test Accuracy: ", accuracy)
+
+train()
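A quick way to see what multihop_sampling returns, and why the script asserts len(HIDDEN_DIM) == len(NUM_NEIGHBORS_LIST): with fan-outs [2, 2] every hop multiplies the number of sampled node ids by the fan-out. Illustration only, using a made-up 3-node adjacency dict (toy_adj and toy_result are not part of the script):

    toy_adj = {0: [1, 2], 1: [0, 2], 2: [0, 1]}
    toy_result = multihop_sampling(np.array([0]), [2, 2], toy_adj)
    print([len(hop) for hop in toy_result])  # expected: [1, 2, 4]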
\ No newline at end of file diff --git a/code/hunterGraph.py b/code/hunterGraph.py new file mode 100644 index 0000000..6332552 --- /dev/null +++ b/code/hunterGraph.py @@ -0,0 +1,286 @@ +import networkx as nx +import csv + +def show_connected_subgraphs(edge_file): + G = nx.Graph() + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + edge_list.append((edge['node1'],edge['node2'])) + G.add_edges_from(edge_list) + + largest = max(nx.connected_components(G),key=len) + largest_connected_subgraph = G.subgraph(largest) + node_num_list=[] + edge_num_list=[] + + for c in sorted(nx.connected_components(G),key=len,reverse=True): + subgraph=G.subgraph(c) + node_num_list.append(nx.number_of_nodes(subgraph)) + edge_num_list.append(nx.number_of_edges(subgraph)) + # with open("subgraph_edges.txt", 'a+', encoding="utf-8") as f: + # f.write(str(subgraph.edges)+"\n") + + + import matplotlib.pyplot as plt + import numpy as np + + x = np.array(node_num_list) + y = np.array(edge_num_list) + + plt.xlabel("nodes") + plt.ylabel("edges") + plt.scatter(x, y) + plt.show() + +def node_type_count(node_file): + node_count_dict={} + with open(node_file, 'r', encoding="utf-8") as nodefile: + nodes = csv.DictReader(nodefile) + for node in nodes: + if node["type"] in node_count_dict: + node_count_dict[node["type"]]+=1 + else: + node_count_dict[node["type"]]=1 + print(node_count_dict) + +import pandas as pd +def benign_fraud_count(node_file,subgraph_node_file): + nodes = pd.read_csv(node_file, encoding='utf-8') + fraud_count=0 + benign_count=0 + domain_count=0 + IP_count=0 + x_mailer_count=0 + with open(subgraph_node_file, 'r', encoding="utf-8") as f: + line=f.readline().strip() + line=line.replace("['","") + line=line.replace("']","") + sub_nodes=line.split("', '") + # print(nodes['index'].dtypes) + for sub_node in sub_nodes: + # print(type(sub_node)) + node_index=nodes[(nodes['index']==int(sub_node))].index.tolist()[0] + node_name=nodes.at[node_index,'name'] + node_type = nodes.at[node_index, 'type'] + if node_type==0: + node_num=node_name[5:] + if int(node_num) <= 6550 and int(node_num) >=6264: + fraud_count+=1 + else: + benign_count+=1 + else: + + if node_type==1 : + domain_count+=1 + elif node_type==2: + IP_count+=1 + else: + x_mailer_count+=1 + + print("fraud: "+str(fraud_count)) + print("benign: "+str(benign_count)) + print("domain: "+str(domain_count)) + print("IP: "+str(IP_count)) + print("x-mailer: "+ str(x_mailer_count)) + # node2_index = nodes[(nodes['name'] == edge['node2'])].index.tolist()[0] + # edge_list.append(str(node1_index)+","+str(node2_index)) + +def merge_meta_path(edge_file,meta_path_file): + G = nx.Graph() + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + edge_list.append((edge['node1'],edge['node2'])) + G.add_edges_from(edge_list) + + # largest = max(nx.connected_components(G),key=len) + # largest_connected_subgraph = G.subgraph(largest) + subgraph_edges=list(G.edges) + # print(subgraph_edges) + meta_path_list=[] + for i in range(0,len(subgraph_edges)): + for j in range(0,len(subgraph_edges)): + if subgraph_edges[i][1] == subgraph_edges[j][0]: + meta_path_list.append((subgraph_edges[i][0],subgraph_edges[i][1],subgraph_edges[j][1])) + elif subgraph_edges[i][1] == subgraph_edges[j][1]: + meta_path_list.append((subgraph_edges[i][0],subgraph_edges[i][1],subgraph_edges[j][0])) + print(meta_path_list) + with open(meta_path_file, 'w', 
encoding="utf-8") as f: + f.write("node1,path,node2\n") + for meta_path in meta_path_list: + f.write(meta_path[0]+","+meta_path[1]+","+meta_path[2]+"\n") + +def new_index_to_subgraph(edge_file,subgraph_index_file,node_file): + nodes = pd.read_csv(node_file, encoding='utf-8') + G = nx.Graph() + edge_list=[] + with open(edge_file, 'r', encoding="utf-8") as edgefile: + edges = csv.DictReader(edgefile) + for edge in edges: + edge_list.append((edge['node1'],edge['node2'])) + G.add_edges_from(edge_list) + + # largest = max(nx.connected_components(G),key=len) + # largest_connected_subgraph = G.subgraph(largest) + subgraph_edges=list(G.edges) + with open(subgraph_index_file,'w',encoding='utf-8') as new_index_file: + new_index_file.write("oldIndex,newIndex,label\n") + index=1 + new_node_dict={} + for edge in subgraph_edges: + node_index = nodes[(nodes['index'] == int(edge[0]))].index.tolist()[0] + node_name = nodes.at[node_index, 'name'] + node_type = nodes.at[node_index,'type'] + if node_type == 0: + if node_name[5:] in new_node_dict: + continue + else: + if int(node_name[5:])<2010: + new_node_dict[node_name[5:]]=str(index) + new_index_file.write(edge[0]+","+str(index)+",1\n") + index+=1 + else: + new_node_dict[node_name[5:]] = str(index) + new_index_file.write(edge[0] + "," + str(index) + ",0\n") + index+=1 + node_index = nodes[(nodes['index'] == int(edge[1]))].index.tolist()[0] + node_name = nodes.at[node_index, 'name'] + node_type = nodes.at[node_index, 'type'] + if node_type == 0: + if node_name[5:] in new_node_dict: + continue + else: + if int(node_name[5:]) < 2010: + new_node_dict[node_name[5:]] = str(index) + new_index_file.write(edge[1] + "," + str(index) + ",1\n") + index += 1 + else: + new_node_dict[node_name[5:]] = str(index) + new_index_file.write(edge[1] + "," + str(index) + ",0\n") + index += 1 + + +def split_meta_path(node_file,meta_path_file,index_file): + nodes = pd.read_csv(node_file, encoding='utf-8') + indexes = pd.read_csv(index_file,encoding='utf-8') + EDE_list=[] + EIE_list=[] + EXE_list=[] + with open(meta_path_file, 'r', encoding="utf-8") as f: + paths=csv.DictReader(f) + for path in paths: + node_index=nodes[(nodes['index']==int(path['path']))].index.tolist()[0] + node_type = nodes.at[node_index, 'type'] + if node_type == 1: + EDE_list.append((path['node1'],path['node2'])) + elif node_type == 2: + EIE_list.append((path['node1'],path['node2'])) + elif node_type == 3: + EXE_list.append((path['node1'],path['node2'])) + with open("EDE_list.csv",'w',encoding="utf-8") as f: + f.write("eml1,eml2\n") + for ede in EDE_list: + # node1_name = nodes.at[int(ede[0]),'name'] + # node1_num = node1_name[5:] + new_node_index = indexes[(indexes['oldIndex'] == int(ede[0]))].index.tolist()[0] + node1_num = indexes.at[new_node_index,'newIndex'] + # node2_name = nodes.at[int(ede[1]),'name'] + # node2_num = node2_name[5:] + new_node_index = indexes[(indexes['oldIndex'] == int(ede[1]))].index.tolist()[0] + node2_num = indexes.at[new_node_index, 'newIndex'] + f.write(str(node1_num)+","+str(node2_num)+"\n") + with open("EIE_list.csv",'w',encoding="utf-8") as f: + f.write("eml1,eml2\n") + for eie in EIE_list: + # node1_name = nodes.at[int(eie[0]), 'name'] + # node1_num = node1_name[5:] + new_node_index = indexes[(indexes['oldIndex'] == int(eie[0]))].index.tolist()[0] + node1_num = indexes.at[new_node_index, 'newIndex'] + # node2_name = nodes.at[int(eie[1]), 'name'] + # node2_num = node2_name[5:] + new_node_index = indexes[(indexes['oldIndex'] == int(eie[1]))].index.tolist()[0] + node2_num = 
indexes.at[new_node_index, 'newIndex'] + f.write(str(node1_num)+","+str(node2_num)+"\n") + with open("EXE_list.csv",'w',encoding="utf-8") as f: + f.write("eml1,eml2\n") + for exe in EXE_list: + # node1_name = nodes.at[int(exe[0]), 'name'] + # node1_num = node1_name[5:] + new_node_index = indexes[(indexes['oldIndex'] == int(exe[0]))].index.tolist()[0] + node1_num = indexes.at[new_node_index, 'newIndex'] + # node2_name = nodes.at[int(exe[1]), 'name'] + # node2_num = node2_name[5:] + new_node_index = indexes[(indexes['oldIndex'] == int(exe[1]))].index.tolist()[0] + node2_num = indexes.at[new_node_index, 'newIndex'] + f.write(str(node1_num) + "," + str(node2_num) + "\n") + +import numpy as np +def meta_path_to_matrix(meta_path_file): + num = [[0 for i in range(0, 6975)] for j in range(0, 6975)] + with open(meta_path_file, 'r') as f: + cols = csv.DictReader(f) + for col in cols: + num[int(col["eml1"])-1][int(col["eml2"])-1]=1 + num[int(col["eml2"])-1][int(col["eml1"])-1] = 1 + for i in range(0,6975): + num[i][i] = 1 + arr = np.array(num) + return arr + +def extract_label(label_file): + num = [[0 for i in range(0, 2)] for j in range(0, 6975)] + with open(label_file, 'r') as f: + cols = csv.DictReader(f) + for col in cols: + if int(col["label"]) == 1: + num[int(col["newIndex"])-1][0] = 1 + elif int(col["label"]) ==0: + num[int(col["newIndex"])-1][1] =1 + arr = np.array(num) + return arr + +import random +def generate_features(): + features = [[0 for i in range(0, 8)] for j in range(0, 6975)] + for i in range(0,6975): + length=random.randint(1,8) + for j in range(0,length): + loc = random.randint(0,7) + features[i][loc]=1 + features = np.array(features) + return features + +from scipy.io import savemat +def save_data(EDE_file,EIE_file,EXE_file,label_file,mat_file): + shuffled_index = np.random.permutation(6975) + split_index1 = int(6975 * 0.6) + split_index2 = int(6975*0.8) + train_index = shuffled_index[:split_index1] + train_idx = np.array([train_index]) + val_index = shuffled_index[split_index1:split_index2] + val_idx = np.array([val_index]) + test_index = shuffled_index[split_index2:] + test_idx = np.array([test_index]) + label = extract_label(label_file) + EDE = meta_path_to_matrix(EDE_file) + EIE = meta_path_to_matrix(EIE_file) + EXE = meta_path_to_matrix(EXE_file) + features = generate_features() + savemat(mat_file,{'EIE':EIE,'EDE':EDE,'EXE':EXE,'features':features,'label':label,'train_idx':train_idx,'val_idx':val_idx,'test_idx':test_idx}) + + + +if __name__ =="__main__": + # benign_fraud_count("hunter_node.csv","first_subgraph_nodes1.txt") + # node_type_count("hunter_node.csv") + # show_connected_subgraphs("hunter_edge_index_only.csv") + # merge_meta_path("hunter_edge_index_only.csv","meta_path_original.csv") + # split_meta_path("hunter_node.csv","meta_path_original.csv","subgraph_index.csv") + # meta_path_to_matrix("EDE_list.csv") + # new_index_to_subgraph("hunter_edge_index_only.csv","subgraph_index.csv","hunter_node.csv") + # extract_label("subgraph_index.csv") + save_data("EDE_list.csv","EIE_list.csv","EXE_list.csv","subgraph_index.csv","SG_dataset.mat") + # generate_features()
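A small sanity-check sketch, not in the repository, for the .mat file written by save_data above, assuming SG_dataset.mat has already been generated:

    from scipy.io import loadmat

    mat = loadmat("SG_dataset.mat")
    for key in ("EDE", "EIE", "EXE", "features", "label", "train_idx", "val_idx", "test_idx"):
        print(key, mat[key].shape)
    # EDE/EIE/EXE: 6975 x 6975 meta-path adjacency matrices; label: 6975 x 2 one-hot;
    # features: 6975 x 8 random binary vectors; the idx arrays hold the 60/20/20 split.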
\ No newline at end of file
diff --git a/code/parseEml.py b/code/parseEml.py
new file mode 100644
index 0000000..a5bd50e
--- /dev/null
+++ b/code/parseEml.py
@@ -0,0 +1,324 @@
+# -*- coding: utf-8 -*-
+import email
+import os
+import re
+import csv
+import json
+
+# def sort_key(received_header):
+#     received_date = email.utils.parsedate_tz(received_header)
+#     return received_date
+
+class parseEml:
+
+    def __init__(self, email_path):
+        self.email_path = email_path
+        self.msg = self.get_message()
+
+    # Read the raw .eml file and print it line by line
+    def read_mail(self):
+        if os.path.exists(self.email_path):
+            with open(self.email_path) as fp:
+                for line in fp:
+                    print(line)
+        else:
+            print('File does not exist!')
+
+    # Parse the file into an email message object
+    def get_message(self):
+        if os.path.exists(self.email_path):
+            fp = open(self.email_path, 'r', encoding='UTF-8', errors='ignore')
+            # print(self.email_path)
+            return email.message_from_file(fp)
+        else:
+            print('File does not exist!')
+
+    # Get all Received headers of the mail
+    def get_received(self):
+        if self.msg != None:
+            received_array = self.msg.get_all('Received')
+            return received_array
+        else:
+            print('msg is empty!')
+
+    def get_message_id(self):
+        if self.msg != None:
+            return self.msg.get('Message-ID')
+        else:
+            print('msg is empty!')
+
+    def get_from(self):
+        if self.msg != None:
+            return self.msg.get('From')
+        else:
+            print('msg is empty!')
+
+    def get_to(self):
+        if self.msg != None:
+            return self.msg.get('To')
+        else:
+            print('msg is empty!')
+
+    def get_x_mailer(self):
+        if self.msg != None:
+            return self.msg.get('X-Mailer')
+        else:
+            print('msg is empty!')
+
+    def get_from_host_list(self):
+        if self.msg != None:
+            from_host_list = []
+            received_array = self.msg.get_all('Received')
+            if received_array == None:
+                return None
+            for received in received_array:
+                received = re.sub(r'\n\s*', " ", received)
+                # print(received)
+                if "from" in received and "by" in received:
+                    from_host = received[received.find("from") + 5:received.find("by")].strip()
+                    from_host_list.append(from_host)
+                    # print(from_host)
+            return from_host_list
+        else:
+            print('msg is empty!')
+
+    def get_dkim(self):
+        if self.msg != None:
+            return self.msg.get('DKIM-Signature')
+        else:
+            print('msg is empty!')
+
+    def get_auth_results(self):
+        if self.msg != None:
+            return self.msg.get('Authentication-Results')
+        else:
+            print('msg is empty!')
+
+def list_to_file(alist, file_name):
+    with open(file_name, 'a+') as f:
+        for i in alist:
+            f.write(i + '\n')
+
+def list_to_csv(alist, file_name, file_num):
+    with open(file_name, 'a+', newline='') as f:
+        writer = csv.writer(f)
+        for i in alist:
+            writer.writerow([i, file_num])
+
+def extract_from_host(folder_path, file_name):
+    from_host_list = []
+    files = os.listdir(folder_path)  # all file names under the folder
+    for file in files:  # iterate over the folder
+        received_array = parseEml(folder_path + "/" + file).get_received()
+        for received in received_array:
+            received = re.sub(r'\n\s*', " ", received)
+            # print(received)
+            if "from" in received and "by" in received:
+                from_host = received[received.find("from") + 5:received.find("by")].strip()
+                # if from_host=="":
+                #     print(received)
+                from_host_list.append(from_host)
+                # print(from_host)
+    list_to_file(from_host_list, file_name)
+
+def extract_message_id(folder_path, file_name):
+    files = os.listdir(folder_path)  # all file names under the folder
+    record_list = []
+    for file in files:  # iterate over the folder
+        if file == "duplicate":
+            continue
+        file_num = file.replace(".eml", "")
+        mail = parseEml(folder_path + "/" + file)
+        message_id = mail.get_message_id()
+        sender = mail.get_from()
+        receiver = mail.get_to()
+        
record={"Message-ID":message_id,"From":sender,"To":receiver,"num:":file_num} + record_list.append(record) + with open(file_name, 'a+', newline='',encoding='utf-8') as f: + json.dump(record_list,f) + +def extract_x_mailer(folder_path,file_name): + files = os.listdir(folder_path) # �õ��ļ����µ������ļ����� + record_list = [] + for file in files: # �����ļ��� + if file == "duplicate": + continue + file_num = file.replace(".eml", "") + mail = parseEml(folder_path + "/" + file) + x_mailer=mail.get_x_mailer() + if x_mailer != None: + message_id=mail.get_message_id() + sender = mail.get_from() + receiver = mail.get_to() + record = {"Message-ID":message_id,"X-Mailer": x_mailer, "From": sender, "To": receiver, "num:": file_num} + record_list.append(record) + with open(file_name, 'a+', newline='', encoding='utf-8') as f: + json.dump(record_list, f) + +def extract_received_x_mailer(folder_path,file_name): + files=os.listdir(folder_path) + for file in files: # �����ļ��� + record = "" + if file == "duplicate": + continue + file_num = file.replace(".eml", "") + mail = parseEml(folder_path + "/" + file) + received_array=mail.get_received() + x_mailer = mail.get_x_mailer() + for received in received_array: + received = re.sub(r'\n\s*', " ", received) + received=received[:received.find(";")] + record+="Received: "+received+"; " + if x_mailer != None: + record+="X-Mailer: "+x_mailer + with open(file_name, 'a+', newline='', encoding='utf-8') as f: + f.writelines(record) + f.writelines("\n") + +def to_count(folder_path): + files=os.listdir(folder_path) + receiver_list=[] + for file in files: # �����ļ��� + if file == "duplicate": + continue + mail=parseEml(folder_path + "/" + file) + receiver=mail.get_to() + receiver_list.append(receiver) + dict = {} + for key in receiver_list: + dict[key] = dict.get(key, 0) + 1 + print(dict) + print(max(set(receiver_list), key=receiver_list.count)) + +def extract_to(folder_path,file_name): + files = os.listdir(folder_path) + receiver_list = [] + for file in files: # �����ļ��� + if file == "duplicate": + continue + mail = parseEml(folder_path + "/" + file) + receiver = mail.get_to() + if receiver != None: + receiver_list.append(receiver) + list_to_file(receiver_list,file_name) + +def read_template(template_file): + f = open(template_file) # ����һ���ļ����� + line = f.readline() # �����ļ��� readline()���� + line_list = [] + while line: + line1 = line.strip() + if line1: + line_list.append(line1) + line = f.readline() + f.close() + return line_list + +def extract_edge(email_path,template_list): + mail=parseEml(email_path) + received_array = mail.get_received() + receiver=mail.get_to() + if receiver: + receiver=receiver.replace("\"","\\\"") + else: + receiver="" + # print(received_array) + edge_list = [] + from_host_list=[] + for received in received_array: + received = re.sub(r'\n\s*', " ", received) + # print(received) + if "from" in received and "by" in received: + from_host = received[received.find("from") + 5:received.find("by")].strip() + for template in template_list: + if "<:*:>" in template: + template = template.replace("<:*:>", ".*") + if "(" in template: + template = template.replace("(", "\(") + if ")" in template: + template=template.replace(")","\)") + if "[" in template: + template=template.replace("[","\[") + if re.match(template, from_host): + from_host=template + break + from_host=from_host.replace("\"","\\\"") + from_host_list.append(from_host) + # print(from_host) + num=len(from_host_list) + if num > 0: + last_received=received_array[0] + 
last_node=last_received[last_received.find("by")+3:last_received.find("with")].strip() + edge_list.append("\""+last_node+"\" -> \""+receiver+"\"") + edge_list.append("\""+from_host_list[0]+"\" -> \""+last_node+"\"") + if num >=2: + for i in range(0,num-2): + edge_list.append("\""+from_host_list[i+1]+"\" -> \""+from_host_list[i]+"\"") + return edge_list + +def extract_path(email_path,email_num): + mail = parseEml(email_path) + received_array = mail.get_received() + sender=mail.get_from() + receiver = mail.get_to() + from_host_list = [] + path="" + for received in received_array: + received = re.sub(r'\n\s*', " ", received) + # print(received) + if "from" in received and "by" in received: + from_host = received[received.find("from") + 5:received.find("by")].strip() + from_host_list.append(from_host) + # print(from_host) + num = len(from_host_list) + if num > 0: + last_received = received_array[0] + last_node = last_received[last_received.find("by") + 3:last_received.find("with")].strip() + if receiver: + path= last_node + "," + receiver + else: + path=last_node+"," + path= from_host_list[0] + " -> " + path + if num >= 2: + for i in range(1, num): + path=from_host_list[i] + " -> " + path + path=sender+","+path + path=email_num+ ","+ path + return path + +def extract_all_edges(email_folder,template_path,file_name): + template_list = read_template(template_path) + files = os.listdir(email_folder) + for file in files: # �����ļ��� + if file == "duplicate": + continue + mail_path = extract_edge(email_folder + "/" + file,template_list) + with open(file_name, 'a+') as f: + for i in mail_path: + f.write(i + ' [color=red];\n') + +def extract_all_paths(email_folder,file_name): + with open(file_name, 'a+') as f: + f.write('email_num,from,path,receiver\n') + files = os.listdir(email_folder) + for file in files: # �����ļ��� + if file == "duplicate": + continue + mail_path = extract_path(email_folder + "/" + file,file.replace(".eml", "")) + with open(file_name, 'a+') as f: + f.write(mail_path + '\n') + +def delete_once_node(edge_file): + f = open(edge_file) # ����һ���ļ����� + line = f.readline() # �����ļ��� readline()���� + line_list = [] + while line: + line1 = line.strip() + if line1: + line_list.append(line1) + line = f.readline() + f.close() + return line_list + +if __name__=="__main__": + extract_all_paths("nazario_phishing_2021","nazario_paths.csv")
\ No newline at end of file
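A minimal usage sketch for the parser defined above, not part of the committed file ("some_mail.eml" is a placeholder path); it prints the header fields that the graph-building scripts consume:

    from parseEml import parseEml

    mail = parseEml("some_mail.eml")  # placeholder path
    print(mail.get_from())
    print(mail.get_message_id())
    for received in mail.get_received() or []:
        print(received)
    print(mail.get_from_host_list())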
