1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
import re
from parseEml import parseEml
import csv,os
# Host-like token: one label, a dot, then any run of word/dot/hyphen characters.
_HOST_TOKEN_RE = re.compile(r'[-a-zA-Z0-9]+\.[\.\w-]+')
# A token counts as a domain name as soon as it holds a letter or a hyphen.
_DOMAIN_CHAR_RE = re.compile(r'[-a-zA-Z]')
# The ``d=`` tag of a DKIM-Signature tag list.  Anchoring on the start of the
# header or on a ``;`` keeps base64 padding such as ``...d=;`` inside the
# ``bh=``/``b=`` values from being mistaken for the domain tag, and a final
# ``d=`` tag without a trailing ``;`` is still found.
_DKIM_DOMAIN_RE = re.compile(r'(?:^|;)\s*d\s*=\s*([^;\s]+)')


def one_email_to_edges(email_path,email_num):
    """Turn one email into graph nodes and email-to-attribute edges.

    The message is parsed with ``parseEml`` and four sources are mined:
    the host list (``get_from_host_list``), the Message-ID
    (``get_message_id``), the X-Mailer (``get_x_mailer``) and the DKIM
    signature (``get_dkim``).

    Nodes are encoded as ``"name,type"`` strings with these type codes:

    * ``0`` -- the email itself, named ``email<email_num>``;
    * ``1`` -- a domain: a host token containing a letter or hyphen, the
      part of the Message-ID after ``@``, or the DKIM ``d=`` value;
    * ``2`` -- a host token without any letter or hyphen (presumably an
      IP address);
    * ``3`` -- the X-Mailer string with line breaks and commas removed.

    Args:
        email_path: Path handed to ``parseEml``.
        email_num: Number appended to ``"email"`` to name this email's node.

    Returns:
        A ``(node_set, edge_list)`` tuple.  ``node_set`` is a set of
        ``"name,type"`` strings; ``edge_list`` is a sorted list of
        ``"email<email_num>,<name>"`` strings, one per distinct attribute.
    """
    mail = parseEml(email_path)
    email_node = "email" + str(email_num)
    node_set = {email_node + ",0"}
    linked = set()  # names of every node this email gets an edge to

    # Received / from hosts.  The parser's list is only read, never modified.
    for raw_host in mail.get_from_host_list() or []:
        for token in _HOST_TOKEN_RE.findall(raw_host):
            if _DOMAIN_CHAR_RE.search(token):
                labels = token.split(".")
                if len(labels) > 2:
                    # Drop the left-most label: mail.example.com -> example.com
                    token = ".".join(labels[1:])
                node_type = "1"
            else:
                node_type = "2"
            node_set.add(token + "," + node_type)
            linked.add(token)

    # Message-ID: keep what follows the last '@', up to the closing '>'.
    message_id = mail.get_message_id()
    if message_id is not None:
        _, at_sign, tail = message_id.rpartition('@')
        message_id_domain = tail.split(">")[0].strip()
        # Without an '@' there is no domain part; skip instead of inventing one.
        if at_sign and message_id_domain:
            node_set.add(message_id_domain + ",1")
            linked.add(message_id_domain)

    # X-Mailer: line breaks and commas are removed because the value is later
    # written as one field of a comma-separated line.
    x_mailer = mail.get_x_mailer()
    if x_mailer:
        x_mailer = x_mailer.replace("\r", "").replace("\n", "").replace(",", "")
        # An empty value would create a nameless node and a dangling edge.
        if x_mailer:
            node_set.add(x_mailer + ",3")
            linked.add(x_mailer)

    # DKIM signing domain.
    dkim_signature = mail.get_dkim()
    if dkim_signature:
        dkim_signature = dkim_signature.replace("\n\t", "")
        dkim_match = _DKIM_DOMAIN_RE.search(dkim_signature)
        if dkim_match is None:
            # No d= tag found: show the raw header for inspection.
            print(dkim_signature)
        else:
            dkim_domain = dkim_match.group(1)
            node_set.add(dkim_domain + ",1")
            linked.add(dkim_domain)

    # Sorted so that the emitted edge order is reproducible between runs.
    edge_list = [email_node + "," + name for name in sorted(linked)]
    return node_set,edge_list
def email_batch_to_graph(email_folder,node_file,edge_file,start_num=2010):
    """Add every email in a folder to the node and edge CSV files.

    Nodes already listed in *node_file* are loaded first, each entry of
    *email_folder* is converted with ``one_email_to_edges``, the resulting
    edges are appended to *edge_file*, and *node_file* is finally rewritten
    with an ``index,name,type`` header and one row per node.

    Nodes that were already in *node_file* keep their position, and thus
    their index, as long as the file held no duplicate rows; new nodes are
    appended after them.  Folder entries are processed in sorted order so
    that email numbering is reproducible.

    Args:
        email_folder: Directory whose entries are passed to
            ``one_email_to_edges``; an entry named ``duplicate`` is skipped.
        node_file: CSV with ``name`` and ``type`` columns.  It may be
            missing, in which case the graph starts without nodes.
        edge_file: File the ``node1,node2`` edge lines are appended to.  A
            ``node1,node2`` header is written first when the file is new or
            empty, which is the header ``nodes_to_index`` looks up.
        start_num: Number given to the first processed email; each further
            email gets the next integer.
    """
    # A dict serves as an insertion-ordered set of "name,type" strings.
    known_nodes = {}
    if os.path.exists(node_file):
        with open(node_file, 'r', encoding="utf-8", newline="") as csvfile:
            for row in csv.DictReader(csvfile):
                known_nodes.setdefault(row["name"] + "," + row["type"], None)

    needs_header = not os.path.exists(edge_file) or os.path.getsize(edge_file) == 0
    email_num = start_num
    # Open the edge file once for the whole batch instead of once per email.
    with open(edge_file, 'a', encoding="utf-8") as edge_f:
        if needs_header:
            edge_f.write("node1,node2\n")
        # os.listdir returns entries in arbitrary order, hence the sort.
        for entry in sorted(os.listdir(email_folder)):  # walk the folder
            if entry == "duplicate":
                continue
            nodes, edges = one_email_to_edges(os.path.join(email_folder, entry), email_num)
            # one_email_to_edges returns a set; sort it for a stable node order.
            for node in sorted(nodes):
                known_nodes.setdefault(node, None)
            edge_f.writelines(edge + "\n" for edge in edges)
            email_num += 1

    with open(node_file, 'w', encoding="utf-8") as f:
        f.write("index,name,type\n")
        f.writelines(
            str(index) + "," + node + "\n"
            for index, node in enumerate(known_nodes)
        )
import pandas as pd
def nodes_to_index(node_file,edge_file,new_edge_file):
    """Rewrite name-based edges as pairs of node row positions.

    Each row of *edge_file* names two nodes in its ``node1`` and ``node2``
    columns.  Every name is replaced by the zero-based position of the
    first row of *node_file* whose ``name`` column equals it, and the
    resulting ``"<pos1>,<pos2>"`` lines are appended to *new_edge_file*.

    Args:
        node_file: CSV with a header row that includes a ``name`` column.
        edge_file: CSV with a header row that includes ``node1`` and
            ``node2`` columns.
        new_edge_file: File the index pairs are appended to; no header is
            written.

    Raises:
        IndexError: If an edge refers to a name absent from *node_file*.
            Nothing is written to *new_edge_file* in that case.
    """
    # Names are read as literal strings: without keep_default_na=False pandas
    # would turn names such as "NA" or "null" into NaN and they would never
    # match the text read from the edge file.
    nodes = pd.read_csv(
        node_file,
        encoding='utf-8',
        dtype={'name': str},
        keep_default_na=False,
    )
    # One pass builds the lookup table; the first occurrence of a name wins.
    position_of = {}
    for position, name in enumerate(nodes['name']):
        position_of.setdefault(name, position)

    def lookup(name):
        try:
            return position_of[name]
        except KeyError:
            raise IndexError(
                "node %r from %s is not listed in %s" % (name, edge_file, node_file)
            ) from None

    edge_list = []
    with open(edge_file, 'r', encoding="utf-8", newline="") as edgefile:
        for edge in csv.DictReader(edgefile):
            edge_list.append(str(lookup(edge['node1'])) + "," + str(lookup(edge['node2'])))

    with open(new_edge_file, 'a', encoding="utf-8") as f:
        f.writelines(new_edge + "\n" for new_edge in edge_list)
if __name__ == "__main__":
    # Stage 1 (currently disabled): build the node and edge files from a
    # folder of emails.
    #     email_batch_to_graph("datacon_1_legitimate", "hunter_node.csv", "hunter_edge.csv")
    # Stage 2: convert the name-based edge file into node-index pairs.
    nodes_to_index(
        node_file="hunter_node.csv",
        edge_file="hunter_edge.csv",
        new_edge_file="hunter_edge_index_only.csv",
    )
|