summaryrefslogtreecommitdiff
path: root/code/buildGraph2.py
blob: b994ec84ee0c53dd034505e1dc3eaa388ad63725 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re
from parseEml import parseEml
import csv,os

def one_email_to_edges(email_path,email_num):
    """Extract graph nodes and edges from a single e-mail file.

    The e-mail is parsed with ``parseEml`` and the following values are
    turned into nodes, each encoded as a ``"name,type"`` string:

    * type ``0`` -- the e-mail itself, named ``"email<email_num>"``;
    * type ``1`` -- host tokens containing a letter or hyphen, plus the
      Message-ID domain and the first DKIM ``d=`` value;
    * type ``2`` -- host tokens with no letter or hyphen (presumably IP
      addresses -- confirm against the parser output);
    * type ``3`` -- the X-Mailer value with line breaks and commas removed.

    Args:
        email_path: Path of the e-mail file handed to ``parseEml``.
        email_num: Number used to build the e-mail node name.

    Returns:
        A tuple ``(node_set, edge_list)``: ``node_set`` is a set of
        ``"name,type"`` strings (always containing the e-mail node) and
        ``edge_list`` is a list of ``"email<email_num>,<other name>"``
        strings, one per distinct non-e-mail node, in arbitrary order.
    """
    mail = parseEml(email_path)
    email_node = "email" + str(email_num)
    node_set = {email_node + ",0"}
    other_node_set = set()

    # Work on a copy so the list owned by the parser is never mutated.  The
    # e-mail node name is deliberately not scanned: for integer ids it holds
    # no dot, so the host pattern below can never match it.
    raw_node_list = list(mail.get_from_host_list() or [])
    for node in raw_node_list:
        for inter_node in re.findall(r'[-a-zA-Z0-9]+\.[\.\w-]+', node):
            if re.search(r'[-a-zA-Z]', inter_node):
                # Name-like token: drop the left-most label when there are
                # more than two, e.g. "mail.example.com" -> "example.com".
                domain_sets = inter_node.split(".")
                if len(domain_sets) > 2:
                    inter_node = ".".join(domain_sets[1:])
                node_type = "1"
            else:
                # Only digits, dots and underscores.
                node_type = "2"
            node_set.add(inter_node + "," + node_type)
            other_node_set.add(inter_node)

    # Message-ID: keep what lies between the last '@' and the closing '>'.
    message_id = mail.get_message_id()
    if message_id:
        message_id_domain = message_id.split('@')[-1].split(">")[0].strip()
        # An empty value would create a nameless node and a dangling edge.
        if message_id_domain:
            node_set.add(message_id_domain + ",1")
            other_node_set.add(message_id_domain)

    # X-Mailer: line breaks and commas are removed because nodes and edges
    # are stored as comma-separated lines.
    x_mailer = mail.get_x_mailer()
    if x_mailer:
        x_mailer = x_mailer.replace("\r", "").replace("\n", "").replace(",", "")
        # Skip a header that is empty (or only separators) after cleaning.
        if x_mailer:
            node_set.add(x_mailer + ",3")
            other_node_set.add(x_mailer)

    # DKIM: the first "d=...;" tag of the signature is used as the domain.
    dkim_signature = mail.get_dkim()
    if dkim_signature:
        dkim_signature = dkim_signature.replace("\n\t", "")
        dkim_domains = re.findall(r'd=(.+?);', dkim_signature)
        if not dkim_domains:
            # No d= tag found: print the raw signature for manual inspection.
            print(dkim_signature)
        else:
            dkim_domain = dkim_domains[0]
            node_set.add(dkim_domain + ",1")
            other_node_set.add(dkim_domain)

    # One edge from the e-mail node to every distinct node found above.
    edge_list = [email_node + "," + other_node for other_node in other_node_set]
    return node_set,edge_list

def email_batch_to_graph(email_folder,node_file,edge_file,start_num=2010):
    """Build or extend the node and edge CSV files from a folder of e-mails.

    Nodes already stored in ``node_file`` are loaded, every entry of
    ``email_folder`` except the one named ``"duplicate"`` is passed to
    ``one_email_to_edges``, the resulting edges are appended to
    ``edge_file`` and ``node_file`` is rewritten with the merged node set.

    Args:
        email_folder: Directory holding the e-mail files.
        node_file: CSV with an ``index,name,type`` header.  It is read if it
            exists and is always rewritten; indices are reassigned from 0.
        edge_file: CSV of ``name,name`` edges, opened in append mode.  A
            ``node1,node2`` header is written when the file is new or empty,
            because ``nodes_to_index`` reads it with ``csv.DictReader`` and
            looks up exactly those two column names.
        start_num: Number given to the first processed e-mail; later ones
            are numbered consecutively.  Defaults to 2010.
    """
    node_list = set()
    # A missing node file simply means there are no known nodes yet.
    if os.path.exists(node_file):
        with open(node_file, 'r', encoding="utf-8", newline="") as csvfile:
            for node in csv.DictReader(csvfile):
                node_list.add(node["name"] + "," + node["type"])

    need_header = not os.path.exists(edge_file) or os.path.getsize(edge_file) == 0
    email_num = start_num
    # Open the edge file once for the whole batch instead of once per e-mail.
    with open(edge_file, 'a', encoding="utf-8") as edge_f:
        if need_header:
            edge_f.write("node1,node2\n")
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # e-mail numbering reproducible between runs.
        for file_name in sorted(os.listdir(email_folder)):  # walk the folder
            if file_name == "duplicate":
                continue
            nodes, edges = one_email_to_edges(os.path.join(email_folder, file_name), email_num)
            node_list.update(nodes)
            edge_f.writelines(edge + "\n" for edge in edges)
            email_num += 1

    with open(node_file, 'w', encoding="utf-8") as f:
        f.write("index,name,type\n")
        # Sorted so the index given to each node does not depend on set order.
        for i, node in enumerate(sorted(node_list)):
            f.write(str(i) + ',' + node + "\n")

import pandas as pd
def nodes_to_index(node_file,edge_file,new_edge_file):
    """Translate a name-based edge list into an index-based edge list.

    Args:
        node_file: CSV with a ``name`` column; a node's index is its
            zero-based row position in this file.
        edge_file: CSV with a ``node1,node2`` header holding node names.
        new_edge_file: Output file; one ``"<index1>,<index2>"`` line per
            edge is appended to it (the file is not truncated).

    Raises:
        IndexError: If an edge refers to a name missing from ``node_file``.
        KeyError: If ``edge_file`` lacks the ``node1``/``node2`` columns.
    """
    # Read every column as text and switch off NA inference, otherwise names
    # such as "NA", "null" or "None" become NaN and can never be matched.
    nodes = pd.read_csv(node_file, encoding='utf-8', dtype=str, keep_default_na=False)

    # Build the name -> row lookup once; setdefault keeps the first row when
    # a name occurs several times.
    index_by_name = {}
    for row_index, name in zip(nodes.index, nodes['name']):
        index_by_name.setdefault(name, row_index)

    def lookup(name):
        """Return the row index of *name*, or raise a descriptive IndexError."""
        try:
            return index_by_name[name]
        except KeyError:
            raise IndexError("node %r not found in %s" % (name, node_file)) from None

    # Resolve every edge before writing, so a failed lookup leaves the
    # output file untouched.
    edge_list = []
    with open(edge_file, 'r', encoding="utf-8", newline="") as edgefile:
        for edge in csv.DictReader(edgefile):
            node1_index = lookup(edge['node1'])
            node2_index = lookup(edge['node2'])
            edge_list.append(str(node1_index) + "," + str(node2_index))

    with open(new_edge_file, 'a', encoding="utf-8") as f:
        f.writelines(new_edge + "\n" for new_edge in edge_list)

if __name__ == "__main__":
    # Script entry point.
    # Graph-building step, currently disabled:
    # email_batch_to_graph("datacon_1_legitimate","hunter_node.csv","hunter_edge.csv")
    # Active step: rewrite the name-based edge list as node indices.
    nodes_to_index(
        node_file="hunter_node.csv",
        edge_file="hunter_edge.csv",
        new_edge_file="hunter_edge_index_only.csv",
    )