# Source: root/dataset_handle/DDoS.py (blob 5bffc7c6ffd976f840bb943bb0d320ad66f133a8)
# coding:utf-8
import networkx as nx
import numpy as np
import csv
import random
import json

import matplotlib.pyplot as plt

# --- Module-level state and configuration ---------------------------------

# IP address -> id of the JSON node file written for it (filled by
# save_dataset, reset by load_data).
ip_dict = {}
# Next free id for an output JSON file; shared by IP nodes and flow graphs.
json_num = 0
# The attack IP address that load_data uses to filter and label flows.
ip = "172.16.0.5"
# Base names (without ".csv") of the capture files read by csv_dataset.
addr = [str(n) for n in range(1, 13)] + [
    "DNS", "LDAP", "MSSQL", "NetBIOS", "NTP", "SNMP",
    "SSDP", "SYN", "TFTP", "UDP", "UDP-Lag", "WebDDoS",
]
# Label for the file at the same position in addr: 0 for the twelve numbered
# (normal) captures, 1..12 for the named attack captures.
tag = [0] * 12 + list(range(1, 13))
pro = ['tcp', 'icmp', 'udp']  # transport-layer protocol types
# Number of graphs written per label (0..13); label 13 is used for IP nodes.
num_label_dict = dict.fromkeys(range(14), 0)

def port_change(row):
    """Collapse a (source port, destination port) pair into one feature value.

    Args:
        row: Two-element sequence of port strings, source port first.

    Returns:
        ``-1`` when the source port is the empty string; otherwise the first
        of the two ports whose numeric value is <= 1024, returned unchanged
        (i.e. still a string); ``1025`` when neither port is <= 1024.

    Raises:
        ValueError: if a port that has to be inspected is not an integer
            string.
    """
    src_port = row[0]
    if src_port == '':
        return -1
    if int(src_port) <= 1024:
        return src_port

    # The second port is only looked at when the first one is above 1024.
    dst_port = row[1]
    if int(dst_port) <= 1024:
        return dst_port
    return 1025


def save_dataset(i, seed):
    """Write the placeholder graph file for IP node ``i`` on first sight.

    An IP that is already present in the global ``ip_dict`` is left alone.
    Otherwise it is assigned the current ``json_num`` as its id, the counter
    is advanced, and a fixed single-node graph with label 13 is dumped to
    ``<seed>/<id>.json`` under the hard-coded output directory.

    Args:
        i: IP address used as the key in ``ip_dict``.
        seed: Random seed of the current run; selects the output sub-directory.
    """
    global json_num

    if i in ip_dict:
        return  # this IP already has an id and a node file

    # Reserve the id before touching the disk, exactly like the counter
    # bookkeeping of the flow graphs.
    node_id = json_num
    ip_dict[i] = node_id
    json_num = node_id + 1

    placeholder_graph = {"edge": [[0, 0]],
                         "features": {"0": 0},
                         "label": 13}
    node_file = "E:/DDoS/DDoS2019/SEAL/" + str(seed) + "/" + str(node_id) + '.json'
    with open(node_file, "w") as out:
        json.dump(placeholder_graph, out)

    num_label_dict[13] += 1


def csv_dataset(csv_num):
    """Group the packets of one capture CSV into bidirectional flows.

    The file ``addr[csv_num] + '.csv'`` is read from the hard-coded capture
    directory.  Every usable packet row is appended to the flow it belongs
    to; a packet travelling in the opposite direction of an already known
    flow is stored under that flow's key instead of opening a new one.

    Flow keys are ``'#'``-joined strings:

    * ``src_ip#proto#dst_ip`` when the row has neither a TCP nor a UDP
      source port,
    * ``src_ip#src_port#proto#dst_ip#dst_port`` otherwise,

    where ``proto`` is the fourth element of the colon-separated protocol
    stack in column 9.

    Args:
        csv_num: Index into the module-level ``addr`` list selecting the file.

    Returns:
        dict mapping each flow key to the list of its packets in file order.
        A packet is ``[signed frame length, row[9], row[20], row[18],
        row[24], port_change(row[5:7])]``; the frame length is negated for
        packets in the direction of the flow key and left positive for
        packets in the reverse direction.

    Raises:
        OSError: if the capture file cannot be opened.
        ValueError: if column 8 (frame length) of a kept row is not an
            integer, or ``port_change`` rejects the UDP ports.
    """
    csv_path = ('E:\\DDoS\\DDoS2019\\0112\\morefeature\\'
                + addr[csv_num] + '.csv')
    # Column map of the capture files:
    # 1: ip_src 2: ip_dst 3: tcp_srcport 4: tcp_dstport 5: udp_srcport
    # 6: udp_dstport 8: frame_len 9: frame_prot (eth:ethernet:ip:tcp:ssh)
    # 10: frame_time 13: icmp.type 16: tcp_ack 19: tcp_len
    # 20: tcp_window_size 22: tcp_flags 23: ip_flags 24: ip_ttl
    min_columns = 25  # the highest column read below is row[24]

    flow_dict = {}

    # The context manager closes the file even when a row raises; newline=''
    # is what the csv module asks for when handing it a file object.
    with open(csv_path, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < min_columns:
                # Blank lines come back as [] and truncated rows cannot be
                # indexed safely; skip them instead of raising IndexError.
                if row:
                    print("short row", row)
                continue
            if row[5] == 'Length':  # header row
                continue
            if not row[1]:  # no source IP
                continue
            if row[3] != '' and not row[3].isdigit():
                continue

            pro_all = row[9].split(":")
            if len(pro_all) < 4:
                print("less than 4", row)
                continue
            proto = pro_all[3]
            src_ip, dst_ip = row[1], row[2]

            # Build the key of this packet's direction and of the opposite
            # direction so both can be looked up in flow_dict.
            if row[3] == '' and row[5] == '':
                five_tuple = '#'.join((src_ip, proto, dst_ip))
                five_tuple_reverse = '#'.join((dst_ip, proto, src_ip))
            elif row[3] == '' and row[5].isdigit():
                five_tuple = '#'.join((src_ip, row[5], proto, dst_ip, row[6]))
                five_tuple_reverse = '#'.join(
                    (dst_ip, row[6], proto, src_ip, row[5]))
            elif row[3].isdigit():
                five_tuple = '#'.join((src_ip, row[3], proto, dst_ip, row[4]))
                five_tuple_reverse = '#'.join(
                    (dst_ip, row[4], proto, src_ip, row[3]))
            else:
                print("other condition", row)
                continue

            # Packet size, protocol stack, window_size, tcp.flags, ip.ttl,
            # UDP port feature.  (The TCP port feature port_change(row[3:5])
            # is intentionally not included.)
            # NOTE(review): the column map above lists tcp_flags at index 22
            # while row[18] is read here -- confirm which one is intended.
            tmp_one_packet = [int(row[8]), row[9], row[20], row[18], row[24],
                              port_change(row[5:7])]

            if five_tuple in flow_dict:
                # Same direction as the flow key: negative length.
                tmp_one_packet[0] *= -1
                flow_dict[five_tuple].append(tmp_one_packet)
            elif five_tuple_reverse in flow_dict:
                # Reverse direction of a known flow: length stays positive.
                flow_dict[five_tuple_reverse].append(tmp_one_packet)
            else:
                # First packet of a new flow defines the key direction.
                tmp_one_packet[0] *= -1
                flow_dict[five_tuple] = [tmp_one_packet]

    return flow_dict


def load_data():
    """Generate the per-seed graph dataset from every capture listed in ``addr``.

    For each seed in ``random_seeds`` the global id counter and IP table are
    reset.  Every capture file is then turned into flows by ``csv_dataset``,
    the flow keys are shuffled with that seed and only a leading sample is
    kept.  Each kept flow is written as one JSON graph file whose nodes are
    its packets, its two endpoint IPs get node files through
    ``save_dataset``, and two rows (source IP node -> flow, flow ->
    destination IP node) are appended to ``edges_DDoS.csv`` of that seed.

    NOTE(review): the per-seed output directory is opened directly and is
    never created here -- presumably it has to exist beforehand; confirm.
    NOTE(review): ``edges_DDoS.csv`` is opened in append mode, so rows from
    an earlier run stay in the file -- confirm this is intended.
    """
    global json_num
    global ip_dict
    random_seeds = [1, 2, 4, 7, 9]

    for seed in random_seeds:
        # Fresh id space and IP table for every seed.
        json_num = 0
        ip_dict = {}
        flow_nums = 0
        for csv_num in range(len(addr)):
            # Get the dict of flows for this capture file.
            print(f"addr: {addr[csv_num]}, json_num: {json_num}")
            flows = csv_dataset(csv_num)
            # Randomly keep a subset of the flows.  (The original note said
            # 20%; the size actually kept is decided by `ratio` below.)
            keys = list(flows.keys())
            random.seed(seed)
            random.shuffle(keys)
            flows_sampled = {}

            ratio = 0.01     # ***** tune the sampled share of the dataset
            # Files with fewer than 15000 / ratio flows keep at most 15000
            # flows; larger files keep int(len(flows) * ratio) flows.
            if len(flows) < (15000 / ratio):
                l = min(len(flows), 15000)
            else:
                l = int(len(flows) * ratio)
            print(f"file_flow_before:{l}")
            for i in range(l):
                flows_sampled[keys[i]] = flows[keys[i]]
            flows = flows_sampled

            for flow_keys, flow_contents in flows.items():
                # Keys have 3 parts (src#proto#dst) or 5 parts
                # (src#sport#proto#dst#dport); pick the two IPs accordingly.
                five_tuple_list = flow_keys.split('#')
                if len(five_tuple_list) == 3:
                    ip_first = five_tuple_list[0]
                    ip_dst = five_tuple_list[2]
                else:
                    ip_first = five_tuple_list[0]
                    ip_dst = five_tuple_list[3]

                # Skip flows whose destination is the attack IP (original
                # note: a flow not initiated by the attack IP is abnormal).
                if ip_dst == ip:
                    continue
                # Do not count flows that involve the attack IP when reading
                # the normal (tag 0) captures.
                if ip_first == ip and tag[csv_num] == 0:
                    continue

                # Create the IP nodes.
                save_dataset(ip_first, seed)
                save_dataset(ip_dst, seed)

                dict_flow = {"edge": [],
                             "features": {},
                             "label": 0}
                # Assign the label: the file's tag when the flow starts at
                # the attack IP, otherwise the default 0.
                if ip_first == ip:
                    dict_flow["label"] = tag[csv_num]
                num_label_dict[dict_flow["label"]] += 1
                # Preparation before building the graph.
                g = nx.Graph()
                node_first = 0  # first packet of the current same-direction run
                node_last = 0   # last packet of the previous run
                columns = 1     # number of same-direction runs seen so far

                for packet_num in range(len(flow_contents)):
                    # Store the packet's feature list under its index.
                    dict_flow["features"][str(packet_num)] = flow_contents[packet_num]

                    # Build the graph: one node per packet.
                    g.add_node(packet_num)
                    if packet_num > 0:  # build the edges of the graph
                        # A positive product of the signed lengths means the
                        # two packets travel in the same direction: chain them.
                        if flow_contents[packet_num][0] * flow_contents[packet_num - 1][0] > 0:
                            g.add_edge(packet_num - 1, packet_num)
                        else:
                            # Direction change: link the new run's first packet
                            # to the first packet of the previous run.
                            g.add_edge(packet_num, node_first)
                            columns += 1
                            # From the third run on, also link the stored
                            # node_last to the packet that closed the
                            # previous run.
                            if columns >= 3:
                                g.add_edge(node_last, packet_num - 1)
                            node_first = packet_num
                            node_last = packet_num - 1
                # Link node_last to the final packet of the flow.
                # NOTE(review): packet_num is only bound if flow_contents is
                # non-empty; csv_dataset appears to always store at least one
                # packet per flow -- confirm.
                if packet_num != 0:
                    g.add_edge(node_last, packet_num)

                # Collect the edges; a graph without edges gets a self-loop
                # placeholder on node 0.
                edges = [list(pair) for pair in g.edges()]
                dict_flow["edge"] = edges
                if dict_flow["edge"] == []: dict_flow["edge"] = [[0, 0]]

                # Save the finished dict as JSON.
                json_file_name = "E:/DDoS/DDoS2019/SEAL/" + str(seed) +"/" + str(json_num) + '.json'
                # (An older local SEAL-CI-master/input/graph_DDoS/ directory
                # was used here previously.)
                with open(json_file_name, 'w') as f:
                    json.dump(dict_flow, f)

                # Write the node connections of the big graph:
                # source IP node -> flow node -> destination IP node.
                big_graph_edge = "E:/DDoS/DDoS2019/SEAL/" + str(seed) +"/edges_DDoS.csv"
                # (An older local SEAL-CI-master/input/edges_DDoS.csv file
                # was used here previously.)
                with open(big_graph_edge, "a+", newline='') as f:
                    f_writer = csv.writer(f)
                    f_writer.writerow([ip_dict[ip_first], json_num])
                    f_writer.writerow([json_num, ip_dict[ip_dst]])
                json_num += 1
                flow_nums += 1
                # After every 300 written flows the IP table is cleared, so
                # save_dataset allocates new ids and files for IPs seen next.
                if flow_nums >= 300:
                    flow_nums = 0
                    ip_dict = {}
        # Printed once per seed; num_label_dict is not reset in this
        # function, so the counts accumulate across seeds.
        print(num_label_dict)
# Run the full dataset generation when the file is executed as a script.
if __name__ == '__main__':
    load_data()