Diffstat (limited to 'DDoS2019.py')
| -rw-r--r-- | DDoS2019.py | 566 |
1 file changed, 566 insertions, 0 deletions
diff --git a/DDoS2019.py b/DDoS2019.py
new file mode 100644
index 0000000..d27d57c
--- /dev/null
+++ b/DDoS2019.py
@@ -0,0 +1,566 @@
+# coding: utf-8
+import networkx as nx
+import numpy as np
+import csv
+import random
+import torch
+import matplotlib.pyplot as plt
+from sklearn.model_selection import StratifiedKFold
+import pickle
+from itertools import islice
+
+# one tag dictionary per feature column
+label_dict = []
+for i in range(20):
+    label_dict.append({})
+
+# addr0 = ["DDoS_test/"]
+addr0 = ['E:/DDoS/DDoS2019/0112/morefeature']  # root directory of the CSV files
+# addr0 = ["muti0311\\csv\\", "0112\\multi\\csv\\"]
+addr1 = []
+num_class = 13
+# addr0311 = ["1","2","3","4","5","6","7","LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
+# addr0311 = ["LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
+addr0112 = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "DNS", "LDAP", "MSSQL", "NetBIOS", "NTP", "SNMP", "SSDP", "SYN", "TFTP", "UDP", "UDP-Lag", "WebDDoS"]
+# addr0112 = ["NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
+
+# addr1.append(addr0311)
+addr1.append(addr0112)
+
+
+class S2VGraph(object):
+    def __init__(self, g, label, node_tags=None, node_features=None):
+        '''
+        g: a networkx graph
+        label: an integer graph label
+        node_tags: a list of integer node tags
+        node_features: a torch float tensor, the one-hot representation of the tags, used as input to the neural nets
+        edge_mat: a torch long tensor containing the edge list; used to create a torch sparse tensor
+        neighbors: list of neighbors (without self-loops)
+        '''
+        self.label = label
+        self.g = g
+        self.node_tags = node_tags
+        self.neighbors = []
+        self.node_features = 0
+        self.edge_mat = 0
+
+        self.max_neighbor = 0
+
+
+def csv_dataset(addr, ip_dst):
+    # addr = "/home/liyuzhen/dataset/" + addr + ".csv"
+    read = csv.reader(open(addr, 'r'))
+    len_file = len(open(addr).readlines())
+    print("addr: %s: %s" % (addr, len_file))
+    ip_DDoS = ["172.16.0.5"]
+    ip_DDoS.append(ip_dst)
+    ip = {}
+    num = 0
+    flows_norm = []
+    flows_DDoS = []
+    for row in islice(read, 1, None):
+        # if num > 100000:
+        #     break
+        num = num + 1
+        tmp_one_packet = []
+        if (row[1] == '') or (row[2] == ''):
+            continue
+        a = row[1] + ' ' + row[2]
+        b = row[2] + ' ' + row[1]
+        if (a not in ip) and (b not in ip):
+            ip[a] = len(ip)
+            flows_norm.append([])
+        try:
+            if row[8] == '':
+                continue
+            else:
+                tmp_one_packet.append(int(row[8]))  # packet size (feature 0)
+            tmp_one_packet.append(row[9])  # protocol (feature 1)
+            if row[5] != '':  # well-known port (feature 2)
+                if int(row[5]) <= 1024:
+                    tmp_one_packet.append(int(row[5]))
+                elif int(row[6]) <= 1024:
+                    tmp_one_packet.append(int(row[6]))
+                else:
+                    tmp_one_packet.append(1025)
+                    # print('no port smaller than 1024:%s,%d,%s' % (addr, num, row))
+            else:
+                tmp_one_packet.append(-1)
+
+            if row[19] != '':
+                tmp_one_packet.append(int(row[19]))  # tcp.window_size (feature 3)
+            else:
+                tmp_one_packet.append(-1)
+
+            if row[21] != '':
+                tmp_one_packet.append(row[21])  # tcp.flags (feature 4)
+            else:
+                tmp_one_packet.append(-1)
+
+            if row[23] != '':
+                tmp_one_packet.append(row[23])  # ip.ttl (feature 5)
+            else:
+                tmp_one_packet.append(-1)
+        except Exception as e:
+            print(f'error in {addr}, row {num}: {e.args}')
+            continue
+
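+        # At this point tmp_one_packet holds the six per-packet features used
+        # downstream by cons_graph and graph_add_features:
+        #   [0] packet size (the sign is flipped below to encode direction)
+        #   [1] protocol   [2] well-known port (<=1024, else 1025; -1 if absent)
+        #   [3] tcp.window_size   [4] tcp.flags   [5] ip.ttl
+        # e.g. a DNS packet might yield [98, 'DNS', 53, -1, -1, '64']
+        # (illustrative values only, not taken from the dataset).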
+        '''
+        if row[3] != '':  # alternative port feature
+            if int(row[3]) < 0:
+                print('port smaller than 0:%s,%d,%s' % (addr, num, row))
+            if int(row[3]) <= 1024:
+                tmp_one_packet.append(int(row[3]))
+            elif int(row[4]) <= 1024:
+                tmp_one_packet.append(int(row[4]))
+            else:
+                tmp_one_packet.append(1025)
+        else:
+            tmp_one_packet.append(-1)
+
+        if row[7] != '':
+            tmp_one_packet.append(row[7])  # frame.encap_type
+        else:
+            tmp_one_packet.append(-1)  # frame.encap_type
+
+        tmp_one_packet.append(float(row[10]))  # time
+        if row[10] != '':
+            tmp_one_packet.append(float(row[10]))  # http.time
+        else:
+            tmp_one_packet.append(-1)
+        if row[11] != '':
+            tmp_one_packet.append(int(row[11]))  # icmp.len
+        else:
+            tmp_one_packet.append(-1)
+        if row[12] != '':
+            tmp_one_packet.append(row[12])  # icmp.type
+        else:
+            tmp_one_packet.append(-1)
+        if row[13] != '':
+            tmp_one_packet.append(row[13])  # irc.request
+        else:
+            tmp_one_packet.append(-1)
+        if row[14] != '':
+            tmp_one_packet.append(row[14])  # irc.response
+        else:
+            tmp_one_packet.append(-1)
+        if row[15] != '':
+            if row[15] == '0':
+                tmp_one_packet.append(0)  # tcp.ack
+            else:
+                tmp_one_packet.append(1)
+        else:
+            tmp_one_packet.append(-1)
+        if row[16] != '':
+            tmp_one_packet.append(row[16])  # tcp.ack_rtt
+        else:
+            tmp_one_packet.append(-1)
+
+        if row[18] != '':
+            tmp_one_packet.append(int(row[18]))  # tcp.len
+        else:
+            tmp_one_packet.append(-1)
+
+        if row[20] != '':
+            tmp_one_packet.append(int(row[20]))  # udp.length
+        else:
+            tmp_one_packet.append(-1)
+
+        if row[22] != '':
+            tmp_one_packet.append(row[22])  # ip.flags
+        else:
+            tmp_one_packet.append(-1)
+        '''
+
+        if a in ip:
+            tmp_one_packet[0] *= -1  # negative size marks the a -> b direction
+            flows_norm[ip[a]].append(tmp_one_packet)
+        elif (a not in ip) and (b not in ip):
+            print("error: in csv_dataset")
+        elif b in ip:
+            flows_norm[ip[b]].append(tmp_one_packet)
+    # pull the attacker <-> victim flow out of the benign flows
+    a = ip_DDoS[0] + ' ' + ip_DDoS[1]
+    b = ip_DDoS[1] + ' ' + ip_DDoS[0]
+    if a in ip:
+        flows_DDoS.append(flows_norm[ip[a]])
+        del flows_norm[ip[a]]
+    elif b in ip:
+        flows_DDoS.append(flows_norm[ip[b]])
+        del flows_norm[ip[b]]
+
+    flows_DDoS_dict = {}
+    len_DDoS = 0
+    len_norm = 0
+    for flows in flows_norm:
+        len_norm += len(flows)
+    for flows in flows_DDoS:
+        len1 = len(flows)
+        len_DDoS += len1
+        # split the attack flow into segments of at most 300000 packets
+        k = int(len1 / 300000)
+        for i in range(k):
+            flows_DDoS_dict[i] = flows[i*300000: (i+1)*300000]
+        if k * 300000 < len1:
+            flows_DDoS_dict[k] = flows[(k*300000):]
+    # print(flows_DDoS_dict)
+    print('len_DDoS: %d' % len_DDoS)
+    print('len_norm: %d' % len_norm)
+    return flows_DDoS_dict, flows_norm, len_DDoS, len_norm
+
+    # return flows_DDoS, flows_norm
+
+
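+# Shapes returned by csv_dataset, sketched for orientation (hedged; the
+# values are illustrative):
+#   flows_DDoS_dict: {0: [pkt, ...], 1: [pkt, ...]}   attack flow, in
+#                    segments of at most 300000 packets
+#   flows_norm:      [[pkt, ...], [pkt, ...], ...]    one list per benign flow
+# where each pkt is the six-feature list built above.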
+def cons_graph(all_traffic, num_node, tags, len_file, rseed):
+    g_list = []
+    # print("len all_traffic: %s" % len(all_traffic))
+    num_graph = int(len(all_traffic) / num_node)
+    # print("num_graph: %s" % num_graph)
+    last_graph = len(all_traffic) - num_node * num_graph  # leftover packets, currently unused
+    for y in range(num_graph):
+        g = nx.Graph()
+        node_first = 0
+        node_last = 0
+        node_tags = []
+        # node_features = []
+        a = 0
+        for z in range(num_node):
+            traffic = num_node * y + z
+            # if traffic != 0:
+            #     all_traffic[traffic][5] = float(all_traffic[traffic-1][5]) - float(all_traffic[traffic][5])
+            g.add_node(z)
+            node_tags.append(all_traffic[traffic])
+            '''
+            node_tags.append([])
+            for i in range(len(all_traffic[traffic])):
+                if all_traffic[traffic][i] not in label_dict[i].keys():
+                    label_dict[i][all_traffic[traffic][i]] = len(label_dict[i])
+                node_tags[z].append(label_dict[i][all_traffic[traffic][i]])
+            '''
+            # node_features.append(all_traffic[traffic][0])
+            if z > 0:  # build the graph edges
+                if all_traffic[traffic][0] * all_traffic[traffic-1][0] > 0:
+                    # same direction as the previous packet: extend the chain
+                    g.add_edge(z-1, z)
+                    # nx.draw(g, with_labels=True, pos=nx.circular_layout(g))
+                    # plt.show()
+                else:
+                    # direction change: link back to the start of the current run
+                    g.add_edge(z, node_first)
+                    a = a + 1
+                    if a >= 2:
+                        g.add_edge(node_last, z-1)
+                        # nx.draw(g, with_labels=True, edge_color='b', pos=nx.circular_layout(g))
+                    node_first = z
+                    node_last = z-1
+                    g.add_edge(z, node_last)
+        g_list.append(S2VGraph(g, tags, node_tags))
+    '''
+    The code below balances the training set and samples the test set
+    proportionally, but in practice the training and test accuracies
+    diverged considerably.
+    '''
+    random.seed(rseed)
+    random.shuffle(g_list)
+    len_test = int(len(g_list) * 0.02)
+    if (len_test < 30) and tags != 0:
+        len_test = 30
+    if (len_test < 1) and tags == 0:
+        len_test = 1
+
+    g_test = g_list[0: len_test]
+
+    dataset_num = 20000
+    if (tags != 0) and (len_file >= (30 * dataset_num + len_test * 30)):
+        len_train = len_test + int((len(all_traffic) / float(len_file)) * dataset_num)
+        g_train = g_list[len_test: len_train]
+    else:
+        g_train = g_list[len_test: dataset_num + len_test]
+    print(f"cons graph sum: {len(g_list)}")
+    print(f"cons graph train: {len(g_train)}")
+    print(f"cons graph test: {len(g_test)}")
+    '''
+    if (tags != 0) and (len(g_list) >= (10000 + len_test)):
+        len_train = len_test + 10000
+        g_train = g_list[len_test:len_train]
+    else:
+        g_train = g_list[len_test:len(g_list)]
+    '''
+    del g_list
+    return g_test, g_train
+
+    # return g_list
+
+
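+# The edge rule in cons_graph, sketched: consecutive same-direction packets
+# (same sign of the signed packet size) chain together with edge z-1 -- z; on
+# a direction change the new node links back to the first node of the run it
+# interrupts, and from the second change onward the previous run is also
+# closed via its last node. Duplicate edges are no-ops in networkx.Graph, so
+# re-adding an edge is harmless.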
+def load_data(degree_as_tag, para):
+    '''
+    degree_as_tag: if True, use node degrees as tags instead of packet features
+    para: reserved parameter dict (e.g. para['num_node']); currently unused
+    Builds graphs from the CSV files and pickles one train/test split per random seed.
+    '''
+
+    print('loading data')
+
+    all_csv_dict = {}
+    # tag1 = [0,0,0,0,0,0,0,"LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
+    # tag1 = [0,0,0,0,0,0,0,1,2,3,4,5,6,7]
+    # tag1 = [1,2,3,4,5,6,7]
+    # tag2 = [0,0,0,0,0,0,0,0,0,0,0,"DNS","LDAP","MSSQL","NetBIOS","NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
+    tag2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+    tag = []
+    # tag.append(tag1)
+    tag.append(tag2)
+    # ip_dst = ["192.168.50.4"]
+    ip_dst = ["192.168.50.1"]
+    # ip_dst = ["192.168.50.4", "192.168.50.1"]
+    tag_flows = {}
+    class_flow = 13
+    # num_node = para['num_node']
+    num_node = 30
+    random_seeds = [2, 4, 1, 7, 9]
+    i = 0  # index of the root address in addr0
+    for random_seed in random_seeds:
+        for j in range(len(addr1[i])):
+            g_tralist = []
+            g_telist = []
+            # graph_path = "/home/liyuzhen/dataset/" + addr0[i] + '/' + 'graph_lessprotocol/' + addr1[i][j] + '.pkl'
+            tra_path = "E:/DDoS/DDoS2019/pkl/" + str(random_seed) + "/" + 'train/' + addr1[i][j] + '.pkl'
+            te_path = "E:/DDoS/DDoS2019/pkl/" + str(random_seed) + "/" + 'test/' + addr1[i][j] + '.pkl'
+            train = open(tra_path, 'wb')
+            test = open(te_path, 'wb')
+            addr = addr0[i] + '/' + addr1[i][j] + '.csv'
+            flows_DDoS, flows_norm, le_DDoS, le_norm = csv_dataset(addr, ip_dst[i])
+            # build the graphs once per loaded file
+            if tag[i][j] != 0:
+                for k in flows_DDoS.values():
+                    # print("flow DDoS %s :%s" % (addr1[i][j], len(k)))
+                    g_test, g_train = cons_graph(k, num_node, tag[i][j], le_DDoS, random_seed)
+                    g_tralist.extend(g_train)
+                    g_telist.extend(g_test)
+                    # print("graph train DDoS %s :%s" % (addr1[i][j], len(g_tralist)))
+                    # print("graph test DDoS %s :%s" % (addr1[i][j], len(g_telist)))
+            else:
+                for m in flows_norm:
+                    # print("flow norm %s :%s" % (addr1[i][j], len(m)))
+                    g_test, g_train = cons_graph(m, num_node, 0, le_norm, random_seed)
+                    g_tralist.extend(g_train)
+                    g_telist.extend(g_test)
+                    # print("graph train norm %s :%s" % (addr1[i][j], len(g_tralist)))
+                    # print("graph test norm %s :%s" % (addr1[i][j], len(g_telist)))
+            '''
+            li = cons_graph(m, num_node, 0, len_file)
+            if li is not None:
+                g_Nlist.extend(li)
+            '''
+            '''
+            random.seed(1)
+            random.shuffle(g_Dlist)
+            random.shuffle(g_Nlist)
+            g_telist = g_Dlist[0:int(len(g_Dlist)*0.01)]
+            g_telist.extend(g_Nlist[0:int(len(g_Nlist)*0.01)])
+            g_tralist = g_Nlist[int(len(g_Nlist)*0.01):]
+            if len(g_tralist) <= 10000:
+                g_tralist.extend(g_Dlist[int(len(g_Dlist)*0.01):])
+            else:
+                g_tralist.extend(g_Dlist[int(len(g_Dlist)*0.01):int(len(g_Dlist)*0.01)+10000])
+            print(len(g_telist))
+            print(len(g_tralist))
+            # pickle.dump(g_list, graph)
+            '''
+            print(f"{tra_path}: {len(g_tralist)}")
+            print(f"{te_path}: {len(g_telist)}")
+            pickle.dump(g_tralist, train)
+            pickle.dump(g_telist, test)
+            train.close()
+            test.close()
+    # print(label_dict)
+
+
+def load_data2(degree_as_tag, feature, path):
+    tagset = []
+    g_train = []
+    g_test = []
+    # m = [100,100,100,100,100,100,100,100,100,100,600,100,100,600]
+    # m = [200,200,200,200,200,100,0,0,0,100,0,0,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,813,244]
+    for i in range(len(addr0)):
+        for j in range(len(addr1[i])):
+            graph_path = "E:/DDoS/DDoS2019/0.01#10000pkl/" + str(path) + '/train/' + addr1[i][j] + '.pkl'
+            test_path = "E:/DDoS/DDoS2019/0.01#10000pkl/" + str(path) + '/test/' + addr1[i][j] + '.pkl'
+            graph = open(graph_path, "rb")
+            graph_test = open(test_path, "rb")
+            g_train.extend(pickle.load(graph))
+            g_test.extend(pickle.load(graph_test))
+            # print("number of graphs in this file: %d" % len(st))
+            # random.seed(1)
+            # st = random.sample(st, m[j])
+            # st = random.sample(st, int(len(st)*0.1))
+            graph.close()
+            graph_test.close()
+    a = []
+    feature = feature.split(",")
+    for i in range(len(feature)):
+        if feature[i] != '':  # skip empty tokens, e.g. from a trailing comma
+            a.append(int(feature[i]))
+            print(f"feature: {feature[i]}")
+            tagset.append({})
+    feature = a
+    tagset, g_train = graph_add_features(g_train, degree_as_tag, feature, tagset)
+    tagset, g_test = graph_add_features(g_test, degree_as_tag, feature, tagset)
+    # g_train = one_hot(g_train, 'train', tagset)
+    # g_test = one_hot(g_test, 'test', tagset)
+    # g_train = feature_bin(g_train, tagset)
+    # g_test = feature_bin(g_test, tagset)
+    g_train = block_bin(g_train, tagset)
+    g_test = block_bin(g_test, tagset)
+    print('# node tag dictionaries: %d' % len(tagset))
+    print("# train data: %d" % len(g_train))
+    print('# test data: %d' % len(g_test))
+    return num_class, g_train, g_test
+
+
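+# The `feature` argument of load_data2 is a comma-separated index string,
+# e.g. "0,1,2,3,4,5" would select all six per-packet features; one tag
+# dictionary is allocated per selected column and block_bin later
+# binary-encodes each column into its own block of bits (the example string
+# is illustrative, not from the original).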
+def graph_add_features(g_list, degree_as_tag, feature, tagset):
+    for g in g_list:
+        g.neighbors = [[] for i in range(len(g.g))]
+        for i, j in g.g.edges():
+            g.neighbors[i].append(j)
+            g.neighbors[j].append(i)
+        degree_list = []
+        for i in range(len(g.g)):
+            degree_list.append(len(g.neighbors[i]))
+        g.max_neighbor = max(degree_list)
+
+        # g.label = label_dict[g.label]
+
+        # expand to directed pairs: each undirected edge appears in both orders
+        edges = [list(pair) for pair in g.g.edges()]
+        edges.extend([[i, j] for j, i in edges])
+        g.edge_mat = torch.LongTensor(edges).transpose(0, 1)
+    if degree_as_tag:
+        for g in g_list:
+            g.node_tags = list(dict(g.g.degree).values())
+
+    # extract the unique tag labels, one dictionary per selected feature
+    for g in g_list:
+        for j in range(len(g.node_tags)):
+            if 0 in feature:
+                g.node_tags[j][0] = int(g.node_tags[j][0] / 100) * 100  # bucket packet sizes by 100
+            if 5 in feature:
+                g.node_tags[j][5] = int(g.node_tags[j][5] * 100)
+            m = []
+            for i in range(len(feature)):
+                if g.node_tags[j][feature[i]] not in tagset[i].keys():
+                    tagset[i][g.node_tags[j][feature[i]]] = len(tagset[i])
+                m.append(tagset[i][g.node_tags[j][feature[i]]])
+            g.node_tags[j] = m
+    return tagset, g_list
+
+
+def feature_bin(g_list, tagset):
+    # binary-encode each node tag, right-aligned in ceil(log2) bits
+    l = len(bin(len(tagset)).replace('0b', ''))
+    for g in g_list:
+        g.node_features = torch.zeros(len(g.node_tags), l)
+        for i in range(len(g.node_tags)):
+            a = bin(g.node_tags[i]).replace('0b', '')
+            for j in range(len(a)):
+                if a[j] == '1':
+                    g.node_features[i, l - len(a) + j] = 1
+    return g_list
+
+
+def block_bin(g_list, tagset):
+    # width (in bits) of each feature block, plus the total width m
+    l = []
+    m = 0
+    for i in range(len(tagset)):
+        l.append(len(bin(len(tagset[i])).replace('0b', '')))
+        m += l[i]
+    # starting bit offset of each block (cumulative sum of earlier widths)
+    offset = [0]
+    for i in range(1, len(l)):
+        offset.append(offset[i-1] + l[i-1])
+    for g in g_list:
+        g.node_features = torch.zeros(len(g.node_tags), m)
+        for j in range(len(g.node_tags)):
+            for i in range(len(tagset)):
+                a = bin(g.node_tags[j][i]).replace('0b', '')
+                for k in range(len(a)):
+                    if a[k] == '1':
+                        # right-align each code inside its own block
+                        g.node_features[j, offset[i] + l[i] - len(a) + k] = 1
+    return g_list
+
+
+def one_hot(g_list, ty, tagset):
+    num_every = {}
+    for i in range(num_class):
+        num_every[i] = 0
+    for g in g_list:
+        num_every[g.label] += 1
+        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
+        g.node_features[range(len(g.node_tags)), [tagset[tag] for tag in g.node_tags]] = 1
+    for i in range(num_class):
+        print("%s: class %d: %d" % (ty, i, num_every[i]))
+    return g_list
+
+
+def separate_data(graph_list, seed, fold_idx):
+    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
+    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
+    labels = [graph.label for graph in graph_list]
+    idx_list = []
+    for idx in skf.split(np.zeros(len(labels)), labels):
+        idx_list.append(idx)
+    train_idx, test_idx = idx_list[fold_idx]
+    print(f"train_num: {len(train_idx)} \n test_num: {len(test_idx)}")
+
+    train_graph_list = [graph_list[i] for i in train_idx]
+    test_graph_list = [graph_list[i] for i in test_idx]
+
+    return train_graph_list, test_graph_list
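+
+# Minimal usage sketch (an assumption about the intended pipeline; the file
+# defines no entry point of its own). Note that load_data writes pickles
+# under pkl/<seed>/ while load_data2 reads from 0.01#10000pkl/<path>/, so the
+# files may need to be moved or the paths aligned first. The feature string,
+# path value and fold index are illustrative, not taken from the original:
+# if __name__ == '__main__':
+#     load_data(degree_as_tag=False, para=None)   # writes per-seed train/test .pkl files
+#     n_cls, g_train, g_test = load_data2(False, "0,1,2,3,4,5", path=2)
+#     tr_fold, te_fold = separate_data(g_train, seed=0, fold_idx=0)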
