Diffstat (limited to 'DDoS2019.py')
-rw-r--r--  DDoS2019.py  566
1 file changed, 566 insertions, 0 deletions
diff --git a/DDoS2019.py b/DDoS2019.py
new file mode 100644
index 0000000..d27d57c
--- /dev/null
+++ b/DDoS2019.py
@@ -0,0 +1,566 @@
+#coding:utf-8
+import networkx as nx
+import numpy as np
+import csv
+import random
+import torch
+import matplotlib.pyplot as plt
+from sklearn.model_selection import StratifiedKFold
+import pickle
+from itertools import islice
+# one dict per feature column, mapping raw feature values to integer ids
+label_dict = [{} for _ in range(20)]
+
+# addr0 = ["DDoS_test/"]
+addr0 = ['E:/DDoS/DDoS2019/0112/morefeature']  # root data directory
+# addr0 = ["muti0311\\csv\\", "0112\\multi\\csv\\"]
+addr1 = []
+num_class = 13
+#addr0311=["1","2","3","4","5","6","7","LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
+#addr0311=["LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
+addr0112=["1","2","3","4","5","6","7","8","9","10","11","12","DNS","LDAP","MSSQL","NetBIOS","NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
+#addr0112=["NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
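+# File names of the 01-12 day CSVs: the numeric files are mapped to label 0
+# (normal traffic) by tag2 below, the named ones to attack classes 1-12.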
+
+#addr1.append(addr0311)
+addr1.append(addr0112)
+
+class S2VGraph(object):
+ def __init__(self, g, label, node_tags=None, node_features=None):
+ '''
+ g: a networkx graph
+ label: an integer graph label
+ node_tags: a list of integer node tags
+ node_features: a torch float tensor, one-hot representation of the tag that is used as input to neural nets
+        edge_mat: a torch long tensor containing the edge list, used to create a torch sparse tensor
+ neighbors: list of neighbors (without self-loop)
+ '''
+ self.label = label
+ self.g = g
+ self.node_tags = node_tags
+ self.neighbors = []
+        self.node_features = 0  # filled in later (block_bin / one_hot)
+        self.edge_mat = 0       # filled in later by graph_add_features
+
+ self.max_neighbor = 0
+
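+# csv_dataset reads one exported CSV, groups packets into bidirectional flows
+# keyed by the unordered "src dst" IP pair, and separates the flow involving
+# the attack endpoints from normal traffic. Returns a dict of chunked attack
+# flows, the list of normal flows, and both packet counts.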
+def csv_dataset(addr, ip_dst):
+    # addr = "/home/liyuzhen/dataset/" + addr + ".csv"
+    read = csv.reader(open(addr, 'r'))
+    len_file = len(open(addr).readlines())
+    print("addr: %s: %s" % (addr, len_file))
+ ip_DDoS = ["172.16.0.5"]
+ ip_DDoS.append(ip_dst)
+ ip = {}
+ num = 0
+ flows_norm = []
+ flows_DDoS = []
+ for row in islice(read, 1, None):
+ # if(num>100000):
+ # break
+ num = num + 1
+ tmp_one_packet = []
+ if (row[1] == '') or (row[2] == ''):
+ continue
+ a = row[1] + ' ' + row[2]
+ b = row[2] + ' ' + row[1]
+ if (a not in ip) and (b not in ip):
+ ip[a] = len(ip)
+ flows_norm.append([])
+ try:
+ if row[8] == '':
+ continue
+ else:
+                tmp_one_packet.append(int(row[8]))  # packet size 0 0
+            tmp_one_packet.append(row[9])  # protocol 1 1
+ if row[5] != '': # 3 2
+ if int(row[5]) <= 1024:
+ tmp_one_packet.append(int(row[5]))
+ # print(row[7])
+ elif int(row[6]) <= 1024:
+ tmp_one_packet.append(int(row[6]))
+ # print(row[8])
+ else:
+ tmp_one_packet.append(1025)
+ # print('no port smaller than 1024:%s,%d,%s'%(addr,num,row))
+ else:
+ tmp_one_packet.append(-1)
+
+ if row[19] != '':
+ tmp_one_packet.append(int(row[19])) # tcp.window_size15 3
+ else:
+ tmp_one_packet.append(-1)
+
+ if row[21] != '':
+ tmp_one_packet.append(row[21]) # tcp.flags17 4
+ else:
+ tmp_one_packet.append(-1)
+
+ if row[23] != '':
+ tmp_one_packet.append(row[23]) # ip.ttl19 5
+ else:
+ tmp_one_packet.append(-1)
+        except Exception as e:
+            print(f'error in {addr}, row {num}, error: {e.args}')
+            continue
+
+ '''
+ if row[3] != '': # 2
+ if int(row[3]) < 0:
+ print('port smaller than 0:%s,%d,%s' % (addr, num, row))
+ if int(row[3]) <= 1024:
+ tmp_one_packet.append(int(row[3]))
+ elif int(row[4]) <= 1024:
+ tmp_one_packet.append(int(row[4]))
+ else:
+ tmp_one_packet.append(1025)
+ else:
+ tmp_one_packet.append(-1)
+
+
+
+ if row[7] != '':
+ tmp_one_packet.append(row[7]) # frame.encap_type4
+ else:
+ tmp_one_packet.append(-1) # frame.encap_type
+
+        tmp_one_packet.append(float(row[10]))  # time 5
+        if row[10] != '':
+            tmp_one_packet.append(float(row[10]))  # http.time 6
+ else:
+ tmp_one_packet.append(-1)
+ if row[11] != '':
+ tmp_one_packet.append(int(row[11])) # icmp.len7
+ else:
+ tmp_one_packet.append(-1)
+ if row[12] != '':
+ tmp_one_packet.append(row[12]) # icmp.type8
+ else:
+ tmp_one_packet.append(-1)
+ if row[13] != '':
+ tmp_one_packet.append(row[13]) # irc.request9
+ else:
+ tmp_one_packet.append(-1)
+ if row[14] != '':
+ tmp_one_packet.append(row[14]) # irc.response10
+ else:
+ tmp_one_packet.append(-1)
+ if row[15] != '':
+ if row[15] == 0:
+ tmp_one_packet.append(0) # tcp.ack11
+ else:
+ tmp_one_packet.append(1)
+ else:
+ tmp_one_packet.append(-1)
+ if row[16] != '':
+ tmp_one_packet.append(row[16]) # tcp.ack_rtt12
+ else:
+ tmp_one_packet.append(-1)
+
+ if row[18] != '':
+ tmp_one_packet.append(int(row[18])) # tcp.len14
+ else:
+ tmp_one_packet.append(-1)
+
+ if row[20]!='':
+ tmp_one_packet.append(int(row[20])) # udp.length16
+ else:
+ tmp_one_packet.append(-1)
+
+ if row[22] != '':
+ tmp_one_packet.append(row[22]) # ip.flags18
+ else:
+ tmp_one_packet.append(-1)
+ '''
+
+        # the sign of the packet-size feature encodes direction within the flow
+        if a in ip:
+            tmp_one_packet[0] *= -1
+            flows_norm[ip[a]].append(tmp_one_packet)
+ elif (a not in ip) and (b not in ip):
+ print("error: in csv_dataset")
+ elif b in ip:
+ flows_norm[ip[b]].append(tmp_one_packet)
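+    # pull out the flow between the attack endpoints (172.16.0.5 <-> ip_dst);
+    # everything left in flows_norm is treated as normal traffic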
+ a = ip_DDoS[0] + ' ' + ip_DDoS[1]
+ b = ip_DDoS[1] + ' ' + ip_DDoS[0]
+ if a in ip:
+ flows_DDoS.append(flows_norm[ip[a]])
+ del flows_norm[ip[a]]
+ elif b in ip:
+ flows_DDoS.append(flows_norm[ip[b]])
+ del flows_norm[ip[b]]
+
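+    # attack flows are very long, so slice each into chunks of at most 300,000
+    # packets; each chunk is later converted into graphs independently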
+ flows_DDoS_dict = {}
+ len_DDoS = 0
+ len_norm = 0
+ for flows in flows_norm:
+ len_norm += len(flows)
+ for flows in flows_DDoS:
+ len1 = len(flows)
+ len_DDoS += len1
+ k = int(len1 / 300000)
+ for i in range(k):
+ flows_DDoS_dict[i] = flows[i*300000: (i+1) * 300000]
+ if k * 300000 < len1:
+ flows_DDoS_dict[k] = flows[(k*300000):]
+ # print(flows_DDoS_dict)
+ print('len_DDoS: %d' % len_DDoS)
+ print('len_norm: %d' % len_norm)
+ return flows_DDoS_dict, flows_norm, len_DDoS, len_norm
+
+ # return flows_DDoS,flows_norm
+
+
+def cons_graph(all_traffic, num_node, tags, len_file, rseed):
+ g_list = []
+ # print("len all_traffic: %s" %len(all_traffic))
+ num_graph = int(len(all_traffic) / num_node)
+ # print("num_graph: %s" %num_graph)
+ # print(num_graph)
+ last_graph = len(all_traffic) - num_node*num_graph
+ for y in range(num_graph):
+ g = nx.Graph()
+ node_first = 0
+ node_last = 0
+ node_tags = []
+ # node_features = []
+ a = 0
+ for z in range(num_node):
+ traffic = num_node * y + z
+ # if traffic != 0:
+ # all_traffic[traffic][5] = float(all_traffic[traffic-1][5]) - float(all_traffic[traffic][5])
+ g.add_node(z)
+ node_tags.append(all_traffic[traffic])
+ '''
+ node_tags.append([])
+ #print(f"len(all_traffic[traffic]): {len(all_traffic[traffic])}")
+ for i in range(len(all_traffic[traffic])):
+            if all_traffic[traffic][i] not in label_dict[i].keys():
+                label_dict[i][all_traffic[traffic][i]] = len(label_dict[i])
+            # print(label_dict)
+            # print("label dict: %s" % label_dict)
+            node_tags[z].append(label_dict[i][all_traffic[traffic][i]])
+ '''
+ # node_features.append(all_traffic[traffic][0])
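+            # Edge construction: consecutive packets travelling in the same
+            # direction (same sign of the size feature) are chained with an
+            # edge (z-1, z); on a direction flip the new node is linked back to
+            # the head of the current run (node_first), and run boundaries are
+            # additionally tied together via node_last.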
+            if z > 0:  # construct the graph's edges
+                if all_traffic[traffic][0] * all_traffic[traffic-1][0] > 0:
+                    # print(1)
+                    g.add_edge(z-1, z)
+                    # nx.draw(g, with_labels=True, pos=nx.circular_layout(g))
+                    # plt.show()
+                else:
+                    # print(2)
+                    g.add_edge(z, node_first)
+                    a = a + 1
+                    # plt.show()
+
+                    if a >= 2:
+                        g.add_edge(node_last, z-1)
+                        # nx.draw(g, with_labels=True, edge_color='b', pos=nx.circular_layout(g))
+                    node_first = z
+                    node_last = z - 1
+                    g.add_edge(z, node_last)
+ g_list.append(S2VGraph(g, tags, node_tags))
+    '''
+    The following balances the training set and samples the test set
+    proportionally; in practice, though, the training and test accuracies
+    diverge considerably.
+    '''
+ random.seed(rseed)
+ random.shuffle(g_list)
+ len_test = int(len(g_list) * 0.02)
+ if (len_test < 30) and tags != 0:
+ len_test = 30
+ if (len_test < 1) and tags == 0:
+ len_test = 1
+
+ g_test = g_list[0: len_test]
+
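+    # cap the training contribution of this chunk at dataset_num graphs, scaled
+    # by the chunk's share of the whole source file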
+ dataset_num = 20000
+ if (tags != 0) and (len_file >= (30 * dataset_num + len_test * 30)):
+ len_train = len_test + int((len(all_traffic) / float(len_file)) * dataset_num)
+ g_train = g_list[len_test: len_train]
+ else:
+ g_train = g_list[len_test: dataset_num + len_test]
+ print(f"cons graph sum: {len(g_list)}")
+ print(f"cons graph train: {len(g_train)}")
+ print(f"cons graph test: {len(g_test)}")
+ '''
+ if(tags!=0) and (len(g_list) >= (10000+len_test)):
+ len_train=len_test+10000
+ g_train=g_list[len_test:len_train]
+ else:
+ g_train=g_list[len_test:len(g_list)]
+ '''
+ del g_list
+ return g_test, g_train
+
+ #return g_list
+
+def load_data(degree_as_tag, para):
+    '''
+    degree_as_tag: whether node degrees should be used as tags (unused here,
+                   kept for interface compatibility)
+    para: parameter dict; num_node would normally come from it but is
+          currently hard-coded to 30 below
+    '''
+
+ print('loading data')
+ # g_list = []
+
+ all_csv_dict = {}
+ # tag1=[0,0,0,0,0,0,0,"LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
+ # tag1=[0,0,0,0,0,0,0,1,2,3,4,5,6,7]
+ # tag1=[1,2,3,4,5,6,7]
+ # tag2=[0,0,0,0,0,0,0,0,0,0,0,"DNS","LDAP","MSSQL","NetBIOS","NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
+ tag2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ tag = []
+ # tag.append(tag1)
+ tag.append(tag2)
+ # ip_dst = ["192.168.50.4"]
+ ip_dst = ["192.168.50.1"]
+ # ip_dst = ["192.168.50.4","192.168.50.1"]
+ tag_flows = {}
+ class_flow = 13
+ # num_node = para['num_node']
+ num_node = 30
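+    # each graph is built from num_node consecutive packets of a single flow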
+ random_seeds = [2, 4, 1, 7, 9]
+    i = 0  # index of the root data directory (addr0 / addr1)
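+    # each random seed produces an independent shuffled split, written to its
+    # own pkl/<seed>/train and pkl/<seed>/test directories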
+ for random_seed in random_seeds:
+ for j in range(len(addr1[i])):
+ # g_list = []
+ g_tralist = []
+ g_telist = []
+ # graph_path = "/home/liyuzhen/dataset/" + addr0[i] + '/' + 'graph_lessprotocol/'+ addr1[i][j] + '.pkl'
+ tra_path = "E:/DDoS/DDoS2019/pkl/" + str(random_seed) + "/" + 'train/' + addr1[i][j] + '.pkl'
+ te_path = "E:/DDoS/DDoS2019/pkl/" + str(random_seed) + "/" + 'test/' + addr1[i][j] + '.pkl'
+ # graph = open(graph_path,"wb")
+ train = open(tra_path, 'wb')
+ test = open(te_path, 'wb')
+ addr = addr0[i] + '/' + addr1[i][j] + '.csv'
+ flows_DDoS, flows_norm, le_DDoS, le_norm = csv_dataset(addr, ip_dst[i])
+            # build graphs once per loaded CSV file
+            if tag[i][j] != 0:
+ for k in flows_DDoS.values():
+ #for k in flows_DDoS:
+ #print("flow DDoS %s :%s" % (addr1[i][j],len(k)))
+
+ g_test, g_train = cons_graph(k, num_node, tag[i][j], le_DDoS, random_seed)
+ g_tralist.extend(g_train)
+ g_telist.extend(g_test)
+
+ #g_Dlist.extend(cons_graph(k,num_node,tag[i][j],len_file))
+ #print("graph train DDoS %s :%s" % (addr1[i][j],len(g_tralist)))
+                # print("graph test DDoS %s :%s" % (addr1[i][j], len(g_telist)))
+ #print("graph DDoS %s :%s" % (addr1[i][j],len(g_Dlist)))
+ #print("flow_DDoS:%s" %len(g_list))
+ else:
+ for m in flows_norm:
+ #print("flow norm %s :%s" % (addr1[i][j],len(m)))
+
+ g_test, g_train = cons_graph(m, num_node, 0, le_norm, random_seed)
+ g_tralist.extend(g_train)
+ g_telist.extend(g_test)
+
+ #print("graph train norm %s :%s" % (addr1[i][j],len(g_tralist)))
+ #print("graph test norm %s :%s" % (addr1[i][j],len(g_telist)))
+ '''
+ li=cons_graph(m,num_node,0,len_file)
+ if(li != None):
+ g_Nlist.extend(li)
+ #print("graph normal %s :%s" % (addr1[i][j],len(g_Nlist)))
+ #print("flow_norm:%s" %len(g_list))
+ '''
+ '''
+ random.seed(1)
+ random.shuffle(g_Dlist)
+ random.shuffle(g_Nlist)
+ g_telist=g_Dlist[0:int(len(g_Dlist)*0.01)]
+ g_telist.extend(g_Nlist[0:int(len(g_Nlist)*0.01)])
+ g_tralist=g_Nlist[int(len(g_Nlist)*0.01):]
+ if(len(g_tralist)<=10000):
+ g_tralist.extend(g_Dlist[int(len(g_Dlist)*0.01):])
+ else:
+ g_tralist.extend(g_Dlist[int(len(g_Dlist)*0.01):int(len(g_Dlist)*0.01)+10000])
+ print(len(g_telist))
+ print(len(g_tralist))
+ #pickle.dump(g_list,graph)
+ '''
+ print(f"{tra_path}: {len(g_tralist)}")
+ print(f"{te_path}: {len(g_telist)}")
+ pickle.dump(g_tralist, train)
+ pickle.dump(g_telist, test)
+ train.close()
+ test.close()
+            # print(label_dict)
+
+def load_data2(degree_as_tag, feature, path):
+ tagset = []
+ g_train = []
+ g_test = []
+ # m=[100,100,100,100,100,100,100,100,100,100,600,100,100,600]
+ # m=[200,200,200,200,200,100,0,0,0,100,0,0,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,813,244]
+ for i in range(len(addr0)):
+ for j in range(len(addr1[i])):
+            graph_path = "E:/DDoS/DDoS2019/0.01#10000pkl/" + str(path) + '/train/' + addr1[i][j] + '.pkl'
+            test_path = "E:/DDoS/DDoS2019/0.01#10000pkl/" + str(path) + '/test/' + addr1[i][j] + '.pkl'
+            graph = open(graph_path, "rb")
+            graph_test = open(test_path, "rb")
+ g_train.extend(pickle.load(graph))
+ g_test.extend(pickle.load(graph_test))
+ #print(type(st))
+ #print(st[1].node_tags)
+ #print("每个文件的图数量:%d" %len(st))
+ #random.seed(1)
+ #st=random.sample(st,m[j])
+ #st=random.sample(st,int(len(st)*0.1))
+ #g_list.extend(st)
+ #print(len(st))
+ graph.close()
+ graph_test.close()
+    a = []
+    feature = feature.split(",")
+    for i in range(len(feature)):
+        if feature[i] != '':  # skip empty items from stray commas
+            a.append(int(feature[i]))
+            print(f"feature: {feature[i]}")
+            tagset.append({})
+    feature = a
+ tagset, g_train = graph_add_features(g_train, degree_as_tag, feature, tagset)
+ tagset, g_test = graph_add_features(g_test, degree_as_tag, feature, tagset)
+ #g_train=one_hot(g_train,'train')
+ #g_test=one_hot(g_test,'test')
+ #g_train=feature_bin(g_train)
+ #g_test=feature_bin(g_test)
+ g_train = block_bin(g_train, tagset)
+ g_test = block_bin(g_test, tagset)
+    print('# feature columns: %d' % len(tagset))
+    print('# train data: %d' % len(g_train))
+    print('# test data: %d' % len(g_test))
+ return num_class, g_train, g_test
+
+
+def graph_add_features(g_list, degree_as_tag, feature, tagset):
+ for g in g_list:
+ g.neighbors = [[] for i in range(len(g.g))]
+ for i, j in g.g.edges():
+ g.neighbors[i].append(j)
+ g.neighbors[j].append(i)
+        degree_list = []
+        for i in range(len(g.g)):
+            degree_list.append(len(g.neighbors[i]))
+        g.max_neighbor = max(degree_list)
+
+        # g.label = label_dict[g.label]
+
+        # duplicate each undirected edge in both directions
+        edges = [list(pair) for pair in g.g.edges()]
+        edges.extend([[i, j] for j, i in edges])
+        # edge_mat: 2 x (2|E|) edge-index tensor, later used to build a sparse adjacency
+        g.edge_mat = torch.LongTensor(edges).transpose(0, 1)
+ if degree_as_tag:
+ for g in g_list:
+ g.node_tags = list(dict(g.g.degree).values())
+
+ #Extracting unique tag labels
+ #tagset={}
+ #tagset = set([])
+    for g in g_list:
+        for j in range(len(g.node_tags)):
+            if 0 in feature:
+                # coarse-bin the packet-size feature into buckets of 100
+                g.node_tags[j][0] = int(g.node_tags[j][0] / 100) * 100
+            if 5 in feature:
+                g.node_tags[j][5] = int(g.node_tags[j][5] * 100)
+
+            m = []
+            for i in range(len(feature)):
+                # print(f"len(g.node_tags): {len(g.node_tags[j])}")
+                # assign each unseen raw value the next integer id for its column
+                if g.node_tags[j][feature[i]] not in tagset[i]:
+                    tagset[i][g.node_tags[j][feature[i]]] = len(tagset[i])
+                m.append(tagset[i][g.node_tags[j][feature[i]]])
+            g.node_tags[j] = m
+ #m=m+str(g.node_tags[j][int(feature[i])])+' '
+ #g.node_tags[j]=m
+ #print(m)
+ #if g.node_tags[j] not in tagset:
+ #tagset[g.node_tags[j]]=len(tagset)
+ #g.node_tags[j]=tagset[g.node_tags[j]]
+ #print(tagset)
+ #print(g.node_tags)
+ #tagset = tagset.union(set(g.node_tags[fe]))
+ #print(tagset)
+ #print("union_end")
+ #tagset = list(tagset)
+ #tag2index = {tagset[i]:i for i in range(len(tagset))}
+ #num_normal=0
+ #num_DDoS=0
+ return tagset, g_list
+
+
+def feature_bin(g_list, tagset):
+    # number of bits needed to encode any tag id
+    l = len(bin(len(tagset)).replace('0b', ''))
+    for g in g_list:
+        g.node_features = torch.zeros(len(g.node_tags), l)
+        for i in range(len(g.node_tags)):
+            a = bin(g.node_tags[i]).replace('0b', '')
+            for j in range(len(a)):
+                if a[j] == '1':
+                    g.node_features[i, l - len(a) + j] = 1
+        # print('g.node_feature:%s,g.node_tags:%s' % (g.node_features, g.node_tags))
+    return g_list
+
+def block_bin(g_list, tagset):
+    # l[i] = bit-width of block i; m = total width of the concatenated code
+    l = []
+    m = 0
+    for i in range(len(tagset)):
+        l.append(len(bin(len(tagset[i])).replace('0b', '')))
+        m += l[i]
+    # print(tagset)
+    # print(m)
+    for g in g_list:
+        g.node_features = torch.zeros(len(g.node_tags), m)
+        for j in range(len(g.node_tags)):
+            for i in range(len(tagset)):
+                a = bin(g.node_tags[j][i]).replace('0b', '')
+                offset = sum(l[:i])  # start of block i in the concatenated vector
+                for k in range(len(a)):
+                    if a[k] == '1':
+                        g.node_features[j, offset + l[i] - len(a) + k] = 1
+        # print('g.node_feature:%s,g.node_tags:%s' % (g.node_features, g.node_tags))
+    return g_list
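+
+# Worked example for block_bin (hypothetical sizes): with two feature columns
+# whose tag dicts hold 5 and 3 entries, the block widths are l = [3, 2] and
+# m = 5, so a node with tags [3, 2] is encoded as '011' + '10' -> [0,1,1,1,0].
+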
+def one_hot(g_list, ty, tagset):
+    # note: expects scalar node tags and a single tag->index dict, i.e. the
+    # older one-hot encoding path (currently unused; block_bin is used instead)
+    num_every = {}
+    for i in range(num_class):
+        num_every[i] = 0
+    for g in g_list:
+        num_every[g.label] += 1
+        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
+        g.node_features[range(len(g.node_tags)), [tagset[tag] for tag in g.node_tags]] = 1
+ #print("node_feature")
+ #print("node_feature end")
+ #num_DDoS=len(g_list)-num_normal
+ for i in range(num_class):
+ print("%s: class %d: %d" %(ty, i, num_every[i]))
+ #print("num_normal:%d" %num_normal)
+ #print("num_DDoS:%d" %num_DDoS )
+
+ #print("label_dict:%s" % lable_dict)
+ #print('# classes: %d' %class_flow)
+ return g_list
+
+def separate_data(graph_list, seed, fold_idx):
+    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
+    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
+ #print(f"skf: {skf}")
+ labels = [graph.label for graph in graph_list]
+ idx_list = []
+ for idx in skf.split(np.zeros(len(labels)), labels):
+ idx_list.append(idx)
+ #print(f"idx:{idx}")
+ #print(idx_list)
+ train_idx, test_idx = idx_list[fold_idx]
+ print(f"train_num: {len(train_idx)} \n test_num: {len(test_idx)}")
+
+ train_graph_list = [graph_list[i] for i in train_idx]
+ test_graph_list = [graph_list[i] for i in test_idx]
+
+ return train_graph_list, test_graph_list
+
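+# Minimal usage sketch (hypothetical arguments; the data and pkl paths above
+# are hard-coded, so adjust them before running):
+# if __name__ == '__main__':
+#     load_data(False, {})                     # parse CSVs, pickle graph splits
+#     n_cls, g_tr, g_te = load_data2(False, "0,1,2,3,4,5", 2)
+#     tr_fold, te_fold = separate_data(g_tr, seed=0, fold_idx=0)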