# coding:utf-8
import networkx as nx
import numpy as np
import csv
import random
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import pickle
from itertools import islice

label_dict = []
for i in range(20):
    label_dict.append({})

# addr0 = ["DDoS_test/"]
addr0 = ['E:/DDoS/DDoS2019/0112/morefeature']  # dataset root directory
# addr0 = ["muti0311\\csv\\", "0112\\multi\\csv\\"]
addr1 = []
num_class = 13
# addr0311 = ["1","2","3","4","5","6","7","LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
# addr0311 = ["LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
addr0112 = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
            "DNS", "LDAP", "MSSQL", "NetBIOS", "NTP", "SNMP", "SSDP", "SYN",
            "TFTP", "UDP", "UDP-Lag", "WebDDoS"]
# addr0112 = ["NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
# addr1.append(addr0311)
addr1.append(addr0112)


class S2VGraph(object):
    def __init__(self, g, label, node_tags=None, node_features=None):
        '''
        g: a networkx graph
        label: an integer graph label
        node_tags: a list of integer node tags
        node_features: a torch float tensor, one-hot representation of the tag that is used as input to neural nets
        edge_mat: a torch long tensor containing the edge list; will be used to create a torch sparse tensor
        neighbors: list of neighbors (without self-loops)
        '''
        self.label = label
        self.g = g
        self.node_tags = node_tags
        self.neighbors = []
        self.node_features = 0
        self.edge_mat = 0
        self.max_neighbor = 0


def csv_dataset(addr, ip_dst):
    # addr = "/home/liyuzhen/dataset/" + addr + ".csv"
    read = csv.reader(open('%s' % addr, 'r'))
    len_file = len(open(addr).readlines())
    print("addr: %s: %s" % (addr, len_file))
    ip_DDoS = ["172.16.0.5"]
    ip_DDoS.append(ip_dst)
    ip = {}
    num = 0
    flows_norm = []
    flows_DDoS = []
    for row in islice(read, 1, None):
        # if num > 100000:
        #     break
        num = num + 1
        tmp_one_packet = []
        if (row[1] == '') or (row[2] == ''):
            continue
        a = row[1] + ' ' + row[2]
        b = row[2] + ' ' + row[1]
        if (a not in ip) and (b not in ip):
            ip[a] = len(ip)
            flows_norm.append([])
        try:
            if row[8] == '':
                continue
            else:
                tmp_one_packet.append(int(row[8]))  # packet size (feature 0)
            tmp_one_packet.append(row[9])  # protocol (feature 1)
            if row[5] != '':  # well-known port (feature 2)
                if int(row[5]) <= 1024:
                    tmp_one_packet.append(int(row[5]))
                elif int(row[6]) <= 1024:
                    tmp_one_packet.append(int(row[6]))
                else:
                    tmp_one_packet.append(1025)
                    # print('no port smaller than 1024: %s, %d, %s' % (addr, num, row))
            else:
                tmp_one_packet.append(-1)
            if row[19] != '':
                tmp_one_packet.append(int(row[19]))  # tcp.window_size (feature 3)
            else:
                tmp_one_packet.append(-1)
            if row[21] != '':
                tmp_one_packet.append(row[21])  # tcp.flags (feature 4)
            else:
                tmp_one_packet.append(-1)
            if row[23] != '':
                tmp_one_packet.append(row[23])  # ip.ttl (feature 5)
            else:
                tmp_one_packet.append(-1)
        except Exception as e:
            print(f'error in {addr}, row {num}, error {e.args}')
            continue
        '''
        if row[3] != '':  # 2
            if int(row[3]) < 0:
                print('port smaller than 0: %s, %d, %s' % (addr, num, row))
            if int(row[3]) <= 1024:
                tmp_one_packet.append(int(row[3]))
            elif int(row[4]) <= 1024:
                tmp_one_packet.append(int(row[4]))
            else:
                tmp_one_packet.append(1025)
        else:
            tmp_one_packet.append(-1)
        if row[7] != '':
            tmp_one_packet.append(row[7])  # frame.encap_type4
        else:
            tmp_one_packet.append(-1)  # frame.encap_type
        tmp_one_packet.append(float(row[10]))  # time5
        if row[10] != '':
            tmp_one_packet.append(float(row[10]))  # http.time6
        else:
            tmp_one_packet.append(-1)
        if row[11] != '':
            tmp_one_packet.append(int(row[11]))  # icmp.len7
        else:
            tmp_one_packet.append(-1)
        if row[12] != '':
            tmp_one_packet.append(row[12])  # icmp.type8
        else:
            tmp_one_packet.append(-1)
        if row[13] != '':
            tmp_one_packet.append(row[13])  # irc.request9
        else:
            tmp_one_packet.append(-1)
        if row[14] != '':
            tmp_one_packet.append(row[14])  # irc.response10
        else:
            tmp_one_packet.append(-1)
        if row[15] != '':
            if row[15] == 0:
                tmp_one_packet.append(0)  # tcp.ack11
            else:
                tmp_one_packet.append(1)
        else:
            tmp_one_packet.append(-1)
        if row[16] != '':
            tmp_one_packet.append(row[16])  # tcp.ack_rtt12
        else:
            tmp_one_packet.append(-1)
        if row[18] != '':
            tmp_one_packet.append(int(row[18]))  # tcp.len14
        else:
            tmp_one_packet.append(-1)
        if row[20] != '':
            tmp_one_packet.append(int(row[20]))  # udp.length16
        else:
            tmp_one_packet.append(-1)
        if row[22] != '':
            tmp_one_packet.append(row[22])  # ip.flags18
        else:
            tmp_one_packet.append(-1)
        '''
        if a in ip:
            tmp_one_packet[0] *= -1  # a negative size marks this direction of the flow
            flows_norm[ip[a]].append(tmp_one_packet)
        elif (a not in ip) and (b not in ip):
            print("error: in csv_dataset")
        elif b in ip:
            flows_norm[ip[b]].append(tmp_one_packet)
    a = ip_DDoS[0] + ' ' + ip_DDoS[1]
    b = ip_DDoS[1] + ' ' + ip_DDoS[0]
    if a in ip:
        flows_DDoS.append(flows_norm[ip[a]])
        del flows_norm[ip[a]]
    elif b in ip:
        flows_DDoS.append(flows_norm[ip[b]])
        del flows_norm[ip[b]]
    flows_DDoS_dict = {}
    len_DDoS = 0
    len_norm = 0
    for flows in flows_norm:
        len_norm += len(flows)
    for flows in flows_DDoS:
        len1 = len(flows)
        len_DDoS += len1
        k = int(len1 / 300000)  # split very long DDoS flows into chunks of 300000 packets
        for i in range(k):
            flows_DDoS_dict[i] = flows[i * 300000: (i + 1) * 300000]
        if k * 300000 < len1:
            flows_DDoS_dict[k] = flows[(k * 300000):]
    # print(flows_DDoS_dict)
    print('len_DDoS: %d' % len_DDoS)
    print('len_norm: %d' % len_norm)
    return flows_DDoS_dict, flows_norm, len_DDoS, len_norm
    # return flows_DDoS, flows_norm
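# A minimal, self-contained sketch of the bidirectional flow-key idea that
# csv_dataset relies on: packets are grouped per IP pair regardless of
# direction by probing both "src dst" and "dst src" orderings. The addresses
# below are made up for illustration; this helper is not part of the pipeline.
def _demo_flow_key():
    ip = {}
    flows = []
    packets = [("10.0.0.1", "10.0.0.2"),
               ("10.0.0.2", "10.0.0.1"),
               ("10.0.0.1", "10.0.0.3")]
    for src, dst in packets:
        a, b = src + ' ' + dst, dst + ' ' + src
        if (a not in ip) and (b not in ip):
            ip[a] = len(ip)  # the first sighting fixes the canonical key
            flows.append([])
        key = a if a in ip else b
        flows[ip[key]].append((src, dst))
    return flows  # two flows: [[(1->2), (2->1)], [(1->3)]]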
def cons_graph(all_traffic, num_node, tags, len_file, rseed):
    g_list = []
    # print("len all_traffic: %s" % len(all_traffic))
    num_graph = int(len(all_traffic) / num_node)
    # print("num_graph: %s" % num_graph)
    last_graph = len(all_traffic) - num_node * num_graph
    for y in range(num_graph):
        g = nx.Graph()
        node_first = 0
        node_last = 0
        node_tags = []
        # node_features = []
        a = 0
        for z in range(num_node):
            traffic = num_node * y + z
            # if traffic != 0:
            #     all_traffic[traffic][5] = float(all_traffic[traffic - 1][5]) - float(all_traffic[traffic][5])
            g.add_node(z)
            node_tags.append(all_traffic[traffic])
            '''
            node_tags.append([])
            # print(f"len(all_traffic[traffic]): {len(all_traffic[traffic])}")
            for i in range(len(all_traffic[traffic])):
                if all_traffic[traffic][i] not in label_dict[i].keys():
                    label_dict[i][all_traffic[traffic][i]] = len(label_dict[i])
                # print(label_dict)
                node_tags[z].append(label_dict[i][all_traffic[traffic][i]])
            '''
            # node_features.append(all_traffic[traffic][0])
            if z > 0:  # build the edges of the graph
                if all_traffic[traffic][0] * all_traffic[traffic - 1][0] > 0:
                    # same direction as the previous packet: chain them
                    g.add_edge(z - 1, z)
                    # nx.draw(g, with_labels=True, pos=nx.circular_layout(g))
                    # plt.show()
                else:
                    # direction change: link back to the first node of the previous run
                    g.add_edge(z, node_first)
                    a = a + 1
                    # plt.show()
                    if a >= 2:
                        g.add_edge(node_last, z - 1)
                        # nx.draw(g, with_labels=True, edge_color='b', pos=nx.circular_layout(g))
                    node_first = z
                    node_last = z - 1
                    g.add_edge(z, node_last)
        g_list.append(S2VGraph(g, tags, node_tags))
    '''
    The code below balances the training set and takes the test set proportionally,
    but in practice the training and test accuracies differ too much.
    '''
    random.seed(rseed)
    random.shuffle(g_list)
    len_test = int(len(g_list) * 0.02)
    if (len_test < 30) and tags != 0:
        len_test = 30
    if (len_test < 1) and tags == 0:
        len_test = 1
    g_test = g_list[0: len_test]
    dataset_num = 20000
    if (tags != 0) and (len_file >= (30 * dataset_num + len_test * 30)):
        len_train = len_test + int((len(all_traffic) / float(len_file)) * dataset_num)
        g_train = g_list[len_test: len_train]
    else:
        g_train = g_list[len_test: dataset_num + len_test]
    print(f"cons graph sum: {len(g_list)}")
    print(f"cons graph train: {len(g_train)}")
    print(f"cons graph test: {len(g_test)}")
    '''
    if (tags != 0) and (len(g_list) >= (10000 + len_test)):
        len_train = len_test + 10000
        g_train = g_list[len_test:len_train]
    else:
        g_train = g_list[len_test:len(g_list)]
    '''
    del g_list
    return g_test, g_train
    # return g_list
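# A toy walk-through of the edge-construction rule above: the sign of feature 0
# encodes packet direction, so consecutive same-sign packets are chained in a
# path, while a sign flip links back to the first node of the previous run.
# The feature values here are invented solely to exercise the wiring.
def _demo_cons_graph():
    toy = [[100, 'TCP', 80, -1, -1, -1] for _ in range(3)] + \
          [[-100, 'TCP', 80, -1, -1, -1] for _ in range(3)]
    g_test, g_train = cons_graph(toy, num_node=6, tags=1, len_file=6, rseed=0)
    g = (g_test + g_train)[0].g
    # expected: [(0, 1), (0, 3), (1, 2), (2, 3), (3, 4), (4, 5)]
    return sorted(g.edges())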
def load_data(degree_as_tag, para):
    '''
    dataset: name of dataset
    test_proportion: ratio of the test/train split
    seed: random seed for the random splitting of the dataset
    '''
    print('loading data')
    # g_list = []
    all_csv_dict = {}
    # tag1 = [0,0,0,0,0,0,0,"LDAP","MSSQL","NetBIOS","PortMap","SYN","UDP","UDP-Lag"]
    # tag1 = [0,0,0,0,0,0,0,1,2,3,4,5,6,7]
    # tag1 = [1,2,3,4,5,6,7]
    # tag2 = [0,0,0,0,0,0,0,0,0,0,0,"DNS","LDAP","MSSQL","NetBIOS","NTP","SNMP","SSDP","SYN","TFTP","UDP","UDP-Lag","WebDDoS"]
    tag2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    tag = []
    # tag.append(tag1)
    tag.append(tag2)
    # ip_dst = ["192.168.50.4"]
    ip_dst = ["192.168.50.1"]
    # ip_dst = ["192.168.50.4", "192.168.50.1"]
    tag_flows = {}
    class_flow = 13
    # num_node = para['num_node']
    num_node = 30
    random_seeds = [2, 4, 1, 7, 9]
    i = 0  # index into the root address list
    for random_seed in random_seeds:
        for j in range(len(addr1[i])):
            # g_list = []
            g_tralist = []
            g_telist = []
            # graph_path = "/home/liyuzhen/dataset/" + addr0[i] + '/' + 'graph_lessprotocol/' + addr1[i][j] + '.pkl'
            tra_path = "E:/DDoS/DDoS2019/pkl/" + str(random_seed) + "/" + 'train/' + addr1[i][j] + '.pkl'
            te_path = "E:/DDoS/DDoS2019/pkl/" + str(random_seed) + "/" + 'test/' + addr1[i][j] + '.pkl'
            # graph = open(graph_path, "wb")
            train = open(tra_path, 'wb')
            test = open(te_path, 'wb')
            addr = addr0[i] + '/' + addr1[i][j] + '.csv'
            flows_DDoS, flows_norm, le_DDoS, le_norm = csv_dataset(addr, ip_dst[i])
            # build the graphs once per loaded file
            if tag[i][j] != 0:
                for k in flows_DDoS.values():
                    # print("flow DDoS %s: %s" % (addr1[i][j], len(k)))
                    g_test, g_train = cons_graph(k, num_node, tag[i][j], le_DDoS, random_seed)
                    g_tralist.extend(g_train)
                    g_telist.extend(g_test)
                    # g_Dlist.extend(cons_graph(k, num_node, tag[i][j], len_file))
                    # print("graph train DDoS %s: %s" % (addr1[i][j], len(g_tralist)))
                    # print("graph test DDoS %s: %s" % (addr1[i][j], len(g_telist)))
                    # print("graph DDoS %s: %s" % (addr1[i][j], len(g_Dlist)))
                    # print("flow_DDoS: %s" % len(g_list))
            else:
                for m in flows_norm:
                    # print("flow norm %s: %s" % (addr1[i][j], len(m)))
                    g_test, g_train = cons_graph(m, num_node, 0, le_norm, random_seed)
                    g_tralist.extend(g_train)
                    g_telist.extend(g_test)
                    # print("graph train norm %s: %s" % (addr1[i][j], len(g_tralist)))
                    # print("graph test norm %s: %s" % (addr1[i][j], len(g_telist)))
                '''
                li = cons_graph(m, num_node, 0, len_file)
                if li is not None:
                    g_Nlist.extend(li)
                # print("graph normal %s: %s" % (addr1[i][j], len(g_Nlist)))
                # print("flow_norm: %s" % len(g_list))
                '''
            '''
            random.seed(1)
            random.shuffle(g_Dlist)
            random.shuffle(g_Nlist)
            g_telist = g_Dlist[0:int(len(g_Dlist) * 0.01)]
            g_telist.extend(g_Nlist[0:int(len(g_Nlist) * 0.01)])
            g_tralist = g_Nlist[int(len(g_Nlist) * 0.01):]
            if len(g_tralist) <= 10000:
                g_tralist.extend(g_Dlist[int(len(g_Dlist) * 0.01):])
            else:
                g_tralist.extend(g_Dlist[int(len(g_Dlist) * 0.01):int(len(g_Dlist) * 0.01) + 10000])
            print(len(g_telist))
            print(len(g_tralist))
            # pickle.dump(g_list, graph)
            '''
            print(f"{tra_path}: {len(g_tralist)}")
            print(f"{te_path}: {len(g_telist)}")
            pickle.dump(g_tralist, train)
            pickle.dump(g_telist, test)
            train.close()
            test.close()
    # print(label_dict)
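# A minimal sketch of the pickle round-trip that load_data writes and
# load_data2 later reads: a list of S2VGraph objects is dumped per class and
# restored verbatim. The temporary path is illustrative, not the real layout.
def _demo_pickle_roundtrip():
    import os
    import tempfile
    g = S2VGraph(nx.path_graph(3), 1, node_tags=[[0], [1], [2]])
    path = os.path.join(tempfile.gettempdir(), "demo_graphs.pkl")
    with open(path, "wb") as f:
        pickle.dump([g], f)
    with open(path, "rb") as f:
        restored = pickle.load(f)
    return restored[0].label  # 1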
def load_data2(degree_as_tag, feature, path):
    tagset = []
    g_train = []
    g_test = []
    # m = [100,100,100,100,100,100,100,100,100,100,600,100,100,600]
    # m = [200,200,200,200,200,100,0,0,0,100,0,0,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,813,244]
    for i in range(len(addr0)):
        for j in range(len(addr1[i])):
            graph_path = "E:/DDoS/DDoS2019/0.01#10000pkl/" + str(path) + '/train/' + addr1[i][j] + '.pkl'
            test_path = "E:/DDoS/DDoS2019/0.01#10000pkl/" + str(path) + '/test/' + addr1[i][j] + '.pkl'
            graph = open(graph_path, "rb")
            graph_test = open(test_path, "rb")
            g_train.extend(pickle.load(graph))
            g_test.extend(pickle.load(graph_test))
            # print(type(st))
            # print(st[1].node_tags)
            # print("number of graphs per file: %d" % len(st))
            # random.seed(1)
            # st = random.sample(st, m[j])
            # st = random.sample(st, int(len(st) * 0.1))
            # g_list.extend(st)
            # print(len(st))
            graph.close()
            graph_test.close()
    a = []
    feature = feature.split(",")
    for i in range(len(feature)):
        if feature[i] != ',':
            a.append(int(feature[i]))
            print(f"feature: {feature[i]}")
            tagset.append({})
    feature = a
    tagset, g_train = graph_add_features(g_train, degree_as_tag, feature, tagset)
    tagset, g_test = graph_add_features(g_test, degree_as_tag, feature, tagset)
    # g_train = one_hot(g_train, 'train')
    # g_test = one_hot(g_test, 'test')
    # g_train = feature_bin(g_train)
    # g_test = feature_bin(g_test)
    g_train = block_bin(g_train, tagset)
    g_test = block_bin(g_test, tagset)
    print('# maximum node tag: %d' % len(tagset))
    print("# train data: %d" % len(g_train))
    print('# test data: %d' % len(g_test))
    return num_class, g_train, g_test


def graph_add_features(g_list, degree_as_tag, feature, tagset):
    for g in g_list:
        g.neighbors = [[] for i in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        degree_list = []
        for i in range(len(g.g)):
            degree_list.append(len(g.neighbors[i]))
        g.max_neighbor = max(degree_list)
        # g.label = label_dict[g.label]
        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])  # expand each undirected edge into both directions
        deg_list = list(dict(g.g.degree(range(len(g.g)))).values())
        g.edge_mat = torch.LongTensor(edges).transpose(0, 1)

    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # extract the unique tag labels of each selected feature
    # tagset = {}
    # tagset = set([])
    for g in g_list:
        for j in range(len(g.node_tags)):
            if 0 in feature:  # bucket the packet size into steps of 100
                g.node_tags[j][0] = int(g.node_tags[j][0] / 100) * 100
            if 5 in feature:  # scale feature 5 (assumes it is numeric)
                g.node_tags[j][5] = int(float(g.node_tags[j][5]) * 100)
            m = []
            for i in range(len(feature)):
                # print(f"len(g.node_tags[j]): {len(g.node_tags[j])}")
                if g.node_tags[j][feature[i]] not in tagset[i].keys():
                    tagset[i][g.node_tags[j][feature[i]]] = len(tagset[i])
                m.append(tagset[i][g.node_tags[j][feature[i]]])
            g.node_tags[j] = m
            # m = m + str(g.node_tags[j][int(feature[i])]) + ' '
            # g.node_tags[j] = m
            # print(m)
            # if g.node_tags[j] not in tagset:
            #     tagset[g.node_tags[j]] = len(tagset)
            # g.node_tags[j] = tagset[g.node_tags[j]]
            # print(tagset)
            # print(g.node_tags)
            # tagset = tagset.union(set(g.node_tags[fe]))
    # print(tagset)
    # print("union_end")
    # tagset = list(tagset)
    # tag2index = {tagset[i]: i for i in range(len(tagset))}
    # num_normal = 0
    # num_DDoS = 0
    return tagset, g_list


def feature_bin(g_list, tagset):
    l = len(bin(len(tagset)).replace('0b', ''))
    for g in g_list:
        g.node_features = torch.zeros(len(g.node_tags), l)
        for i in range(len(g.node_tags)):
            a = str(bin(g.node_tags[i])).replace('0b', '')
            for j in range(len(a)):
                if a[j] == '1':
                    g.node_features[i, l - len(a) + j] = 1
        # print('g.node_feature:%s, g.node_tags:%s' % (g.node_features, g.node_tags))
    return g_list
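# Worked example of the right-aligned binary tag encoding used by feature_bin
# and block_bin: a tag id t contributes the bits of bin(t) inside a slot whose
# width is len(bin(len(tagset))) minus the '0b' prefix. Toy values only.
def _demo_bits():
    tagset = {10: 0, 20: 1, 30: 2, 40: 3, 50: 4}  # 5 distinct tags -> 3 bits
    l = len(bin(len(tagset)).replace('0b', ''))
    row = torch.zeros(l)
    a = bin(4).replace('0b', '')  # tag id 4 -> '100'
    for j in range(len(a)):
        if a[j] == '1':
            row[l - len(a) + j] = 1
    return row  # tensor([1., 0., 0.])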
def block_bin(g_list, tagset):
    l = []
    m = 0
    for i in range(len(tagset)):
        l.append(len(bin(len(tagset[i])).replace('0b', '')))
        m += l[i]
    # print(tagset)
    # print(m)
    offset = [sum(l[:i]) for i in range(len(l))]  # start column of each feature block
    for g in g_list:
        g.node_features = torch.zeros(len(g.node_tags), m)
        for j in range(len(g.node_tags)):
            for i in range(len(tagset)):
                a = str(bin(g.node_tags[j][i])).replace('0b', '')
                for k in range(len(a)):
                    if a[k] == '1':
                        g.node_features[j, offset[i] + l[i] - len(a) + k] = 1
        # print('g.node_feature:%s, g.node_tags:%s' % (g.node_features, g.node_tags))
    return g_list


def one_hot(g_list, ty, tagset):
    num_every = {}
    for i in range(num_class):
        num_every[i] = 0
    for g in g_list:
        num_every[g.label] += 1
        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
        g.node_features[range(len(g.node_tags)), [tagset[tag] for tag in g.node_tags]] = 1
    for i in range(num_class):
        print("%s: class %d: %d" % (ty, i, num_every[i]))
    # print("num_normal: %d" % num_normal)
    # print("num_DDoS: %d" % num_DDoS)
    # print("label_dict: %s" % label_dict)
    # print('# classes: %d' % class_flow)
    return g_list


def separate_data(graph_list, seed, fold_idx):
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    labels = [graph.label for graph in graph_list]
    idx_list = []
    for idx in skf.split(np.zeros(len(labels)), labels):
        idx_list.append(idx)
    train_idx, test_idx = idx_list[fold_idx]
    print(f"train_num: {len(train_idx)}\ntest_num: {len(test_idx)}")
    train_graph_list = [graph_list[i] for i in train_idx]
    test_graph_list = [graph_list[i] for i in test_idx]
    return train_graph_list, test_graph_list
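# A small self-contained check of the stratified 10-fold split: forty toy
# graphs with two alternating labels should yield a 36/4 train/test split for
# any fold. These synthetic graphs stand in for the real DDoS dataset.
def _demo_separate_data():
    graphs = [S2VGraph(nx.path_graph(2), i % 2) for i in range(40)]
    train, test = separate_data(graphs, seed=0, fold_idx=0)
    return len(train), len(test)  # (36, 4)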