diff options
Diffstat (limited to 'util.py')
| -rw-r--r-- | util.py | 261 |
1 files changed, 261 insertions, 0 deletions
#coding:utf-8
"""Build graph-classification samples from packet-capture CSV files.

Each flow (a list of packets between two hosts) is cut into fixed-size
windows; every window becomes one small undirected graph whose nodes are
packets.  The resulting ``S2VGraph`` objects carry the tensors a graph
neural network consumes (edge list and one-hot node features).
"""
import csv
import os
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold

# Directory holding the per-capture CSV files read by ``csv_dataset``.
DEFAULT_DATA_DIR = '/home/liyuzhen/dataset/CIC-DoS'


class S2VGraph(object):
    """Container for one graph sample and the tensors derived from it."""

    def __init__(self, g, label, node_tags=None, node_features=None):
        '''
            g: a networkx graph
            label: an integer graph label
            node_tags: a list of integer node tags
            node_features: accepted for signature compatibility but NOT
                stored; the attribute starts at 0 and is filled in later by
                ``load_data`` with a torch float tensor (one-hot encoding of
                the node tags)

            Attributes initialised to placeholders and populated by
            ``load_data``:
            edge_mat: a torch long tensor holding the edge list
            neighbors: per-node list of neighbor indices
            max_neighbor: the largest neighbor count of any node
        '''
        self.label = label
        self.g = g
        self.node_tags = node_tags
        self.neighbors = []
        self.node_features = 0
        self.edge_mat = 0

        self.max_neighbor = 0


def _packet_record(row, sign):
    """Turn one CSV row into the ``[field4, time, signed_length]`` record.

    ``sign`` (+1 / -1) encodes the packet direction in the third element.
    NOTE(review): row[5] is the column whose header is 'Length'; the original
    comments labelled row[4] as "packet size" and row[5] as "protocol", which
    looks swapped -- presumably row[4] is the protocol name.  TODO confirm
    against the CSV export.  row[4] is stored but not used by ``load_data``.
    """
    return [row[4], float(row[2]), int(row[5]) * sign]


def csv_dataset(addr, ip, data_dir=DEFAULT_DATA_DIR):
    """Read one capture CSV and group its packets into flows.

    addr: capture name; the file read is ``<data_dir>/<addr>.csv``
    ip: address of interest; packets whose column 1 or column 3 equals it
        are collected separately from all other packets
    data_dir: directory containing the CSV files (defaults to the original
        hard-coded location)

    Returns ``(ip_flows, other_flows)``:
    ip_flows: dict keyed by the peer address (the other endpoint of a packet
        that involves ``ip``); packets with column 1 == ip get a positive
        length, packets with column 3 == ip a negative one
    other_flows: dict keyed by ``"<col1> <col3>"`` of the first packet seen
        for that pair; packets in the key's direction get a negative length,
        packets in the reverse direction a positive one
    Every value is a list of ``[field4, time, signed_length]`` records in
    file order.
    """
    ip_flows = {}
    other_flows = {}
    path = os.path.join(data_dir, '%s.csv' % addr)
    # ``with`` guarantees the file is closed; newline='' is what the csv
    # module expects from the underlying file object.
    with open(path, 'r', newline='') as handle:
        for row in csv.reader(handle):
            # Skip blank lines and the header row.
            if not row or row[5] == 'Length':
                continue
            src, dst = row[1], row[3]
            if src == ip:
                ip_flows.setdefault(dst, []).append(_packet_record(row, 1))
            elif dst == ip:
                ip_flows.setdefault(src, []).append(_packet_record(row, -1))
            else:
                forward = src + ' ' + dst
                backward = dst + ' ' + src
                if forward in other_flows:
                    other_flows[forward].append(_packet_record(row, -1))
                elif backward in other_flows:
                    other_flows[backward].append(_packet_record(row, 1))
                else:
                    # First packet of a new pair defines the key direction.
                    other_flows[forward] = [_packet_record(row, -1)]

    return ip_flows, other_flows


def _build_flow_graph(packets, start, num_node, tag_ids):
    """Build one graph from ``packets[start:start + num_node]``.

    Node ``z`` is packet ``start + z``.  Its tag is the inter-arrival time to
    the previous packet of the flow, truncated to an integer and mapped to a
    dense id through ``tag_ids`` (mutated in place; new values get the next
    free id).  The very first packet of a flow has no predecessor, so its
    absolute timestamp is used instead.

    Edges: consecutive packets whose signed lengths have a positive product
    (same direction, both non-zero) are chained.  Otherwise a new run starts:
    its first node is linked to the first node of the previous run, and the
    last nodes of the two runs before it are linked to each other.

    NOTE(review): the source this was recovered from had lost its
    indentation.  The nesting below (run bookkeeping inside the
    direction-change branch, closing edge added once after the loop) is the
    reading consistent with that description -- confirm against the original
    file.

    Returns ``(graph, node_tags)``.
    """
    g = nx.Graph()
    node_tags = []
    run_first = 0        # first node of the current run
    prev_run_last = 0    # last node of the run before the current one
    turns = 0            # number of direction changes seen so far

    for z in range(num_node):
        idx = start + z
        if idx != 0:
            gap = int(packets[idx][1] - packets[idx - 1][1])
        else:
            gap = int(packets[idx][1])
        g.add_node(z)
        if gap not in tag_ids:
            tag_ids[gap] = len(tag_ids)
        node_tags.append(tag_ids[gap])

        if z > 0:
            if packets[idx][2] * packets[idx - 1][2] > 0:
                g.add_edge(z - 1, z)
            else:
                g.add_edge(z, run_first)
                turns += 1
                if turns >= 2:
                    g.add_edge(prev_run_last, z - 1)
                run_first = z
                prev_run_last = z - 1

    # Close the final run: link the last node to the end of the previous run
    # (or to node 0 when the direction never changed).
    g.add_edge(num_node - 1, prev_run_last)
    return g, node_tags


def load_data(degree_as_tag, para):
    '''
        Load every capture, build the graphs and attach their tensors.

        degree_as_tag: if true, replace each graph's node tags with the node
            degrees before the one-hot features are built
        para: mapping with key 'num_node', the number of packets (nodes) per
            graph; trailing packets of a flow that do not fill a whole graph
            are dropped

        Returns (g_list, class_flow): the list of S2VGraph objects and the
        number of classes (fixed at 2).

        Raises ValueError if para['num_node'] is smaller than 1.
    '''

    print('loading data')

    addr=['1','2','3','4','5(1)','5(2)','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26']
    # Label of the flows that involve the capture's address of interest:
    # the first capture is labelled 0, every other capture 1.
    # An alternative multi-class labelling left by the author:
    # [0,2,4,1,1,3,2,1,2,1,3,1,2,2,2,2,4,3,1,1,3,3,3,3,1,1,2]
    tag=[0] + [1] * (len(addr) - 1)
    class_flow=2
    ip=['75.127.97.72','75.127.97.72','75.127.97.72','75.127.97.72','75.127.97.72',
        '74.63.40.21','75.127.97.72','97.74.144.108','208.113.162.153','69.84.133.138',
        '67.220.214.50','97.74.144.108','69.192.24.88','97.74.144.108','203.73.24.75',
        '97.74.144.108','74.55.1.4','97.74.104.201','74.55.1.4',
        '69.192.24.88','97.74.144.108','97.74.144.108',
        '75.127.97.72','75.127.97.72','69.192.24.88','75.127.97.72','74.55.1.4']

    num_node=para['num_node']
    if num_node < 1:
        raise ValueError("para['num_node'] must be at least 1, got %r" % (num_node,))

    # label -> list of flow dicts.  Label 0 is created first so that its
    # graphs come first in g_list (order matters for reproducible splits).
    all_csv_dict={0: []}
    for capture, capture_tag, capture_ip in zip(addr, tag, ip):
        all_csv_dict.setdefault(capture_tag, [])
        DoS, normal = csv_dataset(capture, capture_ip)
        if DoS:
            all_csv_dict[capture_tag].append(DoS)
        if normal:
            # Flows that do not involve the address of interest are label 0.
            all_csv_dict[0].append(normal)

    g_list=[]
    lable_dict={}  # truncated inter-arrival time -> dense node-tag id
    for tags, all_flow in all_csv_dict.items():
        for flows in all_flow:
            for all_traffic in flows.values():
                for y in range(len(all_traffic) // num_node):
                    g, node_tags = _build_flow_graph(
                        all_traffic, num_node * y, num_node, lable_dict)
                    g_list.append(S2VGraph(g, tags, node_tags))

    print(lable_dict)
    print(len(lable_dict))
    print('end:construct graph')

    # Add neighbor lists and the edge matrix.
    for g in g_list:
        g.neighbors = [[] for _ in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        # Largest neighbor count of any node.
        g.max_neighbor = max(len(nbrs) for nbrs in g.neighbors)

        # Undirected graph: list every edge in both directions.
        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])
        g.edge_mat = torch.LongTensor(edges).transpose(0,1)

    print('end_neighbors')
    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # Extract the unique tag values.  The set is built exactly as before so
    # the column order of the one-hot features does not change.
    tagset = set([])
    for g in g_list:
        tagset = tagset.union(set(g.node_tags))
    print(len(lable_dict))
    print(lable_dict)
    tagset = list(tagset)
    tag2index = {tag_value: i for i, tag_value in enumerate(tagset)}

    num_normal=0
    for g in g_list:
        if g.label == 0:
            num_normal += 1
        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
        g.node_features[range(len(g.node_tags)), [tag2index[t] for t in g.node_tags]] = 1

    num_DDoS=len(g_list)-num_normal

    print('# classes: %d' %class_flow)
    print('# maximum node tag: %d' % len(tagset))

    print("# data: %d" % len(g_list))
    print("num_normal:%d" %num_normal)
    print("num_DDoS:%d" %num_DDoS )

    return g_list,class_flow


def separate_data(graph_list, seed, fold_idx):
    """Return the train/test graphs of one fold of a stratified 10-fold split.

    graph_list: list of graphs exposing a ``label`` attribute
    seed: random state passed to StratifiedKFold (shuffling is enabled)
    fold_idx: which of the 10 folds to use as the test set, 0 to 9

    Returns ``(train_graph_list, test_graph_list)``.
    """
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
    skf = StratifiedKFold(n_splits=10, shuffle = True, random_state = seed)

    labels = [graph.label for graph in graph_list]
    # Only the labels drive the stratification; the feature matrix is a dummy.
    idx_list = list(skf.split(np.zeros(len(labels)), labels))
    train_idx, test_idx = idx_list[fold_idx]

    train_graph_list = [graph_list[i] for i in train_idx]
    test_graph_list = [graph_list[i] for i in test_idx]

    return train_graph_list, test_graph_list
