summary | refs | log | tree | commit | diff
path: root/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'util.py')
-rw-r--r-- util.py 261
1 files changed, 261 insertions, 0 deletions
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..d50027e
--- /dev/null
+++ b/util.py
@@ -0,0 +1,261 @@
+#coding:utf-8
+import networkx as nx
+import numpy as np
+import csv
+import random
+import torch
+import matplotlib.pyplot as plt
+from sklearn.model_selection import StratifiedKFold
+
+
class S2VGraph(object):
    def __init__(self, g, label, node_tags=None, node_features=None):
        '''
        Container for one graph sample and the tensors derived from it.

        g: a networkx graph
        label: an integer graph label
        node_tags: a list of integer node tags
        node_features: a torch float tensor, one-hot representation of the
            tag that is used as input to neural nets; kept as given, or 0
            when omitted

        Attributes that are filled in later by the loader:
        edge_mat: a torch long tensor, contain edge list, will be used to
            create torch sparse tensor (0 until computed)
        neighbors: list of neighbors (without self-loop)
        max_neighbor: largest neighbor count of any node (0 until computed)
        '''
        self.label = label
        self.g = g
        self.node_tags = node_tags
        self.neighbors = []
        # Previously the argument was silently dropped and 0 was always
        # stored; 0 remains the placeholder when no features are supplied.
        # An explicit None test is used because tensors have no plain
        # truth value.
        self.node_features = 0 if node_features is None else node_features
        self.edge_mat = 0

        self.max_neighbor = 0
+
def csv_dataset(addr, ip, data_dir='/home/liyuzhen/dataset/CIC-DoS'):
    '''
    Read one capture CSV and group its packets into DoS and normal flows.

    addr: base name of the CSV file ("<data_dir>/<addr>.csv" is opened)
    ip: address to match; a row whose column 1 or column 3 equals it is
        stored as DoS traffic, every other row as normal traffic
    data_dir: directory holding the CSV files; defaults to the location
        that used to be hard-coded

    Each packet is stored as [row[4], float(row[2]), signed int(row[5])].
    The sign of the last field encodes direction:
      * DoS flows: positive when column 1 is `ip`, negative when column 3 is.
      * normal flows: negative when the row's "col1 col3" order matches the
        stored key, positive when it is the reverse order.
    NOTE(review): the original comments labelled row[4] "packet size",
    row[2] "time" and row[5] "protocol", but the header row is recognised
    by row[5] == 'Length', so the size/protocol labels look swapped --
    confirm against the CSV schema.

    returns: (dos_flows, normal_flows)
        dos_flows: {peer address: [packet, ...]}
        normal_flows: {"col1 col3": [packet, ...]}, one key per unordered
            address pair (the order seen first becomes the key)
    '''
    dos_flows = {}
    normal_flows = {}

    # newline='' is what the csv module expects from the file object; the
    # with-block closes the handle that was previously leaked.
    with open('%s/%s.csv' % (data_dir, addr), 'r', newline='') as handle:
        for row in csv.reader(handle):
            # Skip blank lines and the header row.
            if not row or row[5] == 'Length':
                continue

            first_end, second_end = row[1], row[3]
            field4 = row[4]
            stamp = float(row[2])
            length = int(row[5])

            if first_end == ip:
                dos_flows.setdefault(second_end, []).append(
                    [field4, stamp, length])
            elif second_end == ip:
                dos_flows.setdefault(first_end, []).append(
                    [field4, stamp, -length])
            else:
                forward = first_end + ' ' + second_end
                backward = second_end + ' ' + first_end
                if forward in normal_flows:
                    normal_flows[forward].append([field4, stamp, -length])
                elif backward in normal_flows:
                    normal_flows[backward].append([field4, stamp, length])
                else:
                    # First packet of a new pair: this order becomes the key.
                    normal_flows[forward] = [[field4, stamp, -length]]

    return dos_flows, normal_flows
+
+def load_data(degree_as_tag,para):
+ '''
+ Build the list of S2VGraph samples from the capture CSV files.
+
+ degree_as_tag: if truthy, every node tag is replaced by the node degree
+ para: mapping; para['num_node'] is the number of packets (nodes) per graph
+
+ Each flow returned by csv_dataset is cut into consecutive windows of
+ num_node packets and one graph is built per complete window.
+ A node tag is an index into lable_dict, keyed by the integer-truncated
+ time gap to the previous packet (the very first packet of a flow uses
+ its own timestamp). Node features are the one-hot encoding of the tags.
+
+ returns: (g_list, class_flow)
+ '''
+
+ print('loading data')
+
+ g_list=[]  # every S2VGraph built below
+ lable_dict={}  # integer time value -> tag index, in order of first appearance
+ all_csv_dict={}  # graph label -> list of flow dicts returned by csv_dataset
+ addr=['1','2','3','4','5(1)','5(2)','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26']
+ tag=[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]  # label given to the DoS flows of addr[i]
+ #tag=[0,2,4,1,1,3,2,1,2,1,3,1,2,2,2,2,4,3,1,1,3,3,3,3,1,1,2]
+ class_flow=2  # returned unchanged and printed as the number of classes
+ ip=['75.127.97.72','75.127.97.72','75.127.97.72','75.127.97.72','75.127.97.72',
+ '74.63.40.21','75.127.97.72','97.74.144.108','208.113.162.153','69.84.133.138',
+ '67.220.214.50','97.74.144.108','69.192.24.88','97.74.144.108','203.73.24.75',
+ '97.74.144.108','74.55.1.4','97.74.104.201','74.55.1.4',
+ '69.192.24.88','97.74.144.108','97.74.144.108',
+ '75.127.97.72','75.127.97.72','69.192.24.88','75.127.97.72','74.55.1.4']
+ num_node=para['num_node']  # packets (nodes) per graph
+ all_csv_dict[0]=[]  # key 0 always exists; normal flows are appended to it below
+ for i in range(len(addr)):  # addr, tag and ip are indexed in step
+ if tag[i] not in all_csv_dict.keys():
+ all_csv_dict[tag[i]]=[]
+ DoS,normal=csv_dataset(addr[i],ip[i])
+ if(DoS!={}):
+ '''for key in list(DoS.keys()):
+ #print(DoS[key][0][0])
+ if(DoS[key][0][0]>0):
+ DoS.pop(key)'''
+
+ all_csv_dict[tag[i]].append(DoS)
+ #print(len(all_csv_dict[1]))
+ if(normal!={}):
+ '''for key in list(normal.keys()):
+ if(normal[key][0][0]>0):
+ normal.pop(key)'''
+ all_csv_dict[0].append(normal)
+
+ for tags in all_csv_dict.keys():  # tags is also used as the graph label below
+ all_flow=all_csv_dict[tags]
+ for num in range(len(all_flow)):
+ slow_body1_DoS=all_flow[num]  # one flow dict (despite the name, may hold normal flows)
+ for x in slow_body1_DoS.keys():
+ all_traffic=slow_body1_DoS[x]  # packet list of one flow
+ num_graph=int(len(all_traffic)/num_node)  # number of complete windows
+ #print(num_graph)
+ last_graph=len(all_traffic)-num_node*num_graph  # leftover packets; not used afterwards
+ for y in range(num_graph):
+ g = nx.Graph()
+ node_first=0
+ node_last=0
+ node_tags = []
+ node_features = []
+ a=0  # incremented next to the add_edge(z,node_first) call below
+ for z in range(num_node):
+ traffic=num_node*y+z  # index of this packet inside the flow
+
+ if(traffic!=0):
+ time=all_traffic[traffic][1]-all_traffic[traffic-1][1]  # gap to the previous packet (crosses the window boundary when z is 0)
+ time=int(time*1)  # truncate to an integer
+ else:
+ time=all_traffic[traffic][1]  # first packet of the flow: its own timestamp
+ time=int(time*1)
+ g.add_node(z)
+ #print(time)
+ if(time not in lable_dict ):
+ lable_dict[time]=len(lable_dict)  # next free tag index
+ #print(lable_dict)
+ #lable_dict[all_traffic[traffic][1]]=len(lable_dict)
+ #print(all_traffic[traffic][1])
+ #print(time)
+ node_tags.append(lable_dict[time])
+ #node_tags.append(lable_dict[all_traffic[traffic][1]])
+ #node_features.append(all_traffic[traffic][0])
+ if(z>0):# build the graph edges
+ if(all_traffic[traffic][2]*all_traffic[traffic-1][2]>0):  # signed third fields have the same sign
+ #print(1)
+ g.add_edge(z-1, z)
+ #nx.draw(g,with_labels=True,pos=nx.circular_layout(g))
+ #plt.show()
+ else:
+ #print(2)
+ g.add_edge(z,node_first)
+ #nx.draw(g,with_labels=True,edge_color='r',pos=nx.circular_layout(g))
+ a=a+1
+ #plt.show()
+
+ if(a>=2):  # NOTE(review): indentation is flattened in this view; confirm the nesting of this check and of the four statements that follow
+ g.add_edge(node_last,z-1)
+ #nx.draw(g,with_labels=True,edge_color='b',pos=nx.circular_layout(g))
+ node_first=z
+ node_last=z-1
+ g.add_edge(z,node_last)
+ if node_features != []:  # nothing is appended to node_features above (the append is commented out)
+ node_features = np.stack(node_features)
+ node_feature_flag = True
+ else:
+ node_features = None
+ node_feature_flag = False  # not read again in this function
+
+ #print(tags,node_tags)
+ g_list.append(S2VGraph(g, tags, node_tags))  # node_features is not passed on
+ #add labels and edge_mat
+ print(lable_dict)
+ print(len(lable_dict))
+ print('end:construct graph')
+ for g in g_list:# fill neighbors, max_neighbor and edge_mat of every graph
+ g.neighbors = [[] for i in range(len(g.g))]
+ for i, j in g.g.edges():
+ g.neighbors[i].append(j)
+ g.neighbors[j].append(i)
+ degree_list = []
+ for i in range(len(g.g)):
+ g.neighbors[i] = g.neighbors[i]  # self-assignment, no effect
+ degree_list.append(len(g.neighbors[i]))
+ g.max_neighbor = max(degree_list)# largest neighbor count over all nodes
+
+ #g.label = lable_dict[g.label]
+
+ edges = [list(pair) for pair in g.g.edges()]# edge list; the graph is undirected, so reversed pairs are added next
+
+ edges.extend([[i, j] for j, i in edges])
+
+ deg_list = list(dict(g.g.degree(range(len(g.g)))).values())  # not used afterwards
+ g.edge_mat = torch.LongTensor(edges).transpose(0,1)  # rows become (source, target); assumes at least one edge -- TODO confirm
+
+ print('end_neighbors')
+ if degree_as_tag:
+ for g in g_list:
+ g.node_tags = list(dict(g.g.degree).values())  # replace the time-based tags by node degrees
+
+ #Extracting unique tag labels
+
+ tagset = set([])
+ for g in g_list:
+ #print(g.node_tags)
+ tagset = tagset.union(set(g.node_tags))
+ #print(tagset)
+ print(len(lable_dict))
+ print(lable_dict)
+ tagset = list(tagset)
+ tag2index = {tagset[i]:i for i in range(len(tagset))}  # tag value -> one-hot column
+ num_normal=0
+ num_DDoS=0
+ for g in g_list:
+ if(g.label==0):
+ num_normal+=1
+ if(g.label==1):
+ num_DDoS+=1
+ g.node_features = torch.zeros(len(g.node_tags), len(tagset))  # one row per node, one column per distinct tag
+ g.node_features[range(len(g.node_tags)), [tag2index[tag] for tag in g.node_tags]] = 1  # one-hot encode each node's tag
+
+ num_DDoS=len(g_list)-num_normal  # overrides the count above: every graph whose label is not 0
+
+ print('# classes: %d' %class_flow)
+ print('# maximum node tag: %d' % len(tagset))
+
+ print("# data: %d" % len(g_list))
+ print("num_normal:%d" %num_normal)
+ print("num_DDoS:%d" %num_DDoS )
+
+ return g_list,class_flow
+
def separate_data(graph_list, seed, fold_idx, n_splits=10):
    '''
    Return the train/test graphs of one fold of a stratified k-fold split.

    graph_list: sequence of graphs, each exposing a ``label`` attribute
    seed: random_state handed to StratifiedKFold (shuffling is enabled)
    fold_idx: index of the fold to return, 0 <= fold_idx < n_splits
    n_splits: number of folds; defaults to 10, the value that used to be
        hard-coded

    returns: (train_graph_list, test_graph_list)
    raises: AssertionError when fold_idx is out of range
    '''
    # Kept as an assert so the exception type seen by callers is unchanged.
    assert 0 <= fold_idx < n_splits, \
        "fold_idx must be from 0 to %d." % (n_splits - 1)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    labels = [graph.label for graph in graph_list]
    # Stratification is driven by the labels alone, so a zero placeholder
    # of the right length stands in for the feature matrix.
    folds = list(skf.split(np.zeros(len(labels)), labels))
    train_idx, test_idx = folds[fold_idx]

    train_graph_list = [graph_list[i] for i in train_idx]
    test_graph_list = [graph_list[i] for i in test_idx]

    return train_graph_list, test_graph_list
+
+