1 files changed, 261 insertions, 0 deletions
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..d50027e
--- /dev/null
+++ b/util.py
@@ -0,0 +1,261 @@
+#coding:utf-8
+import networkx as nx
+import numpy as np
+import csv
+import random
+import torch
+import matplotlib.pyplot as plt
+from sklearn.model_selection import StratifiedKFold
+
+
+class S2VGraph(object):
+    """One graph sample plus the per-graph data derived from it."""
+
+    def __init__(self, g, label, node_tags=None, node_features=None):
+        """Store the inputs and set the derived fields to placeholders.
+
+        g: the graph object (load_data passes a networkx graph)
+        label: graph-level class label
+        node_tags: per-node tags, or None
+        node_features: per-node features; stored as given, or 0 when None
+        """
+        self.label = label
+        self.g = g
+        self.node_tags = node_tags
+        self.neighbors = []  # per-node neighbor lists, set by load_data
+        self.node_features = 0 if node_features is None else node_features
+        self.edge_mat = 0  # edge index tensor, set by load_data
+        self.max_neighbor = 0  # largest neighbor-list length, set by load_data
+
+def csv_dataset(addr, ip, data_dir='/home/liyuzhen/dataset/CIC-DoS'):
+    """Group the packets of one capture CSV into flows.
+
+    Reads '<data_dir>/<addr>.csv'. Per row, row[1] and row[3] are the two
+    endpoints, row[2] is parsed as a float time, row[4] is kept as a string
+    and row[5] is parsed as an int (its header cell is 'Length').
+    NOTE(review): row[4] was previously described as the packet size and
+    row[5] as the protocol, which does not fit the 'Length' header; please
+    confirm the column layout against the capture export.
+
+    addr: file name without the '.csv' extension
+    ip: endpoint whose traffic goes into the first returned dict
+    data_dir: directory holding the CSV files (default: the original path)
+
+    Returns (dos_flows, normal_flows), both mapping a key to a list of
+    [row[4], time, signed_len] packets in file order:
+        dos_flows: keyed by the peer of `ip`; signed_len is positive when
+            row[1] == ip and negative when row[3] == ip.
+        normal_flows: all rows not involving `ip`, keyed by 'x y' in the
+            endpoint order of the pair's first row; signed_len is negative
+            for rows in that order and positive for the reverse order.
+    """
+    dos_flows = {}
+    normal_flows = {}
+    # The with-block closes the file, which was previously left open for
+    # as long as the reader object stayed alive.
+    with open('%s/%s.csv' % (data_dir, addr), 'r') as handle:
+        for row in csv.reader(handle):
+            # Skip blank or truncated rows and the header row.
+            if len(row) < 6 or row[5] == 'Length':
+                continue
+            src, when, dst, kind = row[1], float(row[2]), row[3], row[4]
+            length = int(row[5])
+            if src == ip:
+                dos_flows.setdefault(dst, []).append([kind, when, length])
+            elif dst == ip:
+                dos_flows.setdefault(src, []).append([kind, when, -length])
+            else:
+                fwd = src + ' ' + dst
+                rev = dst + ' ' + src
+                # A pair is filed under the direction of its first row.
+                if fwd not in normal_flows and rev not in normal_flows:
+                    normal_flows[fwd] = []
+                if fwd in normal_flows:
+                    normal_flows[fwd].append([kind, when, -length])
+                else:
+                    normal_flows[rev].append([kind, when, length])
+    return dos_flows, normal_flows
+
+def _build_flow_graph(all_traffic, first, num_node, lable_dict):
+    """Build the graph for packets first .. first + num_node - 1 of a flow.
+
+    Node z stands for packet first + z. Its tag is the lable_dict index of
+    the integer-truncated difference between its time value (field 1) and
+    that of the previous packet of the flow (the flow's very first packet
+    uses its own time value); new values are added to lable_dict in place.
+
+    Edges: a packet whose field 2 has the same sign as that of the previous
+    packet (product > 0) is chained to the previous node. Any other packet
+    starts a new run: it is linked to the head of the run that just ended
+    and, from the second such switch on, the tail of that run is linked to
+    the tail of the run before it. After the loop the last node is linked
+    to the tail of the run preceding the final one (node 0 if none).
+
+    Returns (graph, node_tags).
+    """
+    g = nx.Graph()
+    run_head = 0  # first node of the run currently being extended
+    prev_tail = 0  # last node of the run before the current one
+    switches = 0  # number of run changes seen so far
+    node_tags = []
+    for z in range(num_node):
+        idx = first + z
+        gap = all_traffic[idx][1]
+        if idx != 0:
+            gap -= all_traffic[idx - 1][1]
+        gap = int(gap)
+        g.add_node(z)
+        if gap not in lable_dict:
+            lable_dict[gap] = len(lable_dict)
+        node_tags.append(lable_dict[gap])
+        if z == 0:
+            continue
+        if all_traffic[idx][2] * all_traffic[idx - 1][2] > 0:
+            g.add_edge(z - 1, z)
+            continue
+        g.add_edge(z, run_head)
+        switches += 1
+        if switches >= 2:
+            g.add_edge(prev_tail, z - 1)
+        run_head = z
+        prev_tail = z - 1
+    # Use the final node index explicitly rather than relying on the loop
+    # variable leaking out of the for statement.
+    g.add_edge(num_node - 1, prev_tail)
+    return g, node_tags
+
+
+def _attach_adjacency(graph):
+    """Fill the derived fields of one S2VGraph in place.
+
+    neighbors: one list of adjacent node ids per node
+    max_neighbor: length of the longest of those lists
+    edge_mat: LongTensor holding every edge in both directions
+    """
+    pairs = [list(pair) for pair in graph.g.edges()]
+    graph.neighbors = [[] for _ in range(len(graph.g))]
+    for i, j in pairs:
+        graph.neighbors[i].append(j)
+        graph.neighbors[j].append(i)
+    graph.max_neighbor = max(len(nbrs) for nbrs in graph.neighbors)
+    # Store each undirected edge in both directions: shape (2, 2 * #edges).
+    pairs.extend([[j, i] for i, j in pairs])
+    graph.edge_mat = torch.LongTensor(pairs).transpose(0, 1)
+
+
+def load_data(degree_as_tag,para):
+    """Load the CIC-DoS captures and turn them into fixed-size flow graphs.
+
+    degree_as_tag: if true, node tags are replaced by node degrees
+    para: mapping; only para['num_node'] (nodes per graph) is read here
+
+    Every flow returned by csv_dataset is cut into consecutive windows of
+    num_node packets (a trailing partial window is dropped) and each window
+    becomes one S2VGraph. Flows from csv_dataset's first dict are labelled
+    with the capture's entry in `tag`; flows from its second dict get 0.
+    Progress messages and the tag dictionary are printed to stdout.
+
+    Returns (g_list, class_flow): the graphs and the number of classes.
+    Raises ValueError if para['num_node'] is smaller than 1.
+    """
+    print('loading data')
+
+    num_node = para['num_node']
+    if num_node < 1:
+        # Previously 0 could surface as a ZeroDivisionError inside the loops
+        # and negative values silently produced no graphs at all.
+        raise ValueError('num_node must be at least 1')
+
+    # One entry per capture file: file name, label, ip given to csv_dataset.
+    addr=['1','2','3','4','5(1)','5(2)','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26']
+    tag=[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+    # Alternative multi-class labelling (disabled); class_flow is a fixed
+    # constant, so it presumably has to be changed along with it -- confirm.
+    #tag=[0,2,4,1,1,3,2,1,2,1,3,1,2,2,2,2,4,3,1,1,3,3,3,3,1,1,2]
+    class_flow=2
+    ip=['75.127.97.72','75.127.97.72','75.127.97.72','75.127.97.72','75.127.97.72',
+        '74.63.40.21','75.127.97.72','97.74.144.108','208.113.162.153','69.84.133.138',
+        '67.220.214.50','97.74.144.108','69.192.24.88','97.74.144.108','203.73.24.75',
+        '97.74.144.108','74.55.1.4','97.74.104.201','74.55.1.4',
+        '69.192.24.88','97.74.144.108','97.74.144.108',
+        '75.127.97.72','75.127.97.72','69.192.24.88','75.127.97.72','74.55.1.4']
+
+    # label -> list of per-capture {flow key: packet list} dicts.
+    all_csv_dict = {0: []}
+    for name, label, target in zip(addr, tag, ip):
+        dos, normal = csv_dataset(name, target)
+        bucket = all_csv_dict.setdefault(label, [])
+        if dos:
+            bucket.append(dos)
+        if normal:
+            all_csv_dict[0].append(normal)
+
+    g_list = []
+    lable_dict = {}  # truncated time difference -> node tag index
+    for label, captures in all_csv_dict.items():
+        for flows in captures:
+            for all_traffic in flows.values():
+                for y in range(len(all_traffic) // num_node):
+                    g, node_tags = _build_flow_graph(
+                        all_traffic, num_node * y, num_node, lable_dict)
+                    g_list.append(S2VGraph(g, label, node_tags))
+    print(lable_dict)
+    print(len(lable_dict))
+    print('end:construct graph')
+
+    for g in g_list:
+        _attach_adjacency(g)
+    print('end_neighbors')
+
+    if degree_as_tag:
+        for g in g_list:
+            g.node_tags = list(dict(g.g.degree).values())
+
+    # Collect the distinct tags. The set is rebuilt with union() exactly as
+    # before so that its iteration order, and with it the column order of
+    # the one-hot features, cannot change.
+    tagset = set([])
+    for g in g_list:
+        tagset = tagset.union(set(g.node_tags))
+    print(len(lable_dict))
+    print(lable_dict)
+    tagset = list(tagset)
+    tag2index = {t: i for i, t in enumerate(tagset)}
+
+    num_normal = 0
+    for g in g_list:
+        if g.label == 0:
+            num_normal += 1
+        # One-hot encode the tag of every node.
+        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
+        rows = range(len(g.node_tags))
+        cols = [tag2index[t] for t in g.node_tags]
+        g.node_features[rows, cols] = 1
+    # Every graph whose label is not 0 is counted as DDoS.
+    num_DDoS = len(g_list) - num_normal
+
+    print('# classes: %d' %class_flow)
+    print('# maximum node tag: %d' % len(tagset))
+
+    print("# data: %d" % len(g_list))
+    print("num_normal:%d" %num_normal)
+    print("num_DDoS:%d" %num_DDoS )
+
+    return g_list,class_flow
+
+def separate_data(graph_list, seed, fold_idx):
+    """Split the graphs into the train/test parts of one of 10 folds.
+
+    graph_list: sequence of objects exposing a .label attribute
+    seed: random_state handed to the shuffled StratifiedKFold
+    fold_idx: index (0..9) of the fold whose test part is returned
+    """
+    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
+    labels = [graph.label for graph in graph_list]
+    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
+    # The labels drive the stratified split; zeros only stand in for X.
+    folds = list(splitter.split(np.zeros(len(labels)), labels))
+    train_idx, test_idx = folds[fold_idx]
+    return [graph_list[i] for i in train_idx], [graph_list[i] for i in test_idx]
+
+