summaryrefslogtreecommitdiff
path: root/cicflow.py
blob: ae1bdea39ebb09f404c949a2731a9506e7c899b4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import csv
import os
import pandas as pd
import _pickle as pkl
import numpy as np


def merge_csv(input_dir="C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\myData\\OW\\web",
              save_filename="./result/ow_doh_features.csv", truncated_num=5, label=0):
    """Merge every CICFlowMeter CSV in *input_dir* into one labeled feature table.

    Keeps only TCP flows (protocol column == 6) whose per-direction packet
    counts pass a minimum-size filter, selects a fixed subset of the flow
    statistics, sanitises non-finite values, and writes the result both as
    a CSV and as a pickle with the same path but a ``.pkl`` extension.

    Args:
        input_dir: directory scanned (non-recursively) for ``*.csv`` files.
        save_filename: path of the output CSV; the pickle is written next
            to it with the extension swapped to ``.pkl``.
        truncated_num: minimum combined count (features[1] + features[2])
            a flow must reach to be kept.
        label: class label stored alongside every feature vector.
    """
    frames = [
        pd.read_csv(os.path.join(input_dir, name))
        for name in os.listdir(input_dir)
        if name.endswith(".csv")
    ]
    # Missing statistics become a large sentinel so the filters below never
    # have to deal with NaN comparisons.
    df = pd.concat(frames).fillna(1e10)

    # Single pass: collect accepted rows, then build the frame once (the
    # original counted in a first pass only to pre-size the DataFrame).
    records = []
    for row in df.iloc[:, :-1].values.tolist():  # last column (text label) dropped
        if row[5] != 6:  # column 5 is the protocol number; keep TCP only
            continue
        feats = row[7:]
        # Drop flows that are too short overall or empty in one direction.
        if feats[1] + feats[2] < truncated_num or feats[1] < 1e-5 or feats[2] < 1e-5:
            continue
        # Fixed subset of the CICFlowMeter statistics used as model features.
        selected = (feats[0:3] + feats[5:13] + feats[37:41]
                    + feats[15:23] + feats[24:28] + feats[50:51])
        cleaned = []
        for value in selected:
            if isinstance(value, str):
                value = float(value)
            if not np.isfinite(value):  # NaN or +/-inf from CICFlowMeter
                value = 1e7
            cleaned.append(value)
        records.append([cleaned, label])

    save_df = pd.DataFrame(records, columns=["features", "label"])
    save_df.to_csv(save_filename)
    # splitext (not str.replace) so a "csv" substring elsewhere in the path
    # cannot corrupt the pickle filename.
    pkl_name = os.path.splitext(save_filename)[0] + ".pkl"
    with open(pkl_name, "wb") as f_pkl:
        pkl.dump(save_df, f_pkl)


def merge_all_pkl():
    """Concatenate the per-class closed-world pickles into one dataset.

    Reads ``./result/cw_<class>_features.pkl`` for each traffic class,
    concatenates them (row order preserved from the original implementation)
    and writes ``./result/all_features.csv`` plus a matching pickle.
    """
    def _load(class_name):
        # Context manager so each handle is closed promptly (the original
        # leaked seven open file objects).
        with open("./result/cw_%s_features.pkl" % class_name, "rb") as f:
            return pkl.load(f)

    # Concatenation order must stay exactly as before so the merged dataset's
    # row order is unchanged.
    frames = [_load(name) for name in
              ("doh", "web", "chat", "email", "streaming", "file", "voip")]
    df = pd.concat(frames)
    save_filename = "./result/all_features.csv"
    df.to_csv(save_filename)
    # splitext avoids mangling any other "csv" substring in the path.
    pkl_name = os.path.splitext(save_filename)[0] + ".pkl"
    with open(pkl_name, "wb") as f_pkl:
        pkl.dump(df, f_pkl)


if __name__ == '__main__':
    # (input directory, output CSV) pairs; the class label passed to
    # merge_csv is simply each pair's position in this list.
    jobs = [
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\myData\\doh",
         "./result/doh_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\myData\\web",
         "./result/web_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\File",
         "./result/file_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\Email",
         "./result/email_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\VoIP",
         "./result/voip_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\Chat",
         "./result/chat_features.csv"),
    ]
    for label, (input_dir, save_filename) in enumerate(jobs):
        print(input_dir)
        print(save_filename)
        merge_csv(input_dir, save_filename, 5, label=label)
    # merge_all_pkl()