diff options
| author | wujiating <[email protected]> | 2022-05-19 06:44:13 +0000 |
|---|---|---|
| committer | wujiating <[email protected]> | 2022-05-19 06:44:13 +0000 |
| commit | 6f551d1a1b55ca519fb048949e7b3ce74659c851 (patch) | |
| tree | 20328ffee00c5538a7e31dc93258aaca8a3a51e3 /binary_cross.py | |
| parent | 83dc5e4822e2223b5166353e905bba83eee9b1e9 (diff) | |
| parent | 8165bf52b6e96471248da4d6780dcbbe7be65b02 (diff) | |
abc
See merge request wujiating/detection!1
Diffstat (limited to 'binary_cross.py')
| -rw-r--r-- | binary_cross.py | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/binary_cross.py b/binary_cross.py
new file mode 100644
index 0000000..4516c58
--- /dev/null
+++ b/binary_cross.py
@@ -0,0 +1,222 @@
+import pandas as pd
+from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import SVC
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix
+import pickle as pkl
+import numpy as np
+import warnings
+warnings.filterwarnings("ignore")
+
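+# Binary DoH-detection experiment: train classifiers to tell DoH flows
+# (label 0) apart from regular traffic of several categories (label 1)
+# using per-flow statistical features (CICFlowMeter-style names below),
+# and cross-evaluate the models across traffic categories.
+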
+features_name = [
+ "Flow Duration",
+ "Total Fwd Packet",
+ "Total Bwd packets",
+ "Total Length of Fwd Packet",
+ "Total Length of Bwd Packet",
+ "Fwd Packet Length Max",
+ "Fwd Packet Length Min",
+ "Fwd Packet Length Mean",
+ "Fwd Packet Length Std",
+ "Bwd Packet Length Max",
+ "Bwd Packet Length Min",
+ "Bwd Packet Length Mean",
+ "Bwd Packet Length Std",
+ "Flow Bytes/s",
+ "Flow Packets/s",
+ "Flow IAT Mean",
+ "Flow IAT Std",
+ "Flow IAT Max",
+ "Flow IAT Min",
+ "Fwd IAT Total",
+ "Fwd IAT Mean",
+ "Fwd IAT Std",
+ "Fwd IAT Max",
+ "Fwd IAT Min",
+ "Bwd IAT Total",
+ "Bwd IAT Mean",
+ "Bwd IAT Std",
+ "Bwd IAT Max",
+ "Bwd IAT Min",
+ "Fwd PSH Flags",
+ "Bwd PSH Flags",
+ "Fwd URG Flags",
+ "Bwd URG Flags",
+ "Fwd Header Length",
+ "Bwd Header Length",
+ "Fwd Packets/s",
+ "Bwd Packets/s",
+ "Packet Length Min",
+ "Packet Length Max",
+ "Packet Length Mean",
+ "Packet Length Std",
+ "Packet Length Variance",
+ "FIN Flag Count",
+ "SYN Flag Count",
+ "RST Flag Count",
+ "PSH Flag Count",
+ "ACK Flag Count",
+ "URG Flag Count",
+ "CWR Flag Count",
+ "ECE Flag Count",
+ "Down/Up Ratio",
+ "Average Packet Size",
+ "Fwd Segment Size Avg",
+ "Bwd Segment Size Avg",
+ "Fwd Bytes/Bulk Avg",
+ "Fwd Packet/Bulk Avg",
+ "Fwd Bulk Rate Avg",
+ "Bwd Bytes/Bulk Avg",
+ "Bwd Packet/Bulk Avg",
+ "Bwd Bulk Rate Avg",
+ "Subflow Fwd Packets",
+ "Subflow Fwd Bytes",
+ "Subflow Bwd Packets",
+ "Subflow Bwd Bytes",
+ "FWD Init Win Bytes",
+ "Bwd Init Win Bytes",
+ "Fwd Act Data Pkts",
+ "Fwd Seg Size Min",
+ "Active Mean",
+ "Active Std",
+ "Active Max",
+ "Active Min",
+ "Idle Mean",
+ "Idle Std",
+ "Idle Max",
+ "Idle Min",
+]
+
+
+def print_important_feature(sort_index, num=10):
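+    """Print the names of the top `num` features, given indices sorted by importance."""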
+    print("Top important features:")
+ for index in sort_index[:num]:
+ print(features_name[index])
+
+
+def random_forest(train, test, test_ow=None):
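+    """Train a random forest on `train` and predict labels for `test`.
+
+    Both arguments are DataFrames with `features` and `label` columns.
+    If an open-world DataFrame is passed as `test_ow`, its predictions
+    are returned as a second value.
+    """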
+ X = train.features.tolist()
+ Y = train.label.tolist()
+ test_X = test.features.tolist()
+ # print(len(X), len(Y))
+ # print(len(X[0]))
+    rf = RandomForestClassifier()
+    rf.fit(X, Y)
+    # importance = rf.feature_importances_
+    # sort_index = np.flipud(importance.argsort())
+    # print_important_feature(sort_index)
+    pred_ret = rf.predict(test_X)
+    if test_ow is not None:
+        ow_X = test_ow.features.tolist()
+        return pred_ret, rf.predict(ow_X)
+    else:
+        return pred_ret
+
+
+def naive_bayesian(train, test, test_ow=None):
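+    """Gaussian naive Bayes variant; same interface as random_forest."""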
+ X = train.features.tolist()
+ Y = train.label.tolist()
+ test_X = test.features.tolist()
+ # print(len(X), len(Y))
+ # print(len(X[0]))
+    nb = GaussianNB()
+    nb.fit(X, Y)
+    pred_ret = nb.predict(test_X)
+    if test_ow is not None:
+        ow_X = test_ow.features.tolist()
+        return pred_ret, nb.predict(ow_X)
+    else:
+        return pred_ret
+
+
+def decision_tree_classifier(train, test, test_ow=None):
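+    """Decision tree variant; same interface as random_forest."""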
+ X = train.features.tolist()
+ Y = train.label.tolist()
+ test_X = test.features.tolist()
+ # print(len(X), len(Y))
+ # print(len(X[0]))
+    dt = DecisionTreeClassifier()
+    dt.fit(X, Y)
+    # importance = dt.feature_importances_
+    # sort_index = np.flipud(importance.argsort())
+    # print_important_feature(sort_index)
+    pred_ret = dt.predict(test_X)
+    if test_ow is not None:
+        ow_X = test_ow.features.tolist()
+        return pred_ret, dt.predict(ow_X)
+    else:
+        return pred_ret
+
+
+def svm_classifier(train, test, test_ow=None):
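+    """SVM variant (RBF kernel by default); same interface as random_forest."""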
+ X = train.features.tolist()
+ Y = train.label.tolist()
+ test_X = test.features.tolist()
+ # print(len(X), len(Y))
+ # print(len(X[0]))
+    svm = SVC()
+    svm.fit(X, Y)
+    pred_ret = svm.predict(test_X)
+    if test_ow is not None:
+        ow_X = test_ow.features.tolist()
+        return pred_ret, svm.predict(ow_X)
+    else:
+        return pred_ret
+
+
+if __name__ == "__main__":
+ # for file in ["./result/web_features.pkl", "./result/chat_features.pkl", "./result/email_features.pkl",
+ # "./result/voip_features.pkl", "./result/file_features.pkl"]:
+ doh_dataset = pkl.load(open("./result/doh_features.pkl", "rb"))
+ web_dataset = pkl.load(open("./result/web_features.pkl", "rb"))
+    chat_dataset = pkl.load(open("./result/chat_features.pkl", "rb"))
+    email_dataset = pkl.load(open("./result/email_features.pkl", "rb"))
+    voip_dataset = pkl.load(open("./result/voip_features.pkl", "rb"))
+    file_dataset = pkl.load(open("./result/file_features.pkl", "rb"))
+
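+    # DoH traffic is the target class (label 0); all other categories are 1.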
+    doh_dataset['label'] = 0
+    web_dataset['label'] = 1
+    chat_dataset['label'] = 1
+    email_dataset['label'] = 1
+    voip_dataset['label'] = 1
+    file_dataset['label'] = 1
+
+ # doh_dataset = doh_dataset.sample(min(len(web_dataset), len(doh_dataset) * 1))
+ # web_dataset = web_dataset.sample(min(len(web_dataset), len(doh_dataset) * 1))
+    # print("Dataset composition:")
+    # print(f"Closed-world positive:negative ratio is 1:{len(web_dataset) // len(doh_dataset)}, "
+    #       f"{len(doh_dataset)} positive samples, {len(web_dataset)} negative samples")
+
+    print("Data loaded successfully!")
+ # cw_dataset = pd.concat([web_dataset, doh_dataset])
+ # ow_dataset = pd.concat([ow_web_dataset, ow_doh_dataset])
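+    # Select the classifier; naive_bayesian, decision_tree_classifier, and
+    # svm_classifier share the same interface and can be swapped in here.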
+ classify = random_forest
+
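+    # Cross-category evaluation: train on a balanced mix of DoH and one
+    # non-DoH category, then test on each *other* category to see how the
+    # model generalizes to traffic types unseen during training.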
+    for negative_dataset in [web_dataset, chat_dataset, email_dataset, voip_dataset, file_dataset]:
+        for test_dataset in [web_dataset, chat_dataset, email_dataset, voip_dataset, file_dataset]:
+            if negative_dataset is test_dataset:
+                continue
+            n_samples = min(len(negative_dataset), len(doh_dataset))
+            positive_set = doh_dataset.sample(n_samples)
+            negative_set = negative_dataset.sample(n_samples)
+            train_set = pd.concat([positive_set, negative_set])
+            predict_results = classify(train_set, test_dataset)
+            gt_Y = test_dataset.label.tolist()
+ precision = precision_score(gt_Y, predict_results, pos_label=1, average="binary")
+ recall = recall_score(gt_Y, predict_results, pos_label=1, average="binary")
+ f1 = f1_score(gt_Y, predict_results, pos_label=1, average="binary")
+ acc = accuracy_score(gt_Y, predict_results)
+ print(confusion_matrix(gt_Y, predict_results))
+            print("Closed-world test precision: ", precision, end="\t")
+            print("Closed-world test recall: ", recall, end="\t")
+            print("Closed-world test F1: ", f1, end="\t")
+            print("Closed-world test accuracy: ", acc)