Upload New File

author: yifei cheng <[email protected]> 2023-06-26 12:33:31 +0000
committer: yifei cheng <[email protected]> 2023-06-26 12:33:31 +0000
commit: 4c7b82c46d0eea31ebd534c2b1fc5085d2526f67 (patch)
tree: b69564c191056086c814989beb8be1bc9233ed65
parent: 74793c2daba349aef2e07895379fa0f134f068a2 (diff)
1 files changed, 163 insertions, 0 deletions
diff --git a/evaluate/InformationGain.py b/evaluate/InformationGain.py
new file mode 100644
index 0000000..d878cee
--- /dev/null
+++ b/evaluate/InformationGain.py
@@ -0,0 +1,163 @@
+"""
+Date: 2022-04-13
+Author: [email protected]
+Desc: Calc the information gain of features
+"""
+import numpy as np
+import pandas as pd
+import json
+
+# class InformationGain():
+#     """
+#     Information gain
+#     """
+#     def __init__(self, X, y):
+#         self.X = X
+#         self.y = y
+#         self.totalSampleCount = X.shape[0]
+#         self.totalSystemEntropy = 0
+#         self.totalClassCountDict = {}
+#         self.nonzeroPostion = X.T.nonzero()
+#         self.igResult = []
+#         self.wordExistSampleCount = 0
+#         self.wordExistClassDict = {}
+#         self.iter()
+#
+#     def get_result(self):
+#         return self.igResult
+#
+#     def cal_total_system_entropy(self):
+#         # Count how many samples belong to each class
+#         for label in self.y:
+#             if label not in self.totalClassCountDict:
+#                 self.totalClassCountDict[label] = 1
+#             else:
+#                 self.totalClassCountDict[label] += 1
+#         for cls in self.totalClassCountDict:
+#             probs = self.totalClassCountDict[cls] / float(self.totalSampleCount)
+#             self.totalSystemEntropy -= probs * np.log(probs)
+#
+#     # Iterate over nonzeroPostion and compute the information gain of each word in turn
+#     def iter(self):
+#         self.cal_total_system_entropy()
+#
+#         pre = 0
+#         for i in range(len(self.nonzeroPostion[0])):
+#             if i != 0 and self.nonzeroPostion[0][i] != pre:
+#                 for notappear in range(pre + 1, self.nonzeroPostion[0][i]):
+#                     self.igResult.append(0.0)
+#                 ig = self.cal_infomation_gain()
+#                 self.igResult.append(ig)
+#                 self.wordExistSampleCount = 0
+#                 self.wordExistClassDict = {}
+#                 pre = self.nonzeroPostion[0][i]
+#             self.wordExistSampleCount += 1
+#             yclass = self.y[self.nonzeroPostion[1][i]]
+#             if yclass not in self.wordExistClassDict:
+#                 self.wordExistClassDict[yclass] = 1
+#             else:
+#                 self.wordExistClassDict[yclass] += 1
+#         # Compute the IG of the last feature
+#         ig = self.cal_infomation_gain()
+#         self.igResult.append(ig)
+#
+#     def cal_infomation_gain(self):
+#         x_exist_entropy = 0
+#         x_nonexist_entropy = 0
+#         for cls in self.wordExistClassDict:
+#             probs = self.wordExistClassDict[cls] / float(self.wordExistSampleCount)
+#             x_exist_entropy -= probs * np.log(probs)
+#             probs = (self.totalClassCountDict[cls] - self.wordExistClassDict[cls]) / float(self.totalSampleCount - self.wordExistSampleCount)
+#             if probs == 0:
+#                 x_nonexist_entropy = 0
+#             else:
+#                 x_nonexist_entropy -= probs * np.log(probs)
+#
+#         for cls in self.totalClassCountDict:
+#             if cls not in self.wordExistClassDict:
+#                 probs = self.totalClassCountDict[cls] / float(self.totalSampleCount - self.wordExistSampleCount)
+#                 x_nonexist_entropy -= probs * np.log(probs)
+#         ig = self.totalSystemEntropy - ((self.wordExistSampleCount / float(self.totalSampleCount)) * x_exist_entropy +
+#                                         ((self.totalSampleCount-self.wordExistSampleCount)/float(self.totalSampleCount)*x_nonexist_entropy))
+#         return ig
+
+import numpy as np
+import pandas as pd
+import math
+
+
+class InformationGain():
+    """Best-split information gain of numeric features w.r.t. class labels.
+
+    Each feature column is split at the midpoint between every pair of
+    adjacent distinct values, and the split with the largest gain is kept.
+    Entropies use the natural logarithm.  The reported "ratio" is
+    gain / H(label), not the C4.5 split-information ratio.
+
+    :param feature: 2-D array-like of numbers, shape (n_samples, n_features).
+    :param label: sequence of hashable class labels, one per sample.
+    :raises ValueError: if ``feature`` is not 2-D or its number of rows
+        differs from ``len(label)``.
+    """
+
+    def __init__(self, feature, label):
+        feature = np.array(feature)
+        label = list(label)  # accept any sequence, not only ``list``
+        num_of_label = len(label)
+        if feature.ndim != 2 or feature.shape[0] != num_of_label:
+            raise ValueError("feature must be 2-D with one row per label")
+
+        temp_ent = self._entropy(label)
+        # Publish the results up front so the getters work with zero features.
+        self.shannoEnt = temp_ent
+        self.conditionEnt = []
+        self.InformationGain = []
+        self.InformationGainRatio = []
+
+        for i in range(feature.shape[1]):
+            feature1 = feature[:, i].tolist()
+            # Sorted distinct values: duplicates must not produce a threshold
+            # that coincides with an actual sample value.
+            distinct = np.unique(feature[:, i])
+            information_gain = 0
+            # With no usable split the conditional entropy equals H(label).
+            best_condition_ent = temp_ent
+            for low, high in zip(distinct[:-1], distinct[1:]):
+                thre = (low + high) / 2
+                # ``<=`` keeps samples equal to the threshold in a partition
+                # instead of silently dropping them from both sides.
+                lower = [lab for val, lab in zip(feature1, label) if val <= thre]
+                highter = [lab for val, lab in zip(feature1, label) if val > thre]
+                if not lower or not highter:
+                    # Degenerate split (NaN, or midpoint rounded onto a value).
+                    continue
+                temp_condition_ent = (len(lower) * self._entropy(lower)
+                                      + len(highter) * self._entropy(highter)) / num_of_label
+                gain = temp_ent - temp_condition_ent
+                if gain > information_gain:
+                    information_gain = gain
+                    best_condition_ent = temp_condition_ent
+
+            # Guard the division: a single-class label set has zero entropy.
+            if temp_ent > 0:
+                information_gain_ratio = information_gain / temp_ent
+            else:
+                information_gain_ratio = 0
+            self.conditionEnt.append(best_condition_ent)
+            self.InformationGain.append(information_gain)
+            self.InformationGainRatio.append(information_gain_ratio)
+
+    @staticmethod
+    def _entropy(labels):
+        """Return the natural-log Shannon entropy of ``labels`` (0 if empty)."""
+        counts = {}
+        for item in labels:
+            counts[item] = counts.get(item, 0) + 1
+        total = len(labels)
+        return sum(-(c / total) * math.log(c / total) for c in counts.values())
+
+    def getEnt(self):
+        """Return the Shannon entropy of the label sequence."""
+        return self.shannoEnt
+
+    def getConditionEnt(self):
+        """Return, per feature, the conditional entropy at its best split."""
+        return self.conditionEnt
+
+    def getInformationGain(self):
+        """Return, per feature, the largest information gain over all splits."""
+        return self.InformationGain
+
+    def getInformationGainRatio(self):
+        """Return, per feature, the best gain divided by the label entropy."""
+        return self.InformationGainRatio
+
+if __name__ == '__main__':
+    # For every malware family, load its flow dump, compute the per-feature
+    # information gain and collect the results in a single JSON file.
+    # The last column of each dump is used as the label, the rest as features.
+    # NOTE(review): allow_pickle=True unpickles file contents -- only load
+    # dumps from a trusted source.
+    ig_data = {}
+    for family in ('Dridex', 'Gozi', 'Quakbot', 'Tofsee', 'TrickBot'):
+        npy_path = "/home/sunhanwu/datasets/MTA/cicflownpy/{}.npy".format(family)
+        print("load {} data".format(family))
+        frame = pd.DataFrame(np.load(npy_path, allow_pickle=True))
+        features = frame.iloc[:, :-1].values.tolist()
+        labels = frame.iloc[:, -1].values.tolist()
+        ig_data[family] = InformationGain(features, labels).getInformationGain()
+        print("calc {} information gain done.".format(family))
+    out_path = "/home/sunhanwu/work2021/TrafficAdversarial/experiment/src/result/MTA_Statistic_IG.json"
+    with open(out_path, 'w') as f:
+        json.dump(ig_data, f)
author	yifei cheng <[email protected]>	2023-06-26 12:33:31 +0000
committer	yifei cheng <[email protected]>	2023-06-26 12:33:31 +0000
commit	4c7b82c46d0eea31ebd534c2b1fc5085d2526f67 (patch)
tree	b69564c191056086c814989beb8be1bc9233ed65
parent	74793c2daba349aef2e07895379fa0f134f068a2 (diff)