authorJinghua <[email protected]>2023-06-25 00:09:54 +0800
committerJinghua <[email protected]>2023-06-25 00:09:54 +0800
commit834734a03e00945fc5f4355ad81a76c8d607d4d0 (patch)
tree0b74ba445466e3923f919a67d3b0624b33afbe82
parent52db9070111000eea934711deec44f86f4784651 (diff)
23-0625
-rw-r--r--  KM-SVM/KM-SVM.py    326
-rw-r--r--  KM-SVM/kmeans.py    192
-rw-r--r--  KM-SVM/particle.py  103
-rw-r--r--  KM-SVM/pso.py        63
-rw-r--r--  KM-SVM/readme.md      1
5 files changed, 685 insertions, 0 deletions
diff --git a/KM-SVM/KM-SVM.py b/KM-SVM/KM-SVM.py
new file mode 100644
index 0000000..9c4cc77
--- /dev/null
+++ b/KM-SVM/KM-SVM.py
@@ -0,0 +1,326 @@
+import itertools
+import time
+from sklearn.preprocessing import normalize
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.neural_network import MLPClassifier
+from sklearn.model_selection import train_test_split
+from sklearn import svm
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pandas as pd
+import sklearn.metrics as sm
+from scipy.spatial.distance import cdist
+
+label = ['BENIGN', 'Bot', 'DDoS', 'GoldenEye', 'DoS Hulk',
+         'Slowhttp', 'SSH', 'FTP', 'PortScan', 'slowloris', 'BruteForce', 'XSS']
+
+
+def HandleData(path):
+ list_dir = os.listdir(path)
+ fd_data = []
+ for it in list_dir:
+ data = pd.read_csv(path + '/' + it)
+ fd_data.append(data)
+    # concatenate every CSV in the directory into a single DataFrame
+    data = pd.concat(fd_data)
+ data = data.dropna(axis=0, how='any')
+ data = data.replace(',,', np.nan, inplace=False)
+ data.replace("Infinity", 0, inplace=True)
+
+ data.replace('Infinity', 0.0, inplace=True)
+ data.replace('NaN', 0.0, inplace=True)
+ data = data.replace([np.inf, -np.inf], np.nan)
+ data = data.dropna(axis=0, how='any')
+ n_row, n_col = data.shape
+ print('row:', n_row, 'col:', n_col)
+
+ return data
+
+
+def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix', cmap=plt.cm.Blues):
+ """
+ - cm : 计算出的混淆矩阵的值
+ - classes : 混淆矩阵中每一行每一列对应的列
+ - normalize : True:显示百分比, False:显示个数
+ """
+ if normalize:
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ print("显示百分比:")
+ np.set_printoptions(formatter={'float': '{: 0.2f}'.format})
+ print(cm)
+ else:
+        print('Showing raw counts:')
+ print(cm)
+ plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.title(title)
+ plt.colorbar()
+ tick_marks = np.arange(len(classes))
+ plt.xticks(tick_marks, classes, rotation=90)
+ plt.yticks(tick_marks, classes)
+
+ plt.ylim(len(classes) - 0.5, -0.5)
+ fmt = '.2f' if normalize else 'd'
+ thresh = cm.max() / 2.
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+ plt.text(j, i, format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="white" if cm[i, j] > thresh else "black")
+ plt.tight_layout()
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.show()
+
+
+def Train(data, decomponent=False):
+    # handle null and infinite values (already done in HandleData)
+ '''
+ data = data.replace([np.inf, -np.inf], np.nan)
+ data = data.dropna(axis=0, how='any')
+ '''
+
+ print(data[' Label'].value_counts())
+ #select features
+ basic_feature = [' Label','Flow ID',' Source IP',' Source Port',' Destination IP',' Destination Port',' Protocol',' Timestamp']
+ tcp_ip_feature = ['FIN Flag Count', ' SYN Flag Count',
+ ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count',
+ ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count','Init_Win_bytes_forward', ' Init_Win_bytes_backward']
+ statistical_feature = [' Flow Duration', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
+ ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
+ 'Bwd Packet Length Max', ' Bwd Packet Length Min',' Bwd Packet Length Mean', ' Bwd Packet Length Std',
+ 'Fwd IAT Total',' Fwd IAT Mean', ' Fwd IAT Std',' Fwd IAT Max', ' Fwd IAT Min',
+ 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
+ ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' Min Packet Length', ' Max Packet Length',
+ 'Fwd Packets/s',' Bwd Packets/s',
+ ' Flow Packets/s' ,'Flow Bytes/s']
+ middle_feature = ['pre_label','port_flow']
+ add_feature = ['cu_ipnum','cu_dip','cu_dport','cu_target']
+ x_columns = tcp_ip_feature + statistical_feature + add_feature
+ print(x_columns)
+ x = data[x_columns].values
+ x = normalize(x, axis=0, norm='max')
+ dummies = pd.get_dummies(data[' Label'])
+ #dummies = pd.get_dummies(data['label'])
+ outcomes = dummies.columns
+ print(outcomes)
+ num_classes = len(outcomes)
+    print('[traffic] number of classes:', num_classes)
+ y = dummies.values
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=20)
+
+ return x_train, y_train, x_test, y_test
+
+def SVM(train_X, train_Y, test_X, test_Y):
+ print('[SVM] train ...')
+ train_Y = [np.where(r == 1)[0][0] for r in train_Y]
+ test_Y = [np.where(r == 1)[0][0] for r in test_Y]
+ t1 = time.time()
+ clf = svm.SVC(decision_function_shape='ovr', max_iter=900, kernel='rbf')
+ model = clf.fit(train_X, train_Y)
+ y_hat = model.predict(test_X)
+ acc = accuracy_score(test_Y, y_hat)
+ t2 = time.time()
+ print('acc:', acc)
+ print('using time:', t2 - t1, 'sec')
+ matrix = sm.confusion_matrix(test_Y, y_hat)
+ print(matrix)
+ report = classification_report(test_Y, y_hat)
+ print(report)
+ print('-' * 20)
+ '''
+ import pickle
+ with open('svm.pickle','wb') as f:
+ pickle.dump(clf,f)
+ '''
+ #plot_confusion_matrix(matrix, label, True, 'SVM Confusion matrix')
+
+def Elbow_kmeans(X):
+ K = range(1, 40)
+ meandistortions = []
+ for k in K:
+ kmeans = KMeans(n_clusters=k)
+ kmeans.fit(X)
+ meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0])
+ plt.plot(K, meandistortions, 'bx-')
+ plt.xlabel('k')
+ plt.ylabel('Average Dispersion')
+ plt.title('Selecting k with the Elbow Method')
+ plt.show()
+
+def Kmeans(data):
+ print(data[' Label'].value_counts())
+ basic_feature = [' Label','Flow ID',' Source IP',' Source Port',' Destination IP',' Destination Port',' Protocol',' Timestamp']
+ tcp_ip_feature = ['FIN Flag Count', ' SYN Flag Count',
+ ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count',
+ ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count','Init_Win_bytes_forward', ' Init_Win_bytes_backward']
+ statistical_feature = [' Flow Duration', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
+ ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
+ 'Bwd Packet Length Max', ' Bwd Packet Length Min',' Bwd Packet Length Mean', ' Bwd Packet Length Std',
+ ' Fwd IAT Mean', ' Fwd IAT Std',' Fwd IAT Max', ' Fwd IAT Min',
+ 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
+ ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' Min Packet Length', ' Max Packet Length',
+ 'Fwd Packets/s',' Bwd Packets/s',
+ ' Flow Packets/s' ,'Flow Bytes/s']
+ x_columns = tcp_ip_feature + statistical_feature
+ x = data[x_columns].values
+ x = normalize(x, axis=0, norm='max')
+    # one-hot encode the labels
+ dummies = pd.get_dummies(data[' Label'])
+ #dummies = pd.get_dummies(data['label'])
+ outcomes = dummies.columns
+ print(outcomes)
+ num_classes = len(outcomes)
+    print('[traffic] number of classes:', num_classes)
+ y = dummies.values
+ Y = y[:,0]
+    # clustering
+ KMEANS = KMeans(n_clusters = 30, max_iter = 300,n_init = 10,random_state = 0)
+ kmeans = KMEANS.fit(x)
+ Z = kmeans.labels_
+ inertia = KMEANS.inertia_
+ #Kmeans Results
+ kmeansR = pd.crosstab(Y,Z)
+ maxVal = kmeansR.idxmax()
+ print(kmeansR)
+ print(len(Z))
+ #Z = pso_kmeans(x,Y)
+ return Z
+
+def pso_kmeans(X,Y):
+ from pso import ParticleSwarmOptimizedClustering
+ from particle import quantization_error, calc_sse
+ from kmeans import KMeans
+ from sklearn.metrics import silhouette_score
+ from scipy.spatial.distance import cdist
+ distortions = []
+ K = [30]
+ for k in K:
+ pso_rep = ParticleSwarmOptimizedClustering(
+ n_cluster=k, n_particles=10, data=X, hybrid=True, max_iter=50, print_debug=2000)
+ pso_rep.run()
+ pso_kmeans = KMeans(n_cluster=k, init_pp=False, seed=2022)
+ pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
+ predicted_pso_rep = pso_kmeans.predict(X)
+
+ silhouette = silhouette_score(X, predicted_pso_rep)
+ sse = calc_sse(centroids=pso_rep.gbest_centroids, data=X, labels=predicted_pso_rep)
+ quantization = pso_rep.gbest_score
+ Z = pso_kmeans.predict(X)
+ #
+ distortions.append(sum(np.min(cdist(X, pso_kmeans.centroid, 'euclidean'), axis=1)) / X.shape[0])
+ print(pso_kmeans.centroid)
+ print(type(Z))
+ kmeansR = pd.crosstab(Y,Z)
+ maxVal = kmeansR.idxmax()
+ print(kmeansR,"\n\n")
+    # save the trained clustering model
+ import pickle
+ with open('train_model_sta.pkl', 'wb') as f:
+ pickle.dump(pso_kmeans, f)
+ '''
+    # load a previously trained model
+ import pickle
+ pso_kmeans = pickle.load(file=open('train_model_all.pkl', 'rb'))
+ '''
+ Z = pso_kmeans.predict(X)
+ kmeansR = pd.crosstab(Y,Z)
+ maxVal = kmeansR.idxmax()
+ print(kmeansR,"\n\n")
+ return Z
+
+result_dict = {}
+def formula(x):
+ if x in result_dict:
+ return result_dict[x]
+ else:
+ return 0
+
+def process_data(data,pre_label):
+ data['pre_label'] = pre_label
+ x_columns = ['pre_label',' Destination Port']
+ label_num = data['pre_label'].unique()
+    for i in label_num:
+ tmp_data = data.loc[data['pre_label'] == i]
+ flow_num = len(tmp_data)
+ port_num = len(tmp_data[' Destination Port'].unique())
+ tmp_result = port_num
+ result_dict[i] = tmp_result
+ print(result_dict)
+ data['port_flow'] = ''
+ data['port_flow'] = data.apply(lambda row: formula(row['pre_label']), axis=1)
+ return data
+
+def add_feature(data):
+    '''
+    Add cluster-level features
+    '''
+    # number of targets (destination IP-port pairs) contacted by the cluster this scanning node (IP) belongs to
+ cu_target = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_target:
+ cu_target[row['pre_label']] = set()
+ cu_target[row['pre_label']].add(str(row[' Destination IP'])+'-'+str(row[' Destination Port']))
+ data['cu_target'] = ''
+ def formula_cu_target(x):
+ if x in cu_target:
+ return len(cu_target[x])
+ else:
+ return 0
+ data['cu_target'] = data.apply(lambda row: formula_cu_target(row['pre_label']), axis=1)
+    # number of nodes (source IPs) in the cluster this scanning node belongs to
+ cu_ipnum = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_ipnum:
+ cu_ipnum[row['pre_label']] = set()
+ cu_ipnum[row['pre_label']].add(row[' Source IP'])
+ data['cu_ipnum'] = ''
+ def formula_cu_ipnum(x):
+ if x in cu_ipnum:
+ return len(cu_ipnum[x])
+ else:
+ return 0
+ data['cu_ipnum'] = data.apply(lambda row: formula_cu_ipnum(row['pre_label']), axis=1)
+
+    # number of destination IPs contacted by the cluster this scanning node (IP) belongs to
+ cu_dip = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_dip:
+ cu_dip[row['pre_label']] = set()
+ cu_dip[row['pre_label']].add(str(row[' Destination IP']))
+ data['cu_dip'] = ''
+ def formula_cu_dip(x):
+ if x in cu_dip:
+ return len(cu_dip[x])
+ else:
+ return 0
+ data['cu_dip'] = data.apply(lambda row: formula_cu_dip(row['pre_label']), axis=1)
+    # number of destination ports contacted by the cluster this scanning node (IP) belongs to
+ cu_dport = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_dport:
+ cu_dport[row['pre_label']] = set()
+ cu_dport[row['pre_label']].add(str(row[' Destination Port']))
+ data['cu_dport'] = ''
+ def formula_cu_dport(x):
+ if x in cu_dport:
+ return len(cu_dport[x])
+ else:
+ return 0
+ data['cu_dport'] = data.apply(lambda row: formula_cu_dport(row['pre_label']), axis=1)
+
+ return data
+
+def main():
+    data = HandleData(r'H:\dataset\CICIDS2017\pcap\CIC\cic_trafficlabeling\input\Test')
+ pre_label = Kmeans(data)
+ result_data = process_data(data,pre_label)
+ result_data = add_feature(result_data)
+ train_X, train_Y, test_X, test_Y= Train(result_data)
+ SVM(train_X, train_Y, test_X, test_Y)
+ # MLP(train_X, train_Y, test_X, test_Y)
+
+if __name__ == '__main__':
+ main()
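
The add_feature() step above derives four cluster-level counts (cu_target, cu_ipnum, cu_dip, cu_dport) by looping over rows and accumulating sets per cluster. A minimal sketch of an equivalent pandas groupby formulation, assuming the same CICIDS2017 column names and the pre_label cluster ids produced by Kmeans():

    # sketch only, not part of this commit; column names as in the CSVs above
    import pandas as pd

    def add_cluster_features(data: pd.DataFrame) -> pd.DataFrame:
        grouped = data.groupby('pre_label')
        # distinct source IPs per cluster
        data['cu_ipnum'] = grouped[' Source IP'].transform('nunique')
        # distinct destination IPs / destination ports per cluster
        data['cu_dip'] = grouped[' Destination IP'].transform('nunique')
        data['cu_dport'] = grouped[' Destination Port'].transform('nunique')
        # distinct (destination IP, destination port) targets per cluster
        target = data[' Destination IP'].astype(str) + '-' + data[' Destination Port'].astype(str)
        data['cu_target'] = target.groupby(data['pre_label']).transform('nunique')
        return data
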
diff --git a/KM-SVM/kmeans.py b/KM-SVM/kmeans.py
new file mode 100644
index 0000000..373d36e
--- /dev/null
+++ b/KM-SVM/kmeans.py
@@ -0,0 +1,192 @@
+"""K-Means module, contain K-Means implementation inside KMeans class
+"""
+
+import numpy
+
+
+def calc_sse(centroids: numpy.ndarray, labels: numpy.ndarray, data: numpy.ndarray):
+ distances = 0
+ for i, c in enumerate(centroids):
+ idx = numpy.where(labels == i)
+ dist = numpy.sum((data[idx] - c)**2)
+ distances += dist
+ return distances
+
+
+class KMeans:
+ """K-Means clustering algorithm
+
+ Attributes
+ ----------
+ n_cluster : int
+        Number of clusters applied to the data
+    init_pp : bool
+        Initialization method: whether to use K-Means++ or not
+        (the default is True, which uses K-Means++)
+ max_iter : int
+ Max iteration to update centroid (the default is 300)
+ tolerance : float
+ Minimum centroid update difference value to stop iteration (the default is 1e-4)
+ seed : int
+ Seed number to use in random generator (the default is None)
+ centroid : list
+ List of centroid values
+ SSE : float
+ Sum squared error score
+ """
+
+ def __init__(
+ self,
+ n_cluster: int,
+ init_pp: bool = True,
+ max_iter: int = 300,
+ tolerance: float = 1e-4,
+ seed: int = None):
+ """Instantiate K-Means object
+
+ Parameters
+ ----------
+ n_cluster : int
+            Number of clusters applied to the data
+        init_pp : bool, optional
+            Initialization method: whether to use K-Means++ or not
+            (the default is True, which uses K-Means++)
+ max_iter : int, optional
+            Max iteration to update centroid (the default is 300)
+ tolerance : float, optional
+ Minimum centroid update difference value to stop iteration (the default is 1e-4)
+ seed : int, optional
+ Seed number to use in random generator (the default is None)
+ """
+
+ self.n_cluster = n_cluster
+ self.max_iter = max_iter
+ self.tolerance = tolerance
+ self.init_pp = init_pp
+ self.seed = seed
+ self.centroid = None
+ self.SSE = None
+
+ def fit(self, data: numpy.ndarray):
+ """Fit K-Means algorithm to given data
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data matrix to be fitted
+
+ """
+ self.centroid = self._init_centroid(data)
+ for _ in range(self.max_iter):
+ distance = self._calc_distance(data)
+ cluster = self._assign_cluster(distance)
+ new_centroid = self._update_centroid(data, cluster)
+ diff = numpy.abs(self.centroid - new_centroid).mean()
+ self.centroid = new_centroid
+
+ if diff <= self.tolerance:
+ break
+
+ self.SSE = calc_sse(self.centroid, cluster, data)
+
+ def predict(self, data: numpy.ndarray):
+ """Predict new data's cluster using minimum distance to centroid
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ New data to be predicted
+
+ """
+ distance = self._calc_distance(data)
+ # print(distance.shape)
+ cluster = self._assign_cluster(distance)
+ # print(cluster.shape)
+ return cluster
+
+ def _init_centroid(self, data: numpy.ndarray):
+ """Initialize centroid using random method or KMeans++
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data matrix to sample from
+
+ """
+ if self.init_pp:
+ numpy.random.seed(self.seed)
+ centroid = [int(numpy.random.uniform()*len(data))]
+ for _ in range(1, self.n_cluster):
+ dist = []
+ dist = [min([numpy.inner(data[c]-x, data[c]-x) for c in centroid])
+ for i, x in enumerate(data)]
+ dist = numpy.array(dist)
+ dist = dist / dist.sum()
+ cumdist = numpy.cumsum(dist)
+
+ prob = numpy.random.rand()
+ for i, c in enumerate(cumdist):
+ if prob > c and i not in centroid:
+ centroid.append(i)
+ break
+ centroid = numpy.array([data[c] for c in centroid])
+ else:
+ numpy.random.seed(self.seed)
+ idx = numpy.random.choice(range(len(data)), size=(self.n_cluster))
+ centroid = data[idx]
+ # print(centroid)
+ return centroid
+
+ def _calc_distance(self, data: numpy.ndarray):
+ """Calculate distance between data and centroids
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data which distance to be calculated
+
+ """
+ distances = []
+ for c in self.centroid:
+ distance = numpy.sum((data - c) * (data - c), axis=1)
+ distances.append(distance)
+
+ distances = numpy.array(distances)
+ distances = distances.T
+ return distances
+
+ def _assign_cluster(self, distance: numpy.ndarray):
+ """Assign cluster to data based on minimum distance to centroids
+
+ Parameters
+ ----------
+ distance : numpy.ndarray
+ Distance from each data to each centroid
+
+ """
+ cluster = numpy.argmin(distance, axis=1)
+ return cluster
+
+ def _update_centroid(self, data: numpy.ndarray, cluster: numpy.ndarray):
+ """Update centroid from means of each cluster's data
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data matrix to get mean from
+ cluster : numpy.ndarray
+ Cluster label for each data
+
+ """
+ centroids = []
+ for i in range(self.n_cluster):
+ idx = numpy.where(cluster == i)
+ centroid = numpy.mean(data[idx], axis=0)
+ centroids.append(centroid)
+ centroids = numpy.array(centroids)
+ return centroids
+
+
+if __name__ == "__main__":
+
+ pass
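
The KMeans class above exposes a small fit/predict interface with optional K-Means++ initialization. A minimal usage sketch on toy data (the array shape and parameter values are illustrative only):

    import numpy

    from kmeans import KMeans

    data = numpy.random.rand(200, 5)                # 200 samples, 5 features
    km = KMeans(n_cluster=3, init_pp=True, seed=0)
    km.fit(data)                                    # iterate until centroids stop moving
    labels = km.predict(data)                       # cluster index per sample
    print(km.SSE, labels[:10])
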
diff --git a/KM-SVM/particle.py b/KM-SVM/particle.py
new file mode 100644
index 0000000..4ba3da4
--- /dev/null
+++ b/KM-SVM/particle.py
@@ -0,0 +1,103 @@
+"""Particle component for Particle Swarm Oprimization technique
+"""
+
+import numpy as np
+
+from kmeans import KMeans, calc_sse
+
+
+def quantization_error(centroids: np.ndarray, labels: np.ndarray, data: np.ndarray) -> float:
+ error = 0.0
+ for i, c in enumerate(centroids):
+ idx = np.where(labels == i)[0]
+ dist = np.linalg.norm(data[idx] - c, axis=1).sum()
+ dist /= len(idx)
+ error += dist
+ error /= len(centroids)
+ return error
+
+
+class Particle:
+ """[summary]
+
+ """
+
+ def __init__(self,
+ n_cluster: int,
+ data: np.ndarray,
+ use_kmeans: bool = False,
+ w: float = 0.9,
+ c1: float = 0.5,
+ c2: float = 0.3):
+ index = np.random.choice(list(range(len(data))), n_cluster)
+ self.centroids = data[index].copy()
+ if use_kmeans:
+ kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
+ kmeans.fit(data)
+ self.centroids = kmeans.centroid.copy()
+ self.best_position = self.centroids.copy()
+ self.best_score = quantization_error(self.centroids, self._predict(data), data)
+ self.best_sse = calc_sse(self.centroids, self._predict(data), data)
+ self.velocity = np.zeros_like(self.centroids)
+ self._w = w
+ self._c1 = c1
+ self._c2 = c2
+
+ def update(self, gbest_position: np.ndarray, data: np.ndarray):
+ """Update particle's velocity and centroids
+
+ Parameters
+ ----------
+ gbest_position : np.ndarray
+ data : np.ndarray
+
+ """
+ self._update_velocity(gbest_position)
+ self._update_centroids(data)
+
+ def _update_velocity(self, gbest_position: np.ndarray):
+ """Update velocity based on old value, cognitive component, and social component
+ """
+
+ v_old = self._w * self.velocity
+ cognitive_component = self._c1 * np.random.random() * (self.best_position - self.centroids)
+ social_component = self._c2 * np.random.random() * (gbest_position - self.centroids)
+ self.velocity = v_old + cognitive_component + social_component
+
+ def _update_centroids(self, data: np.ndarray):
+ self.centroids = self.centroids + self.velocity
+ new_score = quantization_error(self.centroids, self._predict(data), data)
+ sse = calc_sse(self.centroids, self._predict(data), data)
+ self.best_sse = min(sse, self.best_sse)
+ if new_score < self.best_score:
+ self.best_score = new_score
+ self.best_position = self.centroids.copy()
+
+ def _predict(self, data: np.ndarray) -> np.ndarray:
+ """Predict new data's cluster using minimum distance to centroid
+ """
+ distance = self._calc_distance(data)
+ cluster = self._assign_cluster(distance)
+ return cluster
+
+ def _calc_distance(self, data: np.ndarray) -> np.ndarray:
+ """Calculate distance between data and centroids
+ """
+ distances = []
+ for c in self.centroids:
+ distance = np.sum((data - c) * (data - c), axis=1)
+ distances.append(distance)
+
+ distances = np.array(distances)
+ distances = np.transpose(distances)
+ return distances
+
+ def _assign_cluster(self, distance: np.ndarray) -> np.ndarray:
+ """Assign cluster to data based on minimum distance to centroids
+ """
+ cluster = np.argmin(distance, axis=1)
+ return cluster
+
+
+if __name__ == "__main__":
+ pass
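
Particle.update() above applies the standard PSO update rule to the particle's position x, here a matrix with one centroid per row:

    v  <-  w * v + c1 * r1 * (pbest - x) + c2 * r2 * (gbest - x)
    x  <-  x + v

where w = 0.9 is the inertia weight, c1 = 0.5 and c2 = 0.3 are the cognitive and social coefficients, pbest is the particle's best position so far (scored by quantization error) and gbest is the swarm-wide best passed in by the optimizer. Note that in this implementation r1 and r2 are single scalars drawn once per update (np.random.random()) rather than per-dimension random vectors.
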
diff --git a/KM-SVM/pso.py b/KM-SVM/pso.py
new file mode 100644
index 0000000..81645e5
--- /dev/null
+++ b/KM-SVM/pso.py
@@ -0,0 +1,63 @@
+"""Particle Swarm Optimized Clustering
+Optimizes centroids in a K-Means style. In hybrid mode, K-Means is used to seed the first particle's centroids.
+"""
+import numpy as np
+
+from particle import Particle
+
+
+class ParticleSwarmOptimizedClustering:
+ def __init__(self,
+ n_cluster: int,
+ n_particles: int,
+ data: np.ndarray,
+ hybrid: bool = True,
+ max_iter: int = 100,
+ print_debug: int = 10):
+ self.n_cluster = n_cluster
+ self.n_particles = n_particles
+ self.data = data
+ self.max_iter = max_iter
+ self.particles = []
+ self.hybrid = hybrid
+
+ self.print_debug = print_debug
+ self.gbest_score = np.inf
+ self.gbest_centroids = None
+ self.gbest_sse = np.inf
+ self._init_particles()
+
+ def _init_particles(self):
+ for i in range(self.n_particles):
+ particle = None
+ if i == 0 and self.hybrid:
+ particle = Particle(self.n_cluster, self.data, use_kmeans=True)
+ else:
+ particle = Particle(self.n_cluster, self.data, use_kmeans=False)
+ if particle.best_score < self.gbest_score:
+ self.gbest_centroids = particle.centroids.copy()
+ self.gbest_score = particle.best_score
+ self.particles.append(particle)
+ self.gbest_sse = min(particle.best_sse, self.gbest_sse)
+
+ def run(self):
+ print('Initial global best score', self.gbest_score)
+ history = []
+ for i in range(self.max_iter):
+ for particle in self.particles:
+ particle.update(self.gbest_centroids, self.data)
+ #print(i, particle.best_score, self.gbest_score)
+ for particle in self.particles:
+ if particle.best_score < self.gbest_score:
+ self.gbest_centroids = particle.centroids.copy()
+ self.gbest_score = particle.best_score
+ history.append(self.gbest_score)
+ if i % self.print_debug == 0:
+ print('Iteration {:04d}/{:04d} current gbest score {:.18f}'.format(
+ i + 1, self.max_iter, self.gbest_score))
+ print('Finish with gbest score {:.18f}'.format(self.gbest_score))
+ return history
+
+
+if __name__ == "__main__":
+    pass
\ No newline at end of file
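
A minimal standalone sketch of how the optimizer above is driven (it mirrors pso_kmeans() in KM-SVM.py; the toy data and parameter values are illustrative only):

    import numpy as np

    from kmeans import KMeans
    from pso import ParticleSwarmOptimizedClustering

    data = np.random.rand(300, 8)
    pso = ParticleSwarmOptimizedClustering(
        n_cluster=4, n_particles=10, data=data, hybrid=True,
        max_iter=50, print_debug=10)
    history = pso.run()                     # gbest quantization error per iteration

    # label points with the optimized centroids, as pso_kmeans() does
    km = KMeans(n_cluster=4, init_pp=False)
    km.centroid = pso.gbest_centroids.copy()
    labels = km.predict(data)
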
diff --git a/KM-SVM/readme.md b/KM-SVM/readme.md
new file mode 100644
index 0000000..616f87b
--- /dev/null
+++ b/KM-SVM/readme.md
@@ -0,0 +1 @@
+Run KM-SVM.py to use.
\ No newline at end of file
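
For reference, main() in KM-SVM.py wires the whole pipeline together; after pointing HandleData() at a directory of CICIDS2017 flow CSVs (each with a ' Label' column), running the script performs roughly the following steps (a sketch of main(), with an example placeholder path):

    data = HandleData(r'path/to/CICIDS2017/flow_csvs')    # example path
    pre_label = Kmeans(data)                              # cluster flows with K-Means
    data = add_feature(process_data(data, pre_label))     # add cluster-level features
    train_X, train_Y, test_X, test_Y = Train(data)        # select features, one-hot labels, split
    SVM(train_X, train_Y, test_X, test_Y)                 # train and evaluate the SVM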