authorJinghua <[email protected]>2023-06-25 00:09:54 +0800
committerJinghua <[email protected]>2023-06-25 00:09:54 +0800
commit834734a03e00945fc5f4355ad81a76c8d607d4d0 (patch)
tree0b74ba445466e3923f919a67d3b0624b33afbe82
parent52db9070111000eea934711deec44f86f4784651 (diff)
23-0625
-rw-r--r--  KM-SVM/KM-SVM.py    326
-rw-r--r--  KM-SVM/kmeans.py    192
-rw-r--r--  KM-SVM/particle.py  103
-rw-r--r--  KM-SVM/pso.py        63
-rw-r--r--  KM-SVM/readme.md      1
5 files changed, 685 insertions, 0 deletions
diff --git a/KM-SVM/KM-SVM.py b/KM-SVM/KM-SVM.py
new file mode 100644
index 0000000..9c4cc77
--- /dev/null
+++ b/KM-SVM/KM-SVM.py
@@ -0,0 +1,326 @@
+import itertools
+import time
+from sklearn.preprocessing import normalize
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.neural_network import MLPClassifier
+from sklearn.model_selection import train_test_split
+from sklearn import svm
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pandas as pd
+import sklearn.metrics as sm
+from scipy.spatial.distance import cdist
+
+label = ['BENIGN', 'Bot', 'DDoS', 'GoldenEye', 'DoS Hulk',
+         'Slowhttp', 'SSH', 'FTP', 'PortScan', 'slowloris', 'BruteForce', 'XSS']
+
+
+def HandleData(path):
+ list_dir = os.listdir(path)
+ fd_data = []
+ for it in list_dir:
+ data = pd.read_csv(path + '/' + it)
+ fd_data.append(data)
+    # concatenate every CSV in the directory into a single DataFrame
+    data = pd.concat(fd_data)
+ data = data.dropna(axis=0, how='any')
+ data = data.replace(',,', np.nan, inplace=False)
+ data.replace("Infinity", 0, inplace=True)
+
+ data.replace('Infinity', 0.0, inplace=True)
+ data.replace('NaN', 0.0, inplace=True)
+ data = data.replace([np.inf, -np.inf], np.nan)
+ data = data.dropna(axis=0, how='any')
+ n_row, n_col = data.shape
+ print('row:', n_row, 'col:', n_col)
+
+ return data
+
+
+def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix', cmap=plt.cm.Blues):
+ """
+ - cm : 计算出的混淆矩阵的值
+ - classes : 混淆矩阵中每一行每一列对应的列
+ - normalize : True:显示百分比, False:显示个数
+ """
+ if normalize:
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ print("显示百分比:")
+ np.set_printoptions(formatter={'float': '{: 0.2f}'.format})
+ print(cm)
+ else:
+        print('Showing raw counts:')
+ print(cm)
+ plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.title(title)
+ plt.colorbar()
+ tick_marks = np.arange(len(classes))
+ plt.xticks(tick_marks, classes, rotation=90)
+ plt.yticks(tick_marks, classes)
+
+ plt.ylim(len(classes) - 0.5, -0.5)
+ fmt = '.2f' if normalize else 'd'
+ thresh = cm.max() / 2.
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+ plt.text(j, i, format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="white" if cm[i, j] > thresh else "black")
+ plt.tight_layout()
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.show()
+
+
+def Train(data, decomponent=False):
+    # handle null and infinite values (already done in HandleData)
+ '''
+ data = data.replace([np.inf, -np.inf], np.nan)
+ data = data.dropna(axis=0, how='any')
+ '''
+
+ print(data[' Label'].value_counts())
+ #select features
+ basic_feature = [' Label','Flow ID',' Source IP',' Source Port',' Destination IP',' Destination Port',' Protocol',' Timestamp']
+ tcp_ip_feature = ['FIN Flag Count', ' SYN Flag Count',
+ ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count',
+ ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count','Init_Win_bytes_forward', ' Init_Win_bytes_backward']
+ statistical_feature = [' Flow Duration', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
+ ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
+ 'Bwd Packet Length Max', ' Bwd Packet Length Min',' Bwd Packet Length Mean', ' Bwd Packet Length Std',
+ 'Fwd IAT Total',' Fwd IAT Mean', ' Fwd IAT Std',' Fwd IAT Max', ' Fwd IAT Min',
+ 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
+ ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' Min Packet Length', ' Max Packet Length',
+ 'Fwd Packets/s',' Bwd Packets/s',
+ ' Flow Packets/s' ,'Flow Bytes/s']
+ middle_feature = ['pre_label','port_flow']
+ add_feature = ['cu_ipnum','cu_dip','cu_dport','cu_target']
+ x_columns = tcp_ip_feature + statistical_feature + add_feature
+ print(x_columns)
+ x = data[x_columns].values
+ x = normalize(x, axis=0, norm='max')
+ dummies = pd.get_dummies(data[' Label'])
+ #dummies = pd.get_dummies(data['label'])
+ outcomes = dummies.columns
+ print(outcomes)
+ num_classes = len(outcomes)
+    print('[traffic] number of classes:', num_classes)
+ y = dummies.values
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=20)
+
+ return x_train, y_train, x_test, y_test
+
+def SVM(train_X, train_Y, test_X, test_Y):
+ print('[SVM] train ...')
+ train_Y = [np.where(r == 1)[0][0] for r in train_Y]
+ test_Y = [np.where(r == 1)[0][0] for r in test_Y]
+ t1 = time.time()
+ clf = svm.SVC(decision_function_shape='ovr', max_iter=900, kernel='rbf')
+ model = clf.fit(train_X, train_Y)
+ y_hat = model.predict(test_X)
+ acc = accuracy_score(test_Y, y_hat)
+ t2 = time.time()
+ print('acc:', acc)
+ print('using time:', t2 - t1, 'sec')
+ matrix = sm.confusion_matrix(test_Y, y_hat)
+ print(matrix)
+ report = classification_report(test_Y, y_hat)
+ print(report)
+ print('-' * 20)
+ '''
+ import pickle
+ with open('svm.pickle','wb') as f:
+ pickle.dump(clf,f)
+ '''
+ #plot_confusion_matrix(matrix, label, True, 'SVM Confusion matrix')
+
+def Elbow_kmeans(X):
+ K = range(1, 40)
+ meandistortions = []
+ for k in K:
+ kmeans = KMeans(n_clusters=k)
+ kmeans.fit(X)
+ meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0])
+ plt.plot(K, meandistortions, 'bx-')
+ plt.xlabel('k')
+ plt.ylabel('Average Dispersion')
+ plt.title('Selecting k with the Elbow Method')
+ plt.show()
+
+def Kmeans(data):
+ print(data[' Label'].value_counts())
+ basic_feature = [' Label','Flow ID',' Source IP',' Source Port',' Destination IP',' Destination Port',' Protocol',' Timestamp']
+ tcp_ip_feature = ['FIN Flag Count', ' SYN Flag Count',
+ ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count',
+ ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count','Init_Win_bytes_forward', ' Init_Win_bytes_backward']
+ statistical_feature = [' Flow Duration', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
+ ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
+ 'Bwd Packet Length Max', ' Bwd Packet Length Min',' Bwd Packet Length Mean', ' Bwd Packet Length Std',
+ ' Fwd IAT Mean', ' Fwd IAT Std',' Fwd IAT Max', ' Fwd IAT Min',
+ 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
+ ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' Min Packet Length', ' Max Packet Length',
+ 'Fwd Packets/s',' Bwd Packets/s',
+ ' Flow Packets/s' ,'Flow Bytes/s']
+ x_columns = tcp_ip_feature + statistical_feature
+ x = data[x_columns].values
+ x = normalize(x, axis=0, norm='max')
+    # one-hot encode the labels
+ dummies = pd.get_dummies(data[' Label'])
+ #dummies = pd.get_dummies(data['label'])
+ outcomes = dummies.columns
+ print(outcomes)
+ num_classes = len(outcomes)
+    print('[traffic] number of classes:', num_classes)
+ y = dummies.values
+ Y = y[:,0]
+    # clustering
+ KMEANS = KMeans(n_clusters = 30, max_iter = 300,n_init = 10,random_state = 0)
+ kmeans = KMEANS.fit(x)
+ Z = kmeans.labels_
+ inertia = KMEANS.inertia_
+ #Kmeans Results
+ kmeansR = pd.crosstab(Y,Z)
+ maxVal = kmeansR.idxmax()
+ print(kmeansR)
+ print(len(Z))
+ #Z = pso_kmeans(x,Y)
+ return Z
+
+def pso_kmeans(X,Y):
+ from pso import ParticleSwarmOptimizedClustering
+ from particle import quantization_error, calc_sse
+ from kmeans import KMeans
+ from sklearn.metrics import silhouette_score
+ from scipy.spatial.distance import cdist
+ distortions = []
+ K = [30]
+ for k in K:
+ pso_rep = ParticleSwarmOptimizedClustering(
+ n_cluster=k, n_particles=10, data=X, hybrid=True, max_iter=50, print_debug=2000)
+ pso_rep.run()
+ pso_kmeans = KMeans(n_cluster=k, init_pp=False, seed=2022)
+ pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
+ predicted_pso_rep = pso_kmeans.predict(X)
+
+ silhouette = silhouette_score(X, predicted_pso_rep)
+ sse = calc_sse(centroids=pso_rep.gbest_centroids, data=X, labels=predicted_pso_rep)
+ quantization = pso_rep.gbest_score
+ Z = pso_kmeans.predict(X)
+ #
+ distortions.append(sum(np.min(cdist(X, pso_kmeans.centroid, 'euclidean'), axis=1)) / X.shape[0])
+ print(pso_kmeans.centroid)
+ print(type(Z))
+ kmeansR = pd.crosstab(Y,Z)
+ maxVal = kmeansR.idxmax()
+ print(kmeansR,"\n\n")
+    # save the trained clustering model
+ import pickle
+ with open('train_model_sta.pkl', 'wb') as f:
+ pickle.dump(pso_kmeans, f)
+ '''
+    # load a previously trained model
+ import pickle
+ pso_kmeans = pickle.load(file=open('train_model_all.pkl', 'rb'))
+ '''
+ Z = pso_kmeans.predict(X)
+ kmeansR = pd.crosstab(Y,Z)
+ maxVal = kmeansR.idxmax()
+ print(kmeansR,"\n\n")
+ return Z
+
+result_dict = {}
+def formula(x):
+ if x in result_dict:
+ return result_dict[x]
+ else:
+ return 0
+
+def process_data(data,pre_label):
+ data['pre_label'] = pre_label
+ x_columns = ['pre_label',' Destination Port']
+ label_num = data['pre_label'].unique()
+    for i in label_num:
+ tmp_data = data.loc[data['pre_label'] == i]
+ flow_num = len(tmp_data)
+ port_num = len(tmp_data[' Destination Port'].unique())
+ tmp_result = port_num
+ result_dict[i] = tmp_result
+ print(result_dict)
+ data['port_flow'] = ''
+ data['port_flow'] = data.apply(lambda row: formula(row['pre_label']), axis=1)
+ return data
+
+def add_feature(data):
+    '''
+    Add cluster-level features
+    '''
+    # number of targets (destination IP-port pairs) contacted by the cluster this scanning node (IP) belongs to
+ cu_target = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_target:
+ cu_target[row['pre_label']] = set()
+ cu_target[row['pre_label']].add(str(row[' Destination IP'])+'-'+str(row[' Destination Port']))
+ data['cu_target'] = ''
+ def formula_cu_target(x):
+ if x in cu_target:
+ return len(cu_target[x])
+ else:
+ return 0
+ data['cu_target'] = data.apply(lambda row: formula_cu_target(row['pre_label']), axis=1)
+    # number of nodes (source IPs) in the cluster this scanning node belongs to
+ cu_ipnum = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_ipnum:
+ cu_ipnum[row['pre_label']] = set()
+ cu_ipnum[row['pre_label']].add(row[' Source IP'])
+ data['cu_ipnum'] = ''
+ def formula_cu_ipnum(x):
+ if x in cu_ipnum:
+ return len(cu_ipnum[x])
+ else:
+ return 0
+ data['cu_ipnum'] = data.apply(lambda row: formula_cu_ipnum(row['pre_label']), axis=1)
+
+    # number of destination IPs contacted by the cluster this scanning node (IP) belongs to
+ cu_dip = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_dip:
+ cu_dip[row['pre_label']] = set()
+ cu_dip[row['pre_label']].add(str(row[' Destination IP']))
+ data['cu_dip'] = ''
+ def formula_cu_dip(x):
+ if x in cu_dip:
+ return len(cu_dip[x])
+ else:
+ return 0
+ data['cu_dip'] = data.apply(lambda row: formula_cu_dip(row['pre_label']), axis=1)
+    # number of destination ports contacted by the cluster this scanning node (IP) belongs to
+ cu_dport = {}
+ for index,row in data.iterrows():
+ if row['pre_label'] not in cu_dport:
+ cu_dport[row['pre_label']] = set()
+ cu_dport[row['pre_label']].add(str(row[' Destination Port']))
+ data['cu_dport'] = ''
+ def formula_cu_dport(x):
+ if x in cu_dport:
+ return len(cu_dport[x])
+ else:
+ return 0
+ data['cu_dport'] = data.apply(lambda row: formula_cu_dport(row['pre_label']), axis=1)
+
+ return data
+
+def main():
+    data = HandleData(r'H:\dataset\CICIDS2017\pcap\CIC\cic_trafficlabeling\input\Test')
+ pre_label = Kmeans(data)
+ result_data = process_data(data,pre_label)
+ result_data = add_feature(result_data)
+ train_X, train_Y, test_X, test_Y= Train(result_data)
+ SVM(train_X, train_Y, test_X, test_Y)
+ # MLP(train_X, train_Y, test_X, test_Y)
+
+if __name__ == '__main__':
+ main()
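
The add_feature() step above derives four cluster-level counts (cu_target, cu_ipnum, cu_dip, cu_dport) by looping over rows and accumulating sets per cluster. A minimal sketch of an equivalent pandas groupby formulation, assuming the same CICIDS2017 column names and the pre_label cluster ids produced by Kmeans():

    # sketch only, not part of this commit; column names as in the CSVs above
    import pandas as pd

    def add_cluster_features(data: pd.DataFrame) -> pd.DataFrame:
        grouped = data.groupby('pre_label')
        # distinct source IPs per cluster
        data['cu_ipnum'] = grouped[' Source IP'].transform('nunique')
        # distinct destination IPs / destination ports per cluster
        data['cu_dip'] = grouped[' Destination IP'].transform('nunique')
        data['cu_dport'] = grouped[' Destination Port'].transform('nunique')
        # distinct (destination IP, destination port) targets per cluster
        target = data[' Destination IP'].astype(str) + '-' + data[' Destination Port'].astype(str)
        data['cu_target'] = target.groupby(data['pre_label']).transform('nunique')
        return data
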
diff --git a/KM-SVM/kmeans.py b/KM-SVM/kmeans.py
new file mode 100644
index 0000000..373d36e
--- /dev/null
+++ b/KM-SVM/kmeans.py
@@ -0,0 +1,192 @@
+"""K-Means module, contain K-Means implementation inside KMeans class
+"""
+
+import numpy
+
+
+def calc_sse(centroids: numpy.ndarray, labels: numpy.ndarray, data: numpy.ndarray):
+ distances = 0
+ for i, c in enumerate(centroids):
+ idx = numpy.where(labels == i)
+ dist = numpy.sum((data[idx] - c)**2)
+ distances += dist
+ return distances
+
+
+class KMeans:
+ """K-Means clustering algorithm
+
+ Attributes
+ ----------
+ n_cluster : int
+        Number of clusters applied to the data
+    init_pp : bool
+        Initialization method: whether to use K-Means++ or not
+        (the default is True, which uses K-Means++)
+ max_iter : int
+ Max iteration to update centroid (the default is 300)
+ tolerance : float
+ Minimum centroid update difference value to stop iteration (the default is 1e-4)
+ seed : int
+ Seed number to use in random generator (the default is None)
+ centroid : list
+ List of centroid values
+ SSE : float
+ Sum squared error score
+ """
+
+ def __init__(
+ self,
+ n_cluster: int,
+ init_pp: bool = True,
+ max_iter: int = 300,
+ tolerance: float = 1e-4,
+ seed: int = None):
+ """Instantiate K-Means object
+
+ Parameters
+ ----------
+ n_cluster : int
+            Number of clusters applied to the data
+        init_pp : bool, optional
+            Initialization method: whether to use K-Means++ or not
+            (the default is True, which uses K-Means++)
+ max_iter : int, optional
+            Max iteration to update centroid (the default is 300)
+ tolerance : float, optional
+ Minimum centroid update difference value to stop iteration (the default is 1e-4)
+ seed : int, optional
+ Seed number to use in random generator (the default is None)
+ """
+
+ self.n_cluster = n_cluster
+ self.max_iter = max_iter
+ self.tolerance = tolerance
+ self.init_pp = init_pp
+ self.seed = seed
+ self.centroid = None
+ self.SSE = None
+
+ def fit(self, data: numpy.ndarray):
+ """Fit K-Means algorithm to given data
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data matrix to be fitted
+
+ """
+ self.centroid = self._init_centroid(data)
+ for _ in range(self.max_iter):
+ distance = self._calc_distance(data)
+ cluster = self._assign_cluster(distance)
+ new_centroid = self._update_centroid(data, cluster)
+ diff = numpy.abs(self.centroid - new_centroid).mean()
+ self.centroid = new_centroid
+
+ if diff <= self.tolerance:
+ break
+
+ self.SSE = calc_sse(self.centroid, cluster, data)
+
+ def predict(self, data: numpy.ndarray):
+ """Predict new data's cluster using minimum distance to centroid
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ New data to be predicted
+
+ """
+ distance = self._calc_distance(data)
+ # print(distance.shape)
+ cluster = self._assign_cluster(distance)
+ # print(cluster.shape)
+ return cluster
+
+ def _init_centroid(self, data: numpy.ndarray):
+ """Initialize centroid using random method or KMeans++
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data matrix to sample from
+
+ """
+ if self.init_pp:
+ numpy.random.seed(self.seed)
+ centroid = [int(numpy.random.uniform()*len(data))]
+ for _ in range(1, self.n_cluster):
+ dist = []
+ dist = [min([numpy.inner(data[c]-x, data[c]-x) for c in centroid])
+ for i, x in enumerate(data)]
+ dist = numpy.array(dist)
+ dist = dist / dist.sum()
+ cumdist = numpy.cumsum(dist)
+
+ prob = numpy.random.rand()
+ for i, c in enumerate(cumdist):
+ if prob > c and i not in centroid:
+ centroid.append(i)
+ break
+ centroid = numpy.array([data[c] for c in centroid])
+ else:
+ numpy.random.seed(self.seed)
+ idx = numpy.random.choice(range(len(data)), size=(self.n_cluster))
+ centroid = data[idx]
+ # print(centroid)
+ return centroid
+
+ def _calc_distance(self, data: numpy.ndarray):
+ """Calculate distance between data and centroids
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data which distance to be calculated
+
+ """
+ distances = []
+ for c in self.centroid:
+ distance = numpy.sum((data - c) * (data - c), axis=1)
+ distances.append(distance)
+
+ distances = numpy.array(distances)
+ distances = distances.T
+ return distances
+
+ def _assign_cluster(self, distance: numpy.ndarray):
+ """Assign cluster to data based on minimum distance to centroids
+
+ Parameters
+ ----------
+ distance : numpy.ndarray
+ Distance from each data to each centroid
+
+ """
+ cluster = numpy.argmin(distance, axis=1)
+ return cluster
+
+ def _update_centroid(self, data: numpy.ndarray, cluster: numpy.ndarray):
+ """Update centroid from means of each cluster's data
+
+ Parameters
+ ----------
+ data : numpy.ndarray
+ Data matrix to get mean from
+ cluster : numpy.ndarray
+ Cluster label for each data
+
+ """
+ centroids = []
+ for i in range(self.n_cluster):
+ idx = numpy.where(cluster == i)
+ centroid = numpy.mean(data[idx], axis=0)
+ centroids.append(centroid)
+ centroids = numpy.array(centroids)
+ return centroids
+
+
+if __name__ == "__main__":
+
+ pass
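
The KMeans class above exposes a small fit/predict interface with optional K-Means++ initialization. A minimal usage sketch on toy data (the array shape and parameter values are illustrative only):

    import numpy

    from kmeans import KMeans

    data = numpy.random.rand(200, 5)                # 200 samples, 5 features
    km = KMeans(n_cluster=3, init_pp=True, seed=0)
    km.fit(data)                                    # iterate until centroids stop moving
    labels = km.predict(data)                       # cluster index per sample
    print(km.SSE, labels[:10])
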
diff --git a/KM-SVM/particle.py b/KM-SVM/particle.py
new file mode 100644
index 0000000..4ba3da4
--- /dev/null
+++ b/KM-SVM/particle.py
@@ -0,0 +1,103 @@
+"""Particle component for Particle Swarm Oprimization technique
+"""
+
+import numpy as np
+
+from kmeans import KMeans, calc_sse
+
+
+def quantization_error(centroids: np.ndarray, labels: np.ndarray, data: np.ndarray) -> float:
+ error = 0.0
+ for i, c in enumerate(centroids):
+ idx = np.where(labels == i)[0]
+ dist = np.linalg.norm(data[idx] - c, axis=1).sum()
+ dist /= len(idx)
+ error += dist
+ error /= len(centroids)
+ return error
+
+
+class Particle:
+ """[summary]
+
+ """
+
+ def __init__(self,
+ n_cluster: int,
+ data: np.ndarray,
+ use_kmeans: bool = False,
+ w: float = 0.9,
+ c1: float = 0.5,
+ c2: float = 0.3):
+ index = np.random.choice(list(range(len(data))), n_cluster)
+ self.centroids = data[index].copy()
+ if use_kmeans:
+ kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
+ kmeans.fit(data)
+ self.centroids = kmeans.centroid.copy()
+ self.best_position = self.centroids.copy()
+ self.best_score = quantization_error(self.centroids, self._predict(data), data)
+ self.best_sse = calc_sse(self.centroids, self._predict(data), data)
+ self.velocity = np.zeros_like(self.centroids)
+ self._w = w
+ self._c1 = c1
+ self._c2 = c2
+
+ def update(self, gbest_position: np.ndarray, data: np.ndarray):
+ """Update particle's velocity and centroids
+
+ Parameters
+ ----------
+ gbest_position : np.ndarray
+ data : np.ndarray
+
+ """
+ self._update_velocity(gbest_position)
+ self._update_centroids(data)
+
+ def _update_velocity(self, gbest_position: np.ndarray):
+ """Update velocity based on old value, cognitive component, and social component
+ """
+
+ v_old = self._w * self.velocity
+ cognitive_component = self._c1 * np.random.random() * (self.best_position - self.centroids)
+ social_component = self._c2 * np.random.random() * (gbest_position - self.centroids)
+ self.velocity = v_old + cognitive_component + social_component
+
+ def _update_centroids(self, data: np.ndarray):
+ self.centroids = self.centroids + self.velocity
+ new_score = quantization_error(self.centroids, self._predict(data), data)
+ sse = calc_sse(self.centroids, self._predict(data), data)
+ self.best_sse = min(sse, self.best_sse)
+ if new_score < self.best_score:
+ self.best_score = new_score
+ self.best_position = self.centroids.copy()
+
+ def _predict(self, data: np.ndarray) -> np.ndarray:
+ """Predict new data's cluster using minimum distance to centroid
+ """
+ distance = self._calc_distance(data)
+ cluster = self._assign_cluster(distance)
+ return cluster
+
+ def _calc_distance(self, data: np.ndarray) -> np.ndarray:
+ """Calculate distance between data and centroids
+ """
+ distances = []
+ for c in self.centroids:
+ distance = np.sum((data - c) * (data - c), axis=1)
+ distances.append(distance)
+
+ distances = np.array(distances)
+ distances = np.transpose(distances)
+ return distances
+
+ def _assign_cluster(self, distance: np.ndarray) -> np.ndarray:
+ """Assign cluster to data based on minimum distance to centroids
+ """
+ cluster = np.argmin(distance, axis=1)
+ return cluster
+
+
+if __name__ == "__main__":
+ pass
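
Particle.update() above applies the standard PSO update rule to the particle's position x, here a matrix with one centroid per row:

    v  <-  w * v + c1 * r1 * (pbest - x) + c2 * r2 * (gbest - x)
    x  <-  x + v

where w = 0.9 is the inertia weight, c1 = 0.5 and c2 = 0.3 are the cognitive and social coefficients, pbest is the particle's best position so far (scored by quantization error) and gbest is the swarm-wide best passed in by the optimizer. Note that in this implementation r1 and r2 are single scalars drawn once per update (np.random.random()) rather than per-dimension random vectors.
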
diff --git a/KM-SVM/pso.py b/KM-SVM/pso.py
new file mode 100644
index 0000000..81645e5
--- /dev/null
+++ b/KM-SVM/pso.py
@@ -0,0 +1,63 @@
+"""Particle Swarm Optimized Clustering
+Optimizes centroids in a K-Means style. In hybrid mode, K-Means is used to seed the first particle's centroids.
+"""
+import numpy as np
+
+from particle import Particle
+
+
+class ParticleSwarmOptimizedClustering:
+ def __init__(self,
+ n_cluster: int,
+ n_particles: int,
+ data: np.ndarray,
+ hybrid: bool = True,
+ max_iter: int = 100,
+ print_debug: int = 10):
+ self.n_cluster = n_cluster
+ self.n_particles = n_particles
+ self.data = data
+ self.max_iter = max_iter
+ self.particles = []
+ self.hybrid = hybrid
+
+ self.print_debug = print_debug
+ self.gbest_score = np.inf
+ self.gbest_centroids = None
+ self.gbest_sse = np.inf
+ self._init_particles()
+
+ def _init_particles(self):
+ for i in range(self.n_particles):
+ particle = None
+ if i == 0 and self.hybrid:
+ particle = Particle(self.n_cluster, self.data, use_kmeans=True)
+ else:
+ particle = Particle(self.n_cluster, self.data, use_kmeans=False)
+ if particle.best_score < self.gbest_score:
+ self.gbest_centroids = particle.centroids.copy()
+ self.gbest_score = particle.best_score
+ self.particles.append(particle)
+ self.gbest_sse = min(particle.best_sse, self.gbest_sse)
+
+ def run(self):
+ print('Initial global best score', self.gbest_score)
+ history = []
+ for i in range(self.max_iter):
+ for particle in self.particles:
+ particle.update(self.gbest_centroids, self.data)
+ #print(i, particle.best_score, self.gbest_score)
+ for particle in self.particles:
+ if particle.best_score < self.gbest_score:
+ self.gbest_centroids = particle.centroids.copy()
+ self.gbest_score = particle.best_score
+ history.append(self.gbest_score)
+ if i % self.print_debug == 0:
+ print('Iteration {:04d}/{:04d} current gbest score {:.18f}'.format(
+ i + 1, self.max_iter, self.gbest_score))
+ print('Finish with gbest score {:.18f}'.format(self.gbest_score))
+ return history
+
+
+if __name__ == "__main__":
+    pass
\ No newline at end of file
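
A minimal standalone sketch of how the optimizer above is driven (it mirrors pso_kmeans() in KM-SVM.py; the toy data and parameter values are illustrative only):

    import numpy as np

    from kmeans import KMeans
    from pso import ParticleSwarmOptimizedClustering

    data = np.random.rand(300, 8)
    pso = ParticleSwarmOptimizedClustering(
        n_cluster=4, n_particles=10, data=data, hybrid=True,
        max_iter=50, print_debug=10)
    history = pso.run()                     # gbest quantization error per iteration

    # label points with the optimized centroids, as pso_kmeans() does
    km = KMeans(n_cluster=4, init_pp=False)
    km.centroid = pso.gbest_centroids.copy()
    labels = km.predict(data)
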
diff --git a/KM-SVM/readme.md b/KM-SVM/readme.md
new file mode 100644
index 0000000..616f87b
--- /dev/null
+++ b/KM-SVM/readme.md
@@ -0,0 +1 @@
+Run KM-SVM.py to use.
\ No newline at end of file
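
For reference, main() in KM-SVM.py wires the whole pipeline together; after pointing HandleData() at a directory of CICIDS2017 flow CSVs (each with a ' Label' column), running the script performs roughly the following steps (a sketch of main(), with an example placeholder path):

    data = HandleData(r'path/to/CICIDS2017/flow_csvs')    # example path
    pre_label = Kmeans(data)                              # cluster flows with K-Means
    data = add_feature(process_data(data, pre_label))     # add cluster-level features
    train_X, train_Y, test_X, test_Y = Train(data)        # select features, one-hot labels, split
    SVM(train_X, train_Y, test_X, test_Y)                 # train and evaluate the SVM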