“first” (HEAD, main)

author: liliqing <[email protected]> 2023-06-26 00:51:47 +0800
committer: liliqing <[email protected]> 2023-06-26 00:51:47 +0800
commit: 623ad2089d6cfc06b0324ce00c4cb5cf4f0db6a7 (patch)
tree: fa4d85096a6c973698562e20a37fbfd2c1b3163e /p_columbus
parent: 50abdae2c22a190fef9afcb0c66791b514a2709f (diff)
6 files changed, 167 insertions, 0 deletions
diff --git a/p_columbus/__pycache__/applib.cpython-37.pyc b/p_columbus/__pycache__/applib.cpython-37.pyc
new file mode 100644
index 0000000..8644c3c
--- /dev/null
+++ b/p_columbus/__pycache__/applib.cpython-37.pyc
diff --git a/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc b/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc
new file mode 100644
index 0000000..201f938
--- /dev/null
+++ b/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc
diff --git a/p_columbus/__pycache__/p_columbus.cpython-37.pyc b/p_columbus/__pycache__/p_columbus.cpython-37.pyc
new file mode 100644
index 0000000..c69bd15
--- /dev/null
+++ b/p_columbus/__pycache__/p_columbus.cpython-37.pyc
diff --git a/p_columbus/applib.py b/p_columbus/applib.py
new file mode 100644
index 0000000..b8e9bac
--- /dev/null
+++ b/p_columbus/applib.py
@@ -0,0 +1 @@
+applabels = ['ansible', 'apache2', 'arpwatch', 'bacula-client', 'bind9', 'cobertura', 'cockroachdb', 'containerd', 'coredns', 'crda', 'curl', 'dcraw', 'deno', 'docker', 'docker.io', 'dovecot-core', 'dstat', 'emacs-nox', 'etcd', 'finger', 'flutter', 'freeradius', 'go', 'haproxy', 'iotop', 'iptraf-ng', 'kafka', 'ksh', 'kubernetes', 'lftp', 'lighttpd', 'logwatch', 'lshw', 'lynx', 'marisa', 'memcached', 'mgetty-voice', 'mtr', 'mutt', 'nginx', 'nmap', 'nodejs', 'openjdk-8-jdk', 'orientdb', 'pax', 'postgresql', 'powertop', 'prometheus', 'puppet', 'python3', 'qemu-kvm', 'quagga', 'quota', 'rabbitmq-server', 'rdist', 'redis', 'rocketmq', 'sane', 'smartmontools', 'squid', 'subversion', 'supermin', 'sysstat', 'thrift', 'tomcat', 'traceroute', 'tree', 'tshark', 'units', 'unzip', 'vim', 'vsftpd', 'watchdog', 'wget', 'wordpress', 'zsh', 'tensorflow', 'anaconda', 'apache', 'zookeeper']  # software-name library
+\ No newline at end of file
diff --git a/p_columbus/filetreeinfo.py b/p_columbus/filetreeinfo.py
new file mode 100644
index 0000000..3d350ec
--- /dev/null
+++ b/p_columbus/filetreeinfo.py
@@ -0,0 +1,58 @@
+import os
+from datetime import datetime
+import time
+
+class FileInfo:
+    """Attributes parsed from one line of `ls -lcd --full-time` output."""
+    # Class-level defaults; __init__ overwrites all of them on every instance.
+    filetype = authority = user = group = path = ''
+    nohardlink = size = ctime = 0
+
+    def __init__(self, file):
+        """Parse `file`, one complete listing line, into the attributes."""
+        self.filetype = self.getFileType(file)
+        self.authority = self.getAuthority(file)
+        self.nohardlink = self.getNohardlink(file)
+        self.user = self.getUser(file)
+        self.group = self.getGroup(file)
+        self.size = self.getSize(file)
+        self.ctime = self.getCtime(file)
+        self.path = self.getPath(file)
+
+    def getFileType(self, file):
+        return file[0]  # type character that starts the mode string
+
+    def getAuthority(self, file):
+        return file[1:10]  # the nine characters following the type character
+
+    def getNohardlink(self, file):
+        return int(file.split()[1])  # split(): ls pads columns with space runs
+
+    def getUser(self, file):
+        return file.split()[2]  # split(' ') would yield '' between padded columns
+
+    def getGroup(self, file):
+        return file.split()[3]
+
+    def getSize(self, file):
+        return int(file.split()[4].replace(',', ''))  # commas are dropped before int()
+
+    def getCtime(self, file):
+        """Return the timestamp printed before the path as local-time epoch ms."""
+        ads = file.index('/')
+        # [ads-36, ads-13) is expected to hold 'YYYY-MM-DD HH:MM:SS.mmm'.
+        ctime_str = file[ads - 36:ads - 13].strip()
+        datetime_obj = datetime.strptime(ctime_str, "%Y-%m-%d %H:%M:%S.%f")
+        return int(time.mktime(datetime_obj.timetuple()) * 1000.0 + datetime_obj.microsecond / 1000.0)
+
+    def getPath(self, file):
+        """Return the path without the line ending or a symlink's ' -> target'."""
+        line = file.rstrip('\r\n')
+        start = line.find('/')
+        stop = line.find(' -> ', start) if line.startswith('l') else -1
+        return line[start:] if stop == -1 else line[start:stop]
+
+    def getFiletree(self):
+        """Run the find | ls listing over / and return its output lines."""
+        with os.popen('find  / | xargs ls -lcd --full-time ') as pipe:
+            return pipe.readlines()
+\ No newline at end of file
diff --git a/p_columbus/p_columbus.py b/p_columbus/p_columbus.py
new file mode 100644
index 0000000..597764c
--- /dev/null
+++ b/p_columbus/p_columbus.py
@@ -0,0 +1,108 @@
+from sklearn.cluster import DBSCAN
+
+import category_encoders as ce
+import pandas as pd
+
+from p_columbus.filetreeinfo import FileInfo
+from columbus.columbus import columbus
+from p_columbus.applib import applabels
+
+
+# Longest common substring (length only).
+def find_lcsubstr(s1, s2):
+    """Return the length of the longest common substring of s1 and s2."""
+    # Rolling rows: only the previous row of the DP table is ever read.
+    prev = [0] * (len(s2) + 1)
+    best = 0
+    for ch1 in s1:
+        cur = [0] * (len(s2) + 1)
+        for j, ch2 in enumerate(s2):
+            if ch1 == ch2:
+                cur[j + 1] = prev[j] + 1
+        best = max(best, max(cur))
+        prev = cur
+    return best
+
+
+def get_clusters(filelist):
+    """Cluster file paths with DBSCAN and return the paths of each cluster.
+
+    Features per file: ctime, one-hot encoded user, and the character-code
+    sum of the last three path components.
+
+    :param filelist: listing lines, each parseable by FileInfo
+    :return: list of clusters (lists of paths); noise points are left out
+    """
+    # Paths under these prefixes are excluded from clustering.
+    skipped = ('/sys/', '/proc/', '/var/lib/apt/lists/')
+    # Parse every line exactly once and keep the parsed record.
+    cs = [info for info in map(FileInfo, filelist)
+          if not info.path.startswith(skipped)]
+    if not cs:
+        # No samples: there is nothing to encode or cluster, and max() over
+        # an empty label array would raise.
+        return []
+
+    # Feature extraction.
+    feature_dic = {'ctime': [], 'user': [], 'filename': []}
+    for item in cs:
+        feature_dic['user'].append(item.user)
+        feature_dic['ctime'].append(item.ctime)
+        # Last three path components, joined without separators.
+        filename = ''.join(item.path.split('/')[-3:])
+        feature_dic['filename'].append(sum(ord(chara) for chara in filename))
+
+    # Density clustering; DBSCAN's default metric is Euclidean distance.
+    data = pd.DataFrame.from_dict(feature_dic)
+    vec = ce.OneHotEncoder(cols=['user'], use_cat_names=True).fit_transform(data)
+    labels = DBSCAN(eps=2000, min_samples=2).fit(vec).labels_
+
+    # Bucket paths by cluster label; label -1 marks noise and is skipped.
+    clusters = [[] for _ in range(max(labels) + 1)]
+    for item, label in zip(cs, labels):
+        if label != -1:
+            clusters[label].append(item.path)
+    return clusters
+
+
+def p_columbus(filelist, k=10, freq_threshold=2):
+    """Identify software from a file-tree listing.
+
+    Paths are clustered with get_clusters(), each cluster is tagged by
+    columbus(), and the first k tags of every cluster are compared with the
+    names in applabels. A name matches when its longest common substring
+    with a tag covers more than 80% of the name.
+
+    :param filelist: file-tree listing lines
+    :param k: maximum number of tags kept per cluster
+    :param freq_threshold: lower frequency bound, forwarded to columbus()
+    :return: list of matched software names, without duplicates
+    """
+    # Step 1: cluster the paths.
+    clusters = get_clusters(filelist)
+
+    # Step 2: collect at most k tags per cluster.
+    candidates = []
+    for cluster in clusters:
+        tag_dict = columbus(cluster, freq_threshold=freq_threshold)
+        # Use the tag text directly. Formatting 'tag:freq' and cutting at the
+        # first ':' truncated tags that contain a colon, and the previous
+        # slice [0:k + 1] kept k + 1 tags instead of k.
+        # NOTE(review): presumes columbus() lists its best tags first;
+        # confirm against the columbus package.
+        candidates.extend([str(tag) for tag in tag_dict][:k])
+
+    # Step 3: match the tags against the software-name library.
+    min_cover = 0.8  # required share of a label found inside a tag
+    applist = set()
+    for pre_label in candidates:
+        for label in applabels:
+            # Fraction of the label covered by the longest common substring.
+            score = find_lcsubstr(label, pre_label) / len(label)
+            if score > min_cover:
+                applist.add(label)
+    # Order of the result is unspecified (it comes from a set).
+    return list(applist)
+
+
+
author	liliqing <[email protected]>	2023-06-26 00:51:47 +0800
committer	liliqing <[email protected]>	2023-06-26 00:51:47 +0800
commit	623ad2089d6cfc06b0324ce00c4cb5cf4f0db6a7 (patch)
tree	fa4d85096a6c973698562e20a37fbfd2c1b3163e /p_columbus
parent	50abdae2c22a190fef9afcb0c66791b514a2709f (diff)