summary refs log tree commit diff
path: root/p_columbus
diff options
context:
space:
mode:
author liliqing <[email protected]> 2023-06-26 00:51:47 +0800
committer liliqing <[email protected]> 2023-06-26 00:51:47 +0800
commit 623ad2089d6cfc06b0324ce00c4cb5cf4f0db6a7 (patch)
tree fa4d85096a6c973698562e20a37fbfd2c1b3163e /p_columbus
parent 50abdae2c22a190fef9afcb0c66791b514a2709f (diff)
“first” HEAD main
Diffstat (limited to 'p_columbus')
-rw-r--r-- p_columbus/__pycache__/applib.cpython-37.pyc bin 0 -> 1031 bytes
-rw-r--r-- p_columbus/__pycache__/filetreeinfo.cpython-37.pyc bin 0 -> 2466 bytes
-rw-r--r-- p_columbus/__pycache__/p_columbus.cpython-37.pyc bin 0 -> 3586 bytes
-rw-r--r-- p_columbus/applib.py 1
-rw-r--r-- p_columbus/filetreeinfo.py 58
-rw-r--r-- p_columbus/p_columbus.py 108
6 files changed, 167 insertions, 0 deletions
diff --git a/p_columbus/__pycache__/applib.cpython-37.pyc b/p_columbus/__pycache__/applib.cpython-37.pyc
new file mode 100644
index 0000000..8644c3c
--- /dev/null
+++ b/p_columbus/__pycache__/applib.cpython-37.pyc
Binary files differ
diff --git a/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc b/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc
new file mode 100644
index 0000000..201f938
--- /dev/null
+++ b/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc
Binary files differ
diff --git a/p_columbus/__pycache__/p_columbus.cpython-37.pyc b/p_columbus/__pycache__/p_columbus.cpython-37.pyc
new file mode 100644
index 0000000..c69bd15
--- /dev/null
+++ b/p_columbus/__pycache__/p_columbus.cpython-37.pyc
Binary files differ
diff --git a/p_columbus/applib.py b/p_columbus/applib.py
new file mode 100644
index 0000000..b8e9bac
--- /dev/null
+++ b/p_columbus/applib.py
@@ -0,0 +1 @@
+# Known application/package names that extracted tags are matched against.
+applabels = [
+    'ansible', 'apache2', 'arpwatch', 'bacula-client', 'bind9',
+    'cobertura', 'cockroachdb', 'containerd', 'coredns', 'crda',
+    'curl', 'dcraw', 'deno', 'docker', 'docker.io',
+    'dovecot-core', 'dstat', 'emacs-nox', 'etcd', 'finger',
+    'flutter', 'freeradius', 'go', 'haproxy', 'iotop',
+    'iptraf-ng', 'kafka', 'ksh', 'kubernetes', 'lftp',
+    'lighttpd', 'logwatch', 'lshw', 'lynx', 'marisa',
+    'memcached', 'mgetty-voice', 'mtr', 'mutt', 'nginx',
+    'nmap', 'nodejs', 'openjdk-8-jdk', 'orientdb', 'pax',
+    'postgresql', 'powertop', 'prometheus', 'puppet', 'python3',
+    'qemu-kvm', 'quagga', 'quota', 'rabbitmq-server', 'rdist',
+    'redis', 'rocketmq', 'sane', 'smartmontools', 'squid',
+    'subversion', 'supermin', 'sysstat', 'thrift', 'tomcat',
+    'traceroute', 'tree', 'tshark', 'units', 'unzip',
+    'vim', 'vsftpd', 'watchdog', 'wget', 'wordpress',
+    'zsh', 'tensorflow', 'anaconda', 'apache', 'zookeeper',
+]
\ No newline at end of file
diff --git a/p_columbus/filetreeinfo.py b/p_columbus/filetreeinfo.py
new file mode 100644
index 0000000..3d350ec
--- /dev/null
+++ b/p_columbus/filetreeinfo.py
@@ -0,0 +1,58 @@
+import os
+from datetime import datetime
+import time
+
+class FileInfo:
+    """Attributes parsed from one line of ``ls -lcd --full-time`` output.
+
+    Attributes:
+        filetype: first character of the mode string.
+        authority: the nine permission characters following the type.
+        nohardlink: value of the link-count column.
+        user: value of the owner column.
+        group: value of the group column.
+        size: value of the size column as an int (commas removed).
+        ctime: timestamp of the line in milliseconds since the epoch.
+        path: path of the entry, without a symlink target.
+    """
+
+    # Class-level defaults; __init__ overwrites all of them per instance.
+    filetype = ''
+    authority = ''
+    nohardlink = 0
+    user = ''
+    group = ''
+    size = 0
+    ctime = 0
+    path = ''
+
+    def __init__(self, file):
+        """Parse *file*, which is one complete attribute line of the listing."""
+        self.filetype = self.getFileType(file)
+        self.authority = self.getAuthority(file)
+        self.nohardlink = self.getNohardlink(file)
+        self.user = self.getUser(file)
+        self.group = self.getGroup(file)
+        self.size = self.getSize(file)
+        self.ctime = self.getCtime(file)
+        self.path = self.getPath(file)
+
+    def getFileType(self, file):
+        """Return the first character of the line (the file type flag)."""
+        return file[0]
+
+    def getAuthority(self, file):
+        """Return characters 1..9 of the line (the permission string)."""
+        return file[1:10]
+
+    # The column getters split on runs of whitespace: ls pads its columns
+    # for alignment, so splitting on a single space would yield empty fields.
+    def getNohardlink(self, file):
+        """Return the second column (hard-link count) as an int."""
+        return int(file.split()[1])
+
+    def getUser(self, file):
+        """Return the third column (owner)."""
+        return file.split()[2]
+
+    def getGroup(self, file):
+        """Return the fourth column (group)."""
+        return file.split()[3]
+
+    def getSize(self, file):
+        """Return the fifth column (size) as an int, ignoring commas."""
+        return int(file.split()[4].replace(',', ''))
+
+    def getCtime(self, file):
+        """Return the line's timestamp in milliseconds since the epoch.
+
+        Assumes the 36 characters before the first '/' have the form
+        ``YYYY-MM-DD HH:MM:SS.nnnnnnnnn +ZZZZ `` as printed by
+        ``--full-time``.  Only the first 23 of them (down to milliseconds)
+        are parsed; the UTC offset is not read and the value is converted
+        with ``time.mktime``, i.e. interpreted as local time.
+
+        :raises ValueError: if the line has no '/' or the timestamp
+            cannot be parsed.
+        """
+        ads = file.index('/')
+        if ads < 36:
+            # A negative slice start would silently wrap around the string.
+            raise ValueError('no timestamp before path in line: %r' % file)
+        ctime_str = file[ads - 36:ads - 13].strip()
+        datetime_obj = datetime.strptime(ctime_str, "%Y-%m-%d %H:%M:%S.%f")
+        seconds = time.mktime(datetime_obj.timetuple())
+        return int(seconds * 1000.0 + datetime_obj.microsecond / 1000.0)
+
+    def getPath(self, file):
+        """Return the path that starts at the first '/' of the line.
+
+        For symlink lines (type flag 'l') the `` -> target`` suffix is
+        removed.  A trailing newline is stripped.  Returns '' when the line
+        contains no '/'.
+        """
+        start = file.find('/')
+        if start == -1:
+            return ''
+        # Only symlink lines carry an arrow; a '>' in an ordinary file name
+        # must not truncate the path.
+        arrow = file.find(' -> ', start) if file[:1] == 'l' else -1
+        path = file[start:] if arrow == -1 else file[start:arrow]
+        return path.rstrip('\n')
+
+    def getFiletree(self):
+        """Run the file-tree listing command and return its output lines.
+
+        :return: list of lines (each ending in a newline) printed by
+            ``ls -lcd --full-time`` for every entry that ``find /`` reports.
+        """
+        # NUL-separated names keep paths containing whitespace or quotes
+        # intact when they are handed to xargs.
+        command = 'find / -print0 | xargs -0 ls -lcd --full-time'
+        with os.popen(command) as pipe:
+            return pipe.readlines()
\ No newline at end of file
diff --git a/p_columbus/p_columbus.py b/p_columbus/p_columbus.py
new file mode 100644
index 0000000..597764c
--- /dev/null
+++ b/p_columbus/p_columbus.py
@@ -0,0 +1,108 @@
+from sklearn.cluster import DBSCAN
+
+import category_encoders as ce
+import pandas as pd
+
+from p_columbus.filetreeinfo import FileInfo
+from columbus.columbus import columbus
+from p_columbus.applib import applabels
+
+
+# Longest common substring
+def find_lcsubstr(s1, s2):
+    """Return the length of the longest common substring of *s1* and *s2*.
+
+    Only the length is returned, not the substring itself.  Two empty or
+    completely different strings give 0.
+    """
+    longest = 0  # length of the longest match found so far
+    # prev[j + 1] is the length of the common run ending at the previous
+    # character of s1 and at s2[j]; index 0 is a zero border column.
+    prev = [0] * (len(s2) + 1)
+    for ch1 in s1:
+        cur = [0] * (len(s2) + 1)
+        for j, ch2 in enumerate(s2):
+            if ch1 == ch2:
+                cur[j + 1] = prev[j] + 1
+                if cur[j + 1] > longest:
+                    longest = cur[j + 1]
+        prev = cur
+    return longest
+
+
+def get_clusters(filelist, eps=2000, min_samples=2):
+    """Cluster the paths of a file listing with DBSCAN.
+
+    Each line is parsed with ``FileInfo``; entries under ``/sys/``,
+    ``/proc/`` and ``/var/lib/apt/lists/`` are skipped.  The features are
+    the ctime, the one-hot encoded owner and the sum of the character
+    codes of the last three path components.
+
+    :param filelist: iterable of listing lines accepted by ``FileInfo``
+    :param eps: DBSCAN neighbourhood radius
+    :param min_samples: DBSCAN minimum number of samples per core point
+    :return: list of clusters, each a list of paths; entries labelled as
+        noise (-1) are left out.  Empty when no entry remains after
+        filtering.
+    """
+    skipped_prefixes = ('/sys/', '/proc/', '/var/lib/apt/lists/')
+    infos = []
+    for file in filelist:
+        info = FileInfo(file)  # parse every line only once
+        if not info.path.startswith(skipped_prefixes):
+            infos.append(info)
+    if not infos:
+        # Nothing to cluster; the encoder and DBSCAN need at least one row.
+        return []
+
+    # Feature extraction
+    feature_dic = {'ctime': [], 'user': [], 'filename': []}
+    for info in infos:
+        feature_dic['ctime'].append(info.ctime)
+        feature_dic['user'].append(info.user)
+        # Last three path components joined together, reduced to one number.
+        filename = ''.join(info.path.split('/')[-3:])
+        feature_dic['filename'].append(sum(ord(ch) for ch in filename))
+
+    # Density-based clustering (DBSCAN's default metric is Euclidean).
+    data = pd.DataFrame.from_dict(feature_dic)
+    encoded = ce.OneHotEncoder(cols=['user'], use_cat_names=True).fit_transform(data)
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(encoded).labels_
+
+    # Group the paths by cluster label; label -1 marks noise.
+    clusters = [[] for _ in range(max(labels) + 1)]
+    for info, label in zip(infos, labels):
+        if label != -1:
+            clusters[label].append(info.path)
+    return clusters
+
+
+def p_columbus(filelist, k=10, freq_threshold=2):
+    """Identify software from a file-tree listing.
+
+    The listing is clustered with ``get_clusters``, every cluster is tagged
+    with ``columbus`` and the tags are matched against ``applabels``.
+
+    :param filelist: file tree information (listing lines)
+    :param k: number of tags kept per cluster (top-k)
+    :param freq_threshold: lower frequency bound passed to ``columbus``
+    :return: sorted list of matched software names, without duplicates
+    """
+    clusters = get_clusters(filelist)
+
+    # Keep at most k tags per cluster.
+    # NOTE(review): relies on columbus returning its tags in ranked order;
+    # confirm against the columbus implementation.
+    candidates = []
+    for cluster in clusters:
+        tag_dict = columbus(cluster, freq_threshold=freq_threshold)
+        candidates.extend(str(tag) for tag in list(tag_dict)[:k])
+
+    # Match against the software name library: a label is accepted when
+    # more than 80% of it appears as one contiguous run inside the tag.
+    matched = set()
+    for tag in candidates:
+        for label in applabels:
+            score = find_lcsubstr(label, tag) / len(label)
+            if score > 0.8:
+                matched.add(label)
+    return sorted(matched)
+
+
+