diff options
| author | liliqing <[email protected]> | 2023-06-26 00:51:47 +0800 |
|---|---|---|
| committer | liliqing <[email protected]> | 2023-06-26 00:51:47 +0800 |
| commit | 623ad2089d6cfc06b0324ce00c4cb5cf4f0db6a7 (patch) | |
| tree | fa4d85096a6c973698562e20a37fbfd2c1b3163e /p_columbus | |
| parent | 50abdae2c22a190fef9afcb0c66791b514a2709f (diff) | |
Diffstat (limited to 'p_columbus')
| -rw-r--r-- | p_columbus/__pycache__/applib.cpython-37.pyc | bin | 0 -> 1031 bytes | |||
| -rw-r--r-- | p_columbus/__pycache__/filetreeinfo.cpython-37.pyc | bin | 0 -> 2466 bytes | |||
| -rw-r--r-- | p_columbus/__pycache__/p_columbus.cpython-37.pyc | bin | 0 -> 3586 bytes | |||
| -rw-r--r-- | p_columbus/applib.py | 1 | ||||
| -rw-r--r-- | p_columbus/filetreeinfo.py | 58 | ||||
| -rw-r--r-- | p_columbus/p_columbus.py | 108 |
6 files changed, 167 insertions, 0 deletions
diff --git a/p_columbus/__pycache__/applib.cpython-37.pyc b/p_columbus/__pycache__/applib.cpython-37.pyc Binary files differnew file mode 100644 index 0000000..8644c3c --- /dev/null +++ b/p_columbus/__pycache__/applib.cpython-37.pyc diff --git a/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc b/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc Binary files differnew file mode 100644 index 0000000..201f938 --- /dev/null +++ b/p_columbus/__pycache__/filetreeinfo.cpython-37.pyc diff --git a/p_columbus/__pycache__/p_columbus.cpython-37.pyc b/p_columbus/__pycache__/p_columbus.cpython-37.pyc Binary files differnew file mode 100644 index 0000000..c69bd15 --- /dev/null +++ b/p_columbus/__pycache__/p_columbus.cpython-37.pyc diff --git a/p_columbus/applib.py b/p_columbus/applib.py new file mode 100644 index 0000000..b8e9bac --- /dev/null +++ b/p_columbus/applib.py @@ -0,0 +1 @@ +applabels=['ansible', 'apache2', 'arpwatch', 'bacula-client', 'bind9', 'cobertura', 'cockroachdb', 'containerd', 'coredns', 'crda', 'curl', 'dcraw', 'deno', 'docker', 'docker.io', 'dovecot-core', 'dstat', 'emacs-nox', 'etcd', 'finger', 'flutter', 'freeradius', 'go', 'haproxy', 'iotop', 'iptraf-ng', 'kafka', 'ksh', 'kubernetes', 'lftp', 'lighttpd', 'logwatch', 'lshw', 'lynx', 'marisa', 'memcached', 'mgetty-voice', 'mtr', 'mutt', 'nginx', 'nmap', 'nodejs', 'openjdk-8-jdk', 'orientdb', 'pax', 'postgresql', 'powertop', 'prometheus', 'puppet', 'python3', 'qemu-kvm', 'quagga', 'quota', 'rabbitmq-server', 'rdist', 'redis', 'rocketmq', 'sane', 'smartmontools', 'squid', 'subversion', 'supermin', 'sysstat', 'thrift', 'tomcat', 'traceroute', 'tree', 'tshark', 'units', 'unzip', 'vim', 'vsftpd', 'watchdog', 'wget', 'wordpress', 'zsh','tensorflow',"anaconda",'apache',"zookeeper"]
# p_columbus/filetreeinfo.py
import os
from datetime import datetime
import time


class FileInfo:
    """One line of ``ls -lcd --full-time`` output, parsed into fields.

    The constructor takes the complete listing line for a single entry
    (as produced by :meth:`getFiletree`) and extracts every attribute
    from it.  The line is expected to contain an absolute path, i.e. the
    first ``/`` in the line marks the start of the path.
    """

    # Class-level defaults; every instance overwrites them in __init__.
    filetype = ''
    authority = ''
    nohardlink = 0
    user = ''
    group = ''
    size = 0
    ctime = 0
    path = ''

    def __init__(self, file):
        """Parse ``file``, one whole attribute line of the listing."""
        self.filetype = self.getFileType(file)
        self.authority = self.getAuthority(file)
        self.nohardlink = self.getNohardlink(file)
        self.user = self.getUser(file)
        self.group = self.getGroup(file)
        self.size = self.getSize(file)
        self.ctime = self.getCtime(file)
        self.path = self.getPath(file)

    def getFileType(self, file):
        """Return the first character of the mode column (``-``, ``d``, ``l``, ...)."""
        return file[0]

    def getAuthority(self, file):
        """Return the nine permission characters that follow the type character."""
        return file[1:10]

    def getNohardlink(self, file):
        """Return the second column (the hard-link count) as an int."""
        # split() without an argument collapses runs of blanks; ls pads
        # its columns for alignment, which split(' ') turned into empty
        # fields and a ValueError in int().
        return int(file.split()[1])

    def getUser(self, file):
        """Return the third column (owner)."""
        return file.split()[2]

    def getGroup(self, file):
        """Return the fourth column (group)."""
        return file.split()[3]

    def getSize(self, file):
        """Return the fifth column as an int, ignoring any comma.

        For lines whose fifth column looks like ``"1,"`` (the
        ``major, minor`` form) this yields the number before the comma.
        """
        return int(file.split()[4].replace(',', ''))

    def getCtime(self, file):
        """Return the listed timestamp as integer milliseconds.

        The timestamp is located relative to the first ``/`` of the
        line: ``YYYY-MM-DD HH:MM:SS.fffffffff +ZZZZ `` occupies the 36
        characters before the path, and only the first 23 of them
        (down to milliseconds) are parsed.

        NOTE(review): the ``+ZZZZ`` offset is not read; ``time.mktime``
        interprets the value in the local timezone of the running host.
        """
        ads = file.index('/')
        ctime_str = file[ads - 36:ads - 13].strip()
        datetime_obj = datetime.strptime(ctime_str, "%Y-%m-%d %H:%M:%S.%f")
        return int(time.mktime(datetime_obj.timetuple()) * 1000.0
                   + datetime_obj.microsecond / 1000.0)

    def getPath(self, file):
        """Return the path of the entry, without link target or line ending."""
        start = file.find('/')
        end = None
        # Only symlink lines carry a " -> target" suffix.  Looking for a
        # bare '>' on every line would cut regular paths containing '>'.
        if file.startswith('l'):
            arrow = file.find(' -> ', start)
            if arrow != -1:
                end = arrow
        # Lines coming from readlines() still end with a newline, which
        # must not become part of the path.
        return file[start:end].rstrip('\r\n')

    def getFiletree(self):
        """Run the listing command over ``/`` and return its output lines.

        NOTE(review): the pipeline splits on whitespace, so file names
        containing blanks are passed to ``ls`` in pieces; consider
        ``find / -print0 | xargs -0 ...`` if that matters.
        """
        # The context manager closes the pipe once all lines are read.
        with os.popen('find / | xargs ls -lcd --full-time ') as pipe:
            return pipe.readlines()
# p_columbus/p_columbus.py
from sklearn.cluster import DBSCAN

import category_encoders as ce
import pandas as pd

from p_columbus.filetreeinfo import FileInfo
from columbus.columbus import columbus
from p_columbus.applib import applabels


def find_lcsubstr(s1, s2):
    """Return the length of the longest common substring of ``s1`` and ``s2``.

    Only the length is returned, not the substring itself.  Classic
    dynamic programme, kept to two rows because each cell depends only
    on the previous row.
    """
    best = 0
    prev = [0] * (len(s2) + 1)  # extra leading column simplifies the recurrence
    for ch1 in s1:
        cur = [0] * (len(s2) + 1)
        for j, ch2 in enumerate(s2):
            if ch1 == ch2:
                cur[j + 1] = prev[j] + 1
                if cur[j + 1] > best:
                    best = cur[j + 1]
        prev = cur
    return best


def get_clusters(filelist, eps=2000, min_samples=2):
    """Group listing lines into clusters of paths with DBSCAN.

    :param filelist: iterable of listing lines, each parseable by ``FileInfo``
    :param eps: DBSCAN neighbourhood radius (Euclidean distance)
    :param min_samples: DBSCAN minimum neighbourhood size
    :return: list of clusters, each a list of paths; noise points
             (label ``-1``) are dropped
    """
    skipped_prefixes = ('/sys/', '/proc/', '/var/lib/apt/lists/')

    # Parse every line exactly once and drop the ignored directories.
    entries = []
    for file in filelist:
        if not file.strip():
            continue  # blank lines carry no entry
        fileinfo = FileInfo(file)
        if fileinfo.path.startswith(skipped_prefixes):
            continue
        entries.append(fileinfo)
    if not entries:
        # Nothing to cluster; the encoder/DBSCAN cannot be fitted on no rows.
        return []

    # Feature extraction: timestamp, owner and a checksum of the path tail.
    feature_dic = {'ctime': [], 'user': [], 'filename': []}
    for item in entries:
        feature_dic['ctime'].append(item.ctime)
        feature_dic['user'].append(item.user)
        filename = ''.join(item.path.split('/')[-3:])  # last three path components
        feature_dic['filename'].append(sum(ord(chara) for chara in filename))

    # Density clustering on the one-hot encoded feature table.
    data = pd.DataFrame.from_dict(feature_dic)
    vec = ce.OneHotEncoder(cols=['user'], use_cat_names=True).fit_transform(data)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(vec).labels_

    # Turn the label vector into one list of paths per cluster.
    clusters = [[] for _ in range(int(max(labels)) + 1)]
    for item, label in zip(entries, labels):
        if label != -1:
            clusters[label].append(item.path)
    return clusters


def p_columbus(filelist, k=10, freq_threshold=2):
    '''
    Discover software names from a file-tree listing.

    :param filelist: file-tree listing lines
    :param k: number of tags kept per cluster (top-k)
    :param freq_threshold: lower frequency bound passed to ``columbus``
    :return: list of matched software labels, sorted and without duplicates
    '''
    clusters = get_clusters(filelist)

    # Keep at most k tags per cluster.  The tag names are used directly
    # rather than via a "tag:freq" string, so names that contain ':'
    # are not truncated.
    # NOTE(review): "top-k" relies on the iteration order of the mapping
    # returned by columbus() - confirm that it is ordered by frequency.
    candidates = set()
    for cluster in clusters:
        tag_dict = columbus(cluster, freq_threshold=freq_threshold)
        candidates.update(str(tag) for tag in list(tag_dict)[:k])

    # Match the candidates against the known software labels: a label
    # matches when more than 80% of it occurs as one substring of a tag.
    matched = set()
    for pre_label in candidates:
        for label in applabels:
            if not label:
                continue  # avoid dividing by zero on an empty label
            score = find_lcsubstr(label, pre_label) / len(label)
            if score > 0.8:
                matched.add(label)
    return sorted(matched)
