1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
import copy
import sys
import os
import optparse
import pickle
import logging
import glob
import yaml
from datetime import datetime
from .trie import Trie
from .tags import filtertags
# from trie import Trie
# from tags import filtertags
# Path components that run_file_paths_discovery2 skips when it tokenizes
# file paths (they are never inserted into the trie).
FILTER_PATH_TOKENS = [
    'usr',
    'bin',
    'proc',
    'sys',
    'etc',
    'local',
    'src',
    'dev',
    'home',
    'root',
    'lib',
    'pkg',
    'sbin',
    'share',
    'cache',
]

# Module-level memo used by columbus() when use_cache=True; it is reset by
# refresh_columbus().
COLUMBUS_CACHE = {}
def is_number(s):
    """Return True if ``s`` can be interpreted as a number, else False.

    Two checks are tried in order:

    1. ``float(s)`` succeeds (e.g. ``"12"``, ``"-3.5"``, ``"1e3"``).  Note
       that ``float`` also accepts spellings such as ``"nan"`` and ``"inf"``.
    2. ``unicodedata.numeric(s)`` succeeds, i.e. ``s`` is a single Unicode
       character that carries a numeric value (e.g. ``"½"`` or ``"四"``).

    Anything rejected by both checks -- including non-string objects such
    as ``None`` -- yields False instead of raising.
    """
    # Local import: unicodedata is only needed by this helper.
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # ValueError: the text is not a float literal.
        # TypeError: ``s`` is not a string or number at all (e.g. None).
        # Either way, fall through to the Unicode check.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single character; ValueError: the character has
        # no numeric value.
        pass

    return False
def refresh_columbus():
    """Drop every memoized columbus() result.

    Rebinds the module-level ``COLUMBUS_CACHE`` name to a brand-new empty
    dict; the dict object that was bound before is not modified.
    """
    global COLUMBUS_CACHE
    COLUMBUS_CACHE = dict()
def columbus(changeset, freq_threshold=1, use_cache=False):
    """Return the tags discovered in one changeset with relative weights.

    The changeset's file paths are handed to ``run_file_paths_discovery2``
    together with the module-level ``filtertags``; each resulting tag
    frequency is then divided by the sum of all frequencies.

    This single definition replaces two identical ``columbus`` definitions
    (the second one shadowed the first).

    Args:
        changeset: Iterable of file-path strings.
        freq_threshold: Forwarded to ``run_file_paths_discovery2``.
        use_cache: When true, results are memoized in ``COLUMBUS_CACHE``.

    Returns:
        dict mapping tag -> frequency / total frequency.  The dict is empty
        when no tag is discovered.  If the frequencies sum to zero the
        values are returned unnormalized rather than dividing by zero.
        The returned dict is never the object stored in the cache, so
        callers may mutate it freely.
    """
    if use_cache:
        # Materialize once so a one-shot iterable is not exhausted by
        # sorted() before discovery runs.
        changeset = list(changeset)
        # The threshold is part of the key: the same changeset evaluated
        # with a different threshold can produce different tags.
        key = str((freq_threshold, sorted(changeset)))
        cached = COLUMBUS_CACHE.get(key)
        if cached is not None:
            return dict(cached)

    tags = run_file_paths_discovery2(
        filtertags, changeset, freq_threshold=freq_threshold)

    total = sum(tags.values())
    if total:
        tags = {token: freq / total for token, freq in tags.items()}

    if use_cache:
        # Store a copy so later mutation of the returned dict cannot
        # corrupt the memoized value.
        COLUMBUS_CACHE[key] = dict(tags)
    return tags
def run_file_paths_discovery2(filtertags, changeset, freq_threshold=1):
    """Collect candidate tags from the path components of a changeset.

    Every path in ``changeset`` is split on '/'.  Components listed in
    ``FILTER_PATH_TOKENS`` are skipped; all others are inserted into a
    ``Trie`` created with ``frequency_limit=freq_threshold``.  From the
    tags reported by the trie, each entry of ``filtertags`` and each key
    accepted by ``is_number`` is removed.

    Args:
        filtertags: Iterable of tag names to drop from the result.
        changeset: Iterable of '/'-separated path strings.
        freq_threshold: Passed to ``Trie`` as ``frequency_limit``.

    Returns:
        dict of the remaining tags, with the values reported by
        ``Trie.get_all_tags()``.
    """
    trie = Trie(frequency_limit=freq_threshold)
    for path in changeset:
        # NOTE(review): a leading or doubled '/' yields an empty component,
        # which is inserted as well -- confirm Trie.insert('') is harmless.
        for part in path.split('/'):
            if part in FILTER_PATH_TOKENS:
                continue
            trie.insert(part)

    candidates = trie.get_all_tags()

    # Only exact matches are removed here; tags that merely start with a
    # filtered tag are kept.
    for unwanted in filtertags:
        candidates.pop(unwanted, None)

    # Drop purely numeric names.
    kept = {}
    for name, value in candidates.items():
        if not is_number(name):
            kept[name] = value
    return kept
|