diff options
Diffstat (limited to 'columbus/columbus.py')
| -rw-r--r-- | columbus/columbus.py | 95 |
1 files changed, 95 insertions, 0 deletions
"""Derive soft tags (labels) from the file paths of a changeset.

Path components are inserted into a ``Trie``; the tags it reports are then
filtered and converted to relative frequencies.
"""
import copy
import glob
import logging
import optparse
import os
import pickle
import sys
import unicodedata
from datetime import datetime

import yaml

from .trie import Trie
from .tags import filtertags

# Alternative imports, apparently for running this file outside the package:
# from trie import Trie
# from tags import filtertags

# Path components that are never inserted into the trie.
FILTER_PATH_TOKENS = ['usr', 'bin', 'proc', 'sys', 'etc', 'local', 'src',
                      'dev', 'home', 'root', 'lib', 'pkg', 'sbin', 'share',
                      'cache']

# Memoised results of ``columbus``; reset with ``refresh_columbus``.
COLUMBUS_CACHE = {}


def is_number(s):
    """Return True if *s* can be interpreted as a number.

    A string counts as a number when ``float(s)`` accepts it, or when it is
    a single character for which ``unicodedata.numeric`` returns a value.

    NOTE: ``float`` also accepts spellings such as ``"nan"`` and ``"inf"``,
    so those tokens are reported as numbers as well.
    """
    try:
        float(s)
        return True
    except ValueError:
        # Not a float literal; fall through to the Unicode numeric check.
        pass
    try:
        # Raises TypeError for anything that is not a one-character string
        # and ValueError for characters without a numeric value.
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False


def refresh_columbus():
    """Discard every cached ``columbus`` result."""
    global COLUMBUS_CACHE
    COLUMBUS_CACHE = {}


def columbus(changeset, freq_threshold=1, use_cache=False):
    """Get labels from a single changeset as relative frequencies.

    Args:
        changeset: iterable of '/'-separated file paths.
        freq_threshold: passed to the trie as its ``frequency_limit``.
        use_cache: when true, results are memoised in ``COLUMBUS_CACHE``.

    Returns:
        A dict mapping each tag to its frequency divided by the sum of all
        tag frequencies. With ``use_cache`` the returned dict is a copy, so
        callers may mutate it without corrupting the cache.
    """
    key = None
    if use_cache:
        # The threshold is part of the key: the same changeset can yield
        # different tags under a different threshold.
        key = (freq_threshold, str(sorted(changeset)))
        if key in COLUMBUS_CACHE:
            return dict(COLUMBUS_CACHE[key])

    tags = run_file_paths_discovery2(
        filtertags, changeset, freq_threshold=freq_threshold)

    total = sum(tags.values())
    if total:
        # Skipped when the total is zero (no tags, or all-zero counts) to
        # avoid dividing by zero.
        tags = {token: freq / total for token, freq in tags.items()}

    if use_cache:
        COLUMBUS_CACHE[key] = tags
        return dict(tags)
    return tags


def run_file_paths_discovery2(filtertags, changeset, freq_threshold=1):
    """Collect tags from the path components of *changeset*.

    Every path is split on '/', components listed in ``FILTER_PATH_TOKENS``
    are skipped, and the rest are inserted into a ``Trie`` created with
    ``frequency_limit=freq_threshold``. From the tags the trie reports,
    entries named in *filtertags* and entries whose key is numeric (see
    ``is_number``) are removed.

    Returns:
        A dict of the remaining tags and their values as reported by
        ``Trie.get_all_tags`` (presumably occurrence counts; confirm
        against the ``Trie`` implementation).
    """
    ftrie = Trie(frequency_limit=freq_threshold)
    for filepath in changeset:
        for token in filepath.split('/'):
            if token not in FILTER_PATH_TOKENS:
                ftrie.insert(token)

    softtags = ftrie.get_all_tags()
    for tag in filtertags:
        softtags.pop(tag, None)
    return {key: value for key, value in softtags.items()
            if not is_number(key)}
