# columbus/columbus.py

import copy
import sys
import os
import optparse
import pickle
import logging
import glob
import yaml
from datetime import datetime

from .trie import Trie
from .tags import filtertags

# from trie import Trie
# from tags import filtertags

# Path components that run_file_paths_discovery2 skips when splitting file
# paths into tokens (common Unix directory names such as "usr" or "bin").
FILTER_PATH_TOKENS = ['usr', 'bin', 'proc', 'sys', 'etc', 'local', 'src',
                      'dev', 'home', 'root', 'lib', 'pkg', 'sbin', 'share',
                      'cache']

# Module-level cache used by columbus() when use_cache=True; replaced with a
# fresh dict by refresh_columbus().
COLUMBUS_CACHE = {}
def is_number(s):
    """Return True if *s* can be interpreted as a number, otherwise False.

    Two checks are tried in order:

    1. ``float(s)`` -- accepts ordinary numeric literals such as ``"12"``,
       ``"-3.5"`` or ``"1e3"`` (and, as a consequence of using ``float``,
       also ``"nan"`` and ``"inf"``).
    2. ``unicodedata.numeric(s)`` -- accepts a single Unicode character that
       carries a numeric value, e.g. a CJK numeral or a vulgar fraction.

    Any other input, including non-string values such as ``None``, yields
    False instead of raising.
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # ValueError: s is a string that is not a numeric literal.
        # TypeError: s is not a string or number at all (e.g. None).
        pass

    # Imported here because only this fallback path needs the Unicode
    # character database.
    import unicodedata

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not a single character.
        # ValueError: the character has no numeric value.
        pass
    return False

def refresh_columbus():
    """Drop every cached columbus() result.

    The module-level COLUMBUS_CACHE name is rebound to a brand-new empty
    dict; the previous dict object is left untouched.
    """
    global COLUMBUS_CACHE
    COLUMBUS_CACHE = dict()


def columbus(changeset, freq_threshold=1, use_cache=False):
    """Get normalised tag weights from a single changeset.

    The changeset is passed to run_file_paths_discovery2 together with the
    module-level ``filtertags``; the values of the resulting mapping are then
    divided by their total so that they sum to 1.

    Args:
        changeset: iterable of file path strings. It is sorted to build the
            cache key when ``use_cache`` is true.
        freq_threshold: forwarded to run_file_paths_discovery2.
        use_cache: when true, look the result up in / store it into
            COLUMBUS_CACHE. The cached dict object itself is returned on a
            hit, so callers should treat the result as read-only.

    Returns:
        dict mapping each tag to its share of the total. An empty dict is
        returned when no tags are found.
    """
    key = None
    if use_cache:
        # freq_threshold is part of the key: the same changeset can produce
        # different tags under a different threshold.
        key = (freq_threshold, str(sorted(changeset)))
        if key in COLUMBUS_CACHE:
            return COLUMBUS_CACHE[key]

    tags = run_file_paths_discovery2(
        filtertags, changeset, freq_threshold=freq_threshold)

    total = sum(tags.values())
    if total:
        tags = {token: freq / total for token, freq in tags.items()}
    # A zero total (no tags, or only zero-valued ones) leaves nothing to
    # scale, so the mapping is returned as-is rather than dividing by zero.

    if use_cache:
        # Store the normalised mapping explicitly so cache hits and misses
        # return the same values.
        COLUMBUS_CACHE[key] = tags
    return tags


def run_file_paths_discovery2(filtertags, changeset, freq_threshold=1):
    """Collect candidate tags from the path tokens of a changeset.

    Every path in ``changeset`` is split on '/', and each token that is not
    listed in FILTER_PATH_TOKENS is inserted into a Trie created with
    ``frequency_limit=freq_threshold``. The mapping returned by the trie's
    ``get_all_tags()`` is then pruned in two steps:

    * keys equal to an entry of ``filtertags`` are removed (exact match only;
      prefix matching is not applied);
    * keys for which ``is_number`` is true are removed.

    Args:
        filtertags: iterable of tag names to discard.
        changeset: iterable of '/'-separated path strings.
        freq_threshold: frequency limit handed to the Trie.

    Returns:
        A new dict holding the surviving key/value pairs from
        ``get_all_tags()`` (values are presumably frequencies -- confirm
        against the Trie implementation).
    """
    trie = Trie(frequency_limit=freq_threshold)
    for path in changeset:
        for part in path.split('/'):
            if part in FILTER_PATH_TOKENS:
                continue
            trie.insert(part)

    candidates = trie.get_all_tags()
    for unwanted in filtertags:
        candidates.pop(unwanted, None)

    kept = {}
    for name, value in candidates.items():
        if is_number(name):
            continue
        kept[name] = value
    return kept