summaryrefslogtreecommitdiff
path: root/columbus/trie.py
blob: c62805d019bb8adcaf7b501fa2e1de056642a884 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class TrieEntry:
    def __init__(self):
        self.count = 1
        self.child_list = []

class Trie:
    def __init__(self, frequency_limit=1):
        self.map = {}
        self.frequency_limit = frequency_limit

    def insert(self, token):
        if not token:
            return
        token = token.replace(':', '').replace('|', '').lower()
        parent = None
        while len(token) >= 3:
            if token in self.map:
                self.map[token].count += 1
            else:
                self.map[token] = TrieEntry()
            if parent and parent not in self.map[token].child_list:
                self.map[token].child_list.append(parent)
            parent = token
            token = token[:-1]

    def get_all_tags(self):
        results = {}
        for token, entry in self.map.items():
            # if len(entry.child_list) >= self.frequency_limit:
            #     results[token] = entry.count
            if len(entry.child_list)>0:
                for parent in entry.child_list:
                    if self.map[parent].count<entry.count and entry.count>self.frequency_limit:
                        results[token]=entry.count
                        break
        return dict(sorted(results.items(),key=lambda item:item[1],reverse=True))