summaryrefslogtreecommitdiff
path: root/Tools/domain_extract.py
blob: 27d983b2a5368e41f45b85513be075b020dd83e8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

import tldextract


class Extracter:
    """Extract a URL's host and decide whether it is a third-party resource.

    An instance is bound to one website. Resource URLs handed to
    ``isThirdParty`` are compared against that website's domain, where
    "domain" means the domain label plus the suffix as split by
    ``tldextract`` (for example ``example.co.uk``).

    Attributes:
        website: The website URL the instance was created with.
        host: Host of ``website`` (subdomain, domain and suffix joined by dots).
        domain: Domain of ``website`` (domain and suffix joined by dots).
    """

    def __init__(self, webiste):
        """Store the website and precompute its host and domain.

        Args:
            webiste: URL of the first-party website. The misspelled
                parameter name is kept so keyword callers keep working.
        """
        self.website = webiste
        self.host, self.domain = self.extract(self.website)

    @staticmethod
    def _join_labels(*labels):
        """Join the non-empty labels with dots, skipping empty ones."""
        return ".".join(label for label in labels if label)

    @staticmethod
    def extract(url):
        """Split a URL into its host and its domain.

        Args:
            url: The URL (or bare host name) to analyse.

        Returns:
            A ``(host, domain)`` tuple of strings. ``host`` is the subdomain,
            domain and suffix joined by dots; ``domain`` is the domain and
            suffix joined by dots. Empty parts are skipped, so a name without
            a suffix (such as ``localhost``) yields ``"localhost"`` rather
            than ``"localhost."``.
        """
        parts = tldextract.extract(url)
        # Read the fields by name instead of unpacking the result: unpacking
        # only works while the result iterates as exactly three values.
        subdomain, domain, suffix = parts.subdomain, parts.domain, parts.suffix
        host = Extracter._join_labels(subdomain, domain, suffix)
        registered = Extracter._join_labels(domain, suffix)
        return host, registered

    def isThirdParty(self, url):
        """Report the host of a resource URL and whether it is third-party.

        Args:
            url: The resource URL to check.

        Returns:
            ``(None, None)`` when ``url`` is empty or ``None``; otherwise a
            ``(host, flag)`` tuple where ``flag`` is ``1`` if the resource's
            domain differs from this instance's ``domain`` and ``0`` if it
            matches.
        """
        if not url:
            return None, None
        host, domain = self.extract(url)
        # Callers receive the flag as an int (0 = first party, 1 = third party).
        return host, int(domain != self.domain)