blob: 27d983b2a5368e41f45b85513be075b020dd83e8 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
import tldextract
class Extracter:
    """Extract a resource's host and tell whether it is a third party.

    An instance is bound to one website. For a resource URL it reports the
    resource's host and whether the resource's registrable domain differs
    from the website's.

    Input:  a resource URL.
    Output: resource host, isThirdParty (0 = False, 1 = True).
    """

    def __init__(self, webiste):
        """Store the website URL and pre-compute its host and domain.

        Args:
            webiste: URL of the first-party website. The misspelled
                parameter name is kept on purpose so that callers passing
                it by keyword keep working.
        """
        self.website = webiste
        self.host, self.domain = self.extract(self.website)

    @staticmethod
    def _join_labels(*labels):
        """Join the non-empty *labels* with dots.

        Skipping empty parts avoids a leading or trailing dot when a URL
        has no subdomain or no suffix.
        """
        return ".".join(label for label in labels if label)

    @staticmethod
    def extract(url):
        """Split *url* into its full host and its registrable domain.

        Args:
            url: the URL (or bare host name) to analyse.

        Returns:
            A ``(host, domain)`` tuple of strings, where ``host`` is
            ``subdomain.domain.suffix`` and ``domain`` is ``domain.suffix``.
            Empty components are left out, so a host for which tldextract
            reports no suffix (presumably ``localhost`` or a bare IP
            address) is returned as-is instead of with a trailing dot.
        """
        parts = tldextract.extract(url)
        # Read the fields by attribute rather than unpacking the result:
        # this does not depend on the result being a three-element tuple.
        host = Extracter._join_labels(parts.subdomain, parts.domain, parts.suffix)
        domain = Extracter._join_labels(parts.domain, parts.suffix)
        return host, domain

    def isThirdParty(self, url):
        """Report the host of *url* and whether it is a third-party resource.

        Args:
            url: URL of the resource loaded by the website.

        Returns:
            ``(None, None)`` when *url* is empty or ``None``; otherwise
            ``(host, flag)`` where ``flag`` is ``1`` if the resource's
            domain differs from the website's domain and ``0`` if not.
        """
        if not url:
            return None, None
        host, domain = self.extract(url)
        third_party = 1 if domain != self.domain else 0
        return host, third_party
|