diff options
| -rw-r--r-- | WebHopper.py | 150 |
1 files changed, 0 insertions, 150 deletions
# WebHopper.py -- contents of the file removed by
# `diff --git a/WebHopper.py b/WebHopper.py` (deleted file mode 100644, index e583285..0000000).

import os
import pandas as pd
import threading
import requests
import builtwith
import whois
import argparse
import collections
from collector.Peeper import Peeper
from analyzer.get_chain import GetService
from Infra_analyzer.caLookup import CertResolver
from Infra_analyzer.dnsLookup import DNSResolver
from concurrent.futures import ThreadPoolExecutor
from Tools.adt.ATFilter import AdFilter, TrackerFilter


# Serialises row appends to ToCSV.df so lookUp can be called from several threads.
lock = threading.Lock()


class ToCSV:
    """Collect one row of certificate and DNS information per resource in a DataFrame."""

    def __init__(self):
        # One row per resource; the column order must match the list built in lookUp.
        self.df = pd.DataFrame(columns=["resource_url", "isThirdParty", "resource_type",
                                        "CA_url", "Issuer", "OCSP", "CDP", "NS", "isAd", "isTracker"])
        self.crtlook = CertResolver()
        self.dnslook = DNSResolver()
        self.ad = AdFilter()
        self.tr = TrackerFilter()

    def lookUp(self, value):
        """Resolve certificate and name-server data for one resource and append it to ``self.df``.

        Args:
            value: mapping with at least the keys ``"resource_url"``,
                ``"isThirdParty"`` and ``"resource_type"``.
        """
        resource_url = value["resource_url"]
        ca_url, issuer, ocsp, crl = self.crtlook.get_CRL_OSCP(resource_url)
        ns = self.dnslook.get_NS(resource_url)
        # Ad/tracker classification is disabled here, so both flags are stored as False:
        # is_ad = self.ad.blocker.should_block(value["resource_url"])
        # is_tr = self.tr.blocker.should_block(value["resource_url"])
        row = [resource_url, value["isThirdParty"], value["resource_type"],
               ca_url, issuer, ocsp, str(crl), str(ns), False, False]
        # `with` guarantees the lock is released even if the append raises; a bare
        # acquire()/release() pair would leave it held and block every other caller.
        with lock:
            self.df.loc[self.df.shape[0]] = row


def chainAna(results):
    """Build the parent chains of the resources in ``results``.

    Args:
        results: mapping whose values are dicts with at least the keys
            ``"parent"`` (a key of ``results``, or ``"0"``) and ``"resource_url"``.

    Returns:
        A list of chains. Each chain is a list of URLs that starts at a resource
        and follows its recorded parent URLs; only chains with at least two
        entries are returned (and printed).
    """
    # Map each resource URL to the URL of its parent.
    chain = collections.defaultdict(str)
    for value in results.values():
        parent = value["parent"]
        if parent == "0":
            continue
        # NOTE(review): a parent is followed only when both its id and the
        # "r"-prefixed id are keys of `results` -- confirm this is intended.
        if parent not in results or "r" + parent not in results:
            continue
        chain[value["resource_url"]] = results[parent]["resource_url"]

    chains = []
    for key, value in chain.items():
        li = [key]
        # Walk upwards until the link is empty, "0", unknown, ends in "/",
        # or would revisit a URL already in this chain.
        while value and value != "0" and value != key and value in chain and value[-1] != "/" and value not in li:
            li.append(value)
            value = chain[value]
            # Hard cap on the chain length.
            if len(li) > 20:
                break
        if len(li) >= 2:
            print(li)
            chains.append(li)
    return chains
def _write_text(path, text):
    """Write ``text`` to ``path`` as UTF-8, closing the file even if the write fails."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def page_resource(path, dirname, sp):
    """Analyse one captured page log and write its resource table and parent chains.

    Args:
        path: path handed to ``GetService().run``.
        dirname: base name of the CSV file that is written.
        sp: output prefix (including any trailing separator); the function
            writes ``sp + dirname + ".csv"`` and ``sp + "chain.txt"``.
    """
    dumper = ToCSV()
    results = GetService().run(path)

    js_rank = []
    seen = set()
    pending = []
    # Leaving the `with` block waits for every submitted lookup to finish.
    with ThreadPoolExecutor(max_workers=7) as pool:
        for value in results.values():
            url = value["resource_url"]
            # Skip empty URLs and URLs that were already queued.
            if not url or url in seen:
                continue
            seen.add(url)
            if value["resource_type"] == 1:
                js_rank.append((url, value["score"]))
            pending.append((url, pool.submit(dumper.lookUp, value)))

    # A failed lookup should not abort the run, but it must not vanish silently either.
    for url, future in pending:
        error = future.exception()
        if error is not None:
            print(f"lookup failed for {url}: {error!r}")

    # Highest score first.
    js_rank.sort(key=lambda item: item[1], reverse=True)
    print("-----------------js排名情况------------------")
    for js, _ in js_rank:
        print(js)
    dumper.df.to_csv(sp + dirname + ".csv", index=False)

    print("-----------------引用链------------------")
    chains = chainAna(results)
    with open(sp + "chain.txt", "w", encoding="utf-8") as f:
        f.writelines(str(chain) + "\n" for chain in chains)


def run(domain, root="/Users/mazeyu/rendering_stream/"):
    """Capture ``domain`` and dump resource, whois, response-header and component reports.

    Args:
        domain: host name without scheme; ``"http://"`` is prepended.
        root: directory under which ``Peeper`` output is looked up. Defaults to
            the path that was previously hard-coded.
    """
    url = "http://" + domain
    me = Peeper()
    dirname, today = me.peeping(url)
    rdir = os.path.join(root, dirname)
    # Takes the first directory entry; os.listdir order is arbitrary, so this
    # presumably relies on the directory holding a single log -- TODO confirm.
    filename = os.listdir(rdir)[0]

    path = os.path.join(rdir, filename)
    sp = "./result/" + dirname + today + "/"
    # Nothing else in this file creates the output directory.
    os.makedirs(sp, exist_ok=True)
    page_resource(path, dirname, sp)

    print("-----------------whois信息------------------")
    wh = whois.whois(domain)
    print(wh)
    _write_text(sp + "whois", str(wh))

    print("-----------------响应头------------------")
    # The header name must be "User-Agent"; the previous key "headers" sent a
    # header literally called "headers" and left the default User-Agent in place.
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1.2 Safari/605.1.15"
    }
    # Without a timeout requests can block indefinitely on an unresponsive host.
    req = requests.get(url, headers=header, timeout=30)
    for key, value in req.headers.items():
        print(key, value)
    _write_text(sp + "header", str(req.headers))

    print("-----------------组件使用情况------------------")
    components = builtwith.parse(url)
    for key, value in components.items():
        print(key, value)
    _write_text(sp + "component", str(components))


# Sample direct invocation on an existing log (stale: it omits the `sp` argument
# that page_resource requires):
# page_resource("/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json", "www.shandong-energy.com")


if __name__ == "__main__":
    # The string below is a no-op note list; roughly: api, login annotation,
    # scoring status, wapp, linux, easylist update, domain name.
    """
    api
    登录注释
    算分情况
    wapp
    linux
    easylist更新
    域名
    """
    # Alternative targets:
    # run("www.baidu.com")
    run("wanfangdata.com.cn")
    # run("csdn.net")
    # run("www.bilibili.com")
    # run("www.piaoliang.com")
