| author | 马泽宇 <[email protected]> | 2019-04-04 14:37:25 +0800 |
|---|---|---|
| committer | 马泽宇 <[email protected]> | 2019-04-04 14:37:25 +0800 |
| commit | 425137e03fe73d3d65aa94164f1fdfee3d7c530d | |
| tree | 539cf7343ba8cb79cbb39b8bc4cf83ebd97e4688 | |
| parent | c2e8089398fbefcca30b2e208d5abc67d55de2e8 | |
Upload New File
| -rw-r--r-- | fingerprint.py | 279 |
1 file changed, 279 insertions(+), 0 deletions(-)
diff --git a/fingerprint.py b/fingerprint.py
new file mode 100644
index 0000000..49b01e5
--- /dev/null
+++ b/fingerprint.py
@@ -0,0 +1,279 @@
+# -*- coding: UTF-8 -*-
+
+import optparse
+import urllib.request
+import ssl
+import re
+import os
+import pymysql
+import json
+import eventlet
+from lxml import etree
+
+# eventlet backs the regex timeout in contains(); patch once at import
+# time instead of on every call.
+eventlet.monkey_patch()
+
+
+class Spider():
+
+    def __init__(self):
+        # builtwith, similartech and wappalyzer results
+        self.b_tech = []
+        self.s_tech = []
+        self.wapp_tech = {}
+        self.sum_tech = []
+        # final result, grouped by category
+        self.output = {}
+        # url list for batch mode
+        self.urls = []
+        # load the Wappalyzer-style fingerprint database
+        self.apps = self.load_apps()
+
+    def part_init(self):
+        """Reset per-site state between queries."""
+        self.b_tech = []
+        self.s_tech = []
+        self.wapp_tech = {}
+        self.sum_tech = []
+        self.output = {}
+
+    def req(self, url, tag):
+        """Download url and dispatch the response by tag:
+        1 = builtwith.com, 0 = similartech.com, 2 = the target itself."""
+        header = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
+        }
+        context = ssl._create_unverified_context()
+        request = urllib.request.Request(url, headers=header)
+        try:
+            response = urllib.request.urlopen(request, timeout=30, context=context)
+            html = response.read()
+            headers = response.headers
+        except Exception:
+            return
+        if tag == 2:
+            self.wapp(url, headers, html)
+        else:
+            self.extract(html, tag)
+
+    def extract(self, html, tag):
+        """Extract technology names from a builtwith or similartech page."""
+        html = etree.HTML(html)
+        if tag:
+            self.b_tech = html.xpath('//div[@class="row mb-2 mt-2"]/div[@class="col-12"]/h2/a[@class="text-dark"]/text()')
+            self.b_tech = [t.strip(' ') for t in self.b_tech]
+        else:
+            self.s_tech = html.xpath('//div[@class="col-md-9"]/div[@class="item-title"]/a[@class="tech-name"]/text()')
+            self.s_tech = [t.strip(' ') for t in self.s_tech]
+
+    def one_query(self, url, save_tag):
+        self.req("https://builtwith.com/" + url, 1)
+        self.req("https://www.similartech.com/websites/" + url, 0)
+        self.req("http://" + url, 2)
+        self.database(url, save_tag)
+
+    def group_query(self, filename, save_tag):
+        self.loadlist(filename)
+        for url in self.urls:
+            self.one_query(url, save_tag)
+
+    def sum(self):
+        """Merge the three result sets, deduplicated, order preserved."""
+        self.output['othertech'] = []
+        sum_tech = self.b_tech + self.s_tech
+        for key in self.wapp_tech:
+            for tech in self.wapp_tech[key]:
+                sum_tech.append(tech)
+        self.sum_tech = sorted(set(sum_tech), key=sum_tech.index)
+
+    def database(self, url, save_tag):
+        """Look each detected technology up in MySQL and group results
+        by subcategory, falling back to category."""
+        self.sum()
+        config = {
+            "host": "localhost",
+            "user": "root",
+            "password": "xiaoshitou",
+            "database": "Fingerprint"
+        }
+        try:
+            db = pymysql.connect(**config)
+        except pymysql.Error:
+            print("connection failed")
+            return
+        cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
+        sql = "SELECT * FROM TECH_1 WHERE name=%s"
+        for tech in self.sum_tech:
+            cursor.execute(sql, (tech,))
+            techs = cursor.fetchone()
+            if techs:
+                key = techs['subcategory'] or techs['category']
+                if key not in self.output:
+                    self.output[key] = [techs['name']]
+                else:
+                    self.output[key].append(techs['name'])
+            else:
+                self.output['othertech'].append(tech)
+        print(url, ':', self.output)
+        self.save_result(url, save_tag)
+        cursor.close()
+        db.close()
+
+    def loadlist(self, filename):
+        with open(filename, 'r') as f:
+            self.urls = [u.strip() for u in f.read().split('\n') if u.strip()]
+
+    def save_result(self, url, save_tag):
+        if save_tag:
+            with open(save_tag, 'a') as f:
+                f.write(url + '\n')
+                f.write(str(self.output) + '\n')
+        self.part_init()
+
+    def load_apps(self, filename='apps.json.py'):
+        filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
+        with open(filename) as f:
+            return json.load(f)
+
+    def wapp(self, url, headers, html):
+        """Wappalyzer-style detection: match the fingerprint database
+        against the URL, the response headers and the HTML body."""
+        for app_name, app_spec in self.apps['apps'].items():
+            if 'url' in app_spec:
+                if self.contains(url, app_spec['url']):
+                    self.add_app(app_name, app_spec)
+
+        if headers:
+            for app_name, app_spec in self.apps['apps'].items():
+                if 'headers' in app_spec:
+                    if self.contains_dict(headers, app_spec['headers']):
+                        self.add_app(app_name, app_spec)
+
+        if html:
+            for app_name, app_spec in self.apps['apps'].items():
+                for key in 'html', 'script':
+                    snippets = app_spec.get(key, [])
+                    if not isinstance(snippets, list):
+                        snippets = [snippets]
+                    for snippet in snippets:
+                        if self.contains(html, snippet):
+                            self.add_app(app_name, app_spec)
+                            break
+
+            if isinstance(html, bytes):
+                try:
+                    html = html.decode('utf8')
+                except UnicodeDecodeError:
+                    html = html.decode('gbk', 'ignore')
+            metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE).findall(html))
+            for app_name, app_spec in self.apps['apps'].items():
+                for name, content in app_spec.get('meta', {}).items():
+                    if name in metas:
+                        if self.contains(metas[name], content):
+                            self.add_app(app_name, app_spec)
+                            break
+
+    def add_app(self, app_name, app_spec):
+        """Record this app under each of its categories, then recurse
+        into any technologies it implies."""
+        for category in self.get_categories(app_spec):
+            if category not in self.wapp_tech:
+                self.wapp_tech[category] = []
+            if app_name not in self.wapp_tech[category]:
+                self.wapp_tech[category].append(app_name)
+                implies = app_spec.get('implies', [])
+                if not isinstance(implies, list):
+                    implies = [implies]
+                for implied in implies:
+                    implied = implied.split('\\;')[0]
+                    if implied in self.apps['apps']:
+                        self.add_app(implied, self.apps['apps'][implied])
+
+    def get_categories(self, app_spec):
+        """Return category names for this app_spec."""
+        return [self.apps['categories'][str(c_id)] for c_id in app_spec['cats']]
+
+    def contains(self, v, regex):
+        """Strip the version metadata ('\\;...') from regex, then search
+        v, abandoning the search after a 10-second timeout."""
+        if isinstance(v, bytes):
+            try:
+                v = v.decode('utf8')
+            except UnicodeDecodeError:
+                v = v.decode('gbk', 'ignore')
+        pattern = re.compile(regex.split('\\;')[0], re.I)
+        with eventlet.Timeout(10, False):
+            return pattern.search(v)
+        return None
+
+    def contains_dict(self, d1, d2):
+        """Return True if d1 regex-matches every item in d2."""
+        for k2, v2 in d2.items():
+            v1 = d1.get(k2)
+            if v1:
+                if not self.contains(v1, v2):
+                    return False
+            else:
+                return False
+        return True
+
+
+if __name__ == '__main__':
+    opt = optparse.OptionParser(usage="%prog [-u] or %prog [-l]", version="%prog 1.0")
+    opt.add_option('-u', '--url',
+                   dest='_url',
+                   help='Detect the technologies used by a single website',
+                   default=None,
+                   type='string')
+    opt.add_option('-l', '--list',
+                   dest='_list',
+                   help='Detect the technologies used by a group of websites',
+                   default=None,
+                   type='string')
+    opt.add_option('-s', '--save',
+                   dest='_save',
+                   help='Save results to this file',
+                   default=None,
+                   type='string')
+    (options, args) = opt.parse_args()
+
+    sp = Spider()
+    # options._save defaults to None, so it can be passed straight through.
+    if options._url:
+        sp.one_query(options._url, options._save)
+    if options._list:
+        sp.group_query(options._list, options._save)
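For reference, a minimal sketch of driving the scanner from Python rather than the command line. It is an illustration only: `example.com`, `urls.txt` and `results.txt` are placeholder names, and it assumes the setup the script itself expects, namely an `apps.json.py` fingerprint file beside the script and a reachable local MySQL database `Fingerprint` with a `TECH_1` table (columns `name`, `category`, `subcategory`, as implied by the queries in `database()`).

```python
# Hypothetical usage sketch; apps.json.py and the Fingerprint MySQL
# database must exist, and the domain/filenames below are placeholders.
from fingerprint import Spider

spider = Spider()

# Fingerprint one site and append the grouped result to results.txt.
# Equivalent CLI: python fingerprint.py -u example.com -s results.txt
spider.one_query('example.com', 'results.txt')

# Fingerprint every domain listed one per line in urls.txt, printing
# each result; save_tag=None skips writing the output file.
# spider.group_query('urls.txt', None)
```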
