author    马泽宇 <[email protected]>    2019-04-04 14:37:25 +0800
committer 马泽宇 <[email protected]>    2019-04-04 14:37:25 +0800
commit    425137e03fe73d3d65aa94164f1fdfee3d7c530d (patch)
tree      539cf7343ba8cb79cbb39b8bc4cf83ebd97e4688
parent    c2e8089398fbefcca30b2e208d5abc67d55de2e8 (diff)
Upload New File
-rw-r--r--  fingerprint.py  |  279
1 file changed, 279 insertions(+), 0 deletions(-)
diff --git a/fingerprint.py b/fingerprint.py
new file mode 100644
index 0000000..49b01e5
--- /dev/null
+++ b/fingerprint.py
@@ -0,0 +1,279 @@
+# -*- coding: utf-8 -*-
+
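+# Fingerprint the technologies behind a website by combining three sources:
+# the builtwith.com and similartech.com listings (scraped over HTTP) and a
+# local Wappalyzer-style apps.json ruleset, then map each detected
+# technology to a category through a local MySQL table.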
+import optparse
+import urllib.request
+import ssl
+import re
+import os
+import json
+import pymysql
+import eventlet
+from lxml import etree
+
+class Spider():
+
+    def __init__(self):
+        # builtwith, similartech and wappalyzer results
+        self.b_tech = []
+        self.s_tech = []
+        self.wapp_tech = {}
+        self.sum_tech = []
+
+        # final result
+        self.output = {}
+
+        # url list
+        self.urls = []
+
+        # load apps.json.py
+        self.apps = self.load_apps()
+
+    def part_init(self):
+        # reset per-site state between queries
+        self.b_tech = []
+        self.s_tech = []
+        self.wapp_tech = {}
+        self.sum_tech = []
+        self.output = {}
+
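+    # tag convention used throughout: 1 = parse a builtwith.com page,
+    # 0 = parse a similartech.com page, 2 = run the local wappalyzer-style
+    # rules against the raw response.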
+    def req(self, url, tag):
+        """Download a page and dispatch it to the matching parser."""
+        header = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
+        }
+        context = ssl._create_unverified_context()
+        request = urllib.request.Request(url, headers=header)
+        try:
+            response = urllib.request.urlopen(request, timeout=30, context=context)
+            html = response.read()
+            headers = response.headers
+        except Exception:
+            # network errors are ignored; the site simply yields no results
+            return
+        if tag == 2:
+            self.wapp(url, headers, html)
+        else:
+            self.extract(html, tag)
+
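+    # NOTE: the xpaths below are tied to the markup of builtwith.com and
+    # similartech.com as of this commit (April 2019) and will need updating
+    # whenever those sites change their page layout.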
+    def extract(self, html, tag):
+        """Extract technology names from a builtwith/similartech result page."""
+        html = etree.HTML(html)
+        if tag:
+            self.b_tech = [t.strip() for t in html.xpath(
+                '//div[@class="row mb-2 mt-2"]/div[@class="col-12"]/h2/a[@class="text-dark"]/text()')]
+        else:
+            self.s_tech = [t.strip() for t in html.xpath(
+                '//div[@class="col-md-9"]/div[@class="item-title"]/a[@class="tech-name"]/text()')]
+
+    def one_query(self, url, save_tag):
+        self.req("https://builtwith.com/" + url, 1)
+        self.req("https://www.similartech.com/websites/" + url, 0)
+        self.req("http://" + url, 2)
+        self.database(url, save_tag)
+
+    def group_query(self, filename, save_tag):
+        self.loadlist(filename)
+        for url in self.urls:
+            self.one_query(url, save_tag)
+
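+    # merge the builtwith, similartech and wappalyzer hits into a single
+    # list, de-duplicated while preserving first-seen order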
+    def sum(self):
+        self.output['othertech'] = []
+        sum_tech = self.b_tech + self.s_tech
+        for techs in self.wapp_tech.values():
+            sum_tech.extend(techs)
+        self.sum_tech = sorted(set(sum_tech), key=sum_tech.index)
+
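+    # assumes a local MySQL table Fingerprint.TECH_1 with (at least) the
+    # columns name, category and subcategory; the schema is inferred from
+    # the queries below and is not shipped with this file, e.g.:
+    #   CREATE TABLE TECH_1 (name VARCHAR(128) PRIMARY KEY,
+    #                        category VARCHAR(64), subcategory VARCHAR(64));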
+    def database(self, url, save_tag):
+        """Map each detected technology to its category via the TECH_1 table."""
+        self.sum()
+        config = {
+            "host": "localhost",
+            "user": "root",
+            "password": "xiaoshitou",
+            "database": "Fingerprint"
+        }
+        try:
+            db = pymysql.connect(**config)
+        except Exception:
+            print("connection failed")
+            return
+        cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
+        sql = "SELECT * FROM TECH_1 WHERE name=%s"
+        for tech in self.sum_tech:
+            cursor.execute(sql, (tech,))
+            techs = cursor.fetchone()
+            if techs:
+                # prefer the finer-grained subcategory when one is recorded
+                key = techs['subcategory'] or techs['category']
+                if key not in self.output:
+                    self.output[key] = [techs['name']]
+                else:
+                    self.output[key].append(techs['name'])
+            else:
+                self.output['othertech'].append(tech)
+        print(url, ':', self.output)
+        self.save_result(url, save_tag)
+        cursor.close()
+        db.close()
+
+    def loadlist(self, filename):
+        with open(filename, 'r') as f:
+            # one url per line; skip blank lines
+            self.urls = [line.strip() for line in f if line.strip()]
+
+    def save_result(self, url, save_tag):
+        if save_tag:
+            with open(save_tag, 'a') as f:
+                f.write(url + '\n')
+                f.write(str(self.output) + '\n')
+        # reset state so the next url starts from a clean slate
+        self.part_init()
+
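+    # apps.json.py follows the wappalyzer data format: {"apps": {name: spec},
+    # "categories": {id: name}}, where each spec may carry url / headers /
+    # html / script / meta regex rules plus "cats" ids and optional "implies"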
+    def load_apps(self, filename='apps.json.py'):
+        # resolve relative to this script, not the current working directory
+        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
+        with open(filename) as f:
+            return json.load(f)
+
+    def wapp(self, url, headers, html):
+        """Wappalyzer-style detection over url, headers, html/script and meta rules."""
+        for app_name, app_spec in self.apps['apps'].items():
+            if 'url' in app_spec:
+                if self.contains(url, app_spec['url']):
+                    self.add_app(app_name, app_spec)
+
+        if headers:
+            for app_name, app_spec in self.apps['apps'].items():
+                if 'headers' in app_spec:
+                    if self.contains_dict(headers, app_spec['headers']):
+                        self.add_app(app_name, app_spec)
+
+        if html:
+            for app_name, app_spec in self.apps['apps'].items():
+                for key in 'html', 'script':
+                    snippets = app_spec.get(key, [])
+                    if not isinstance(snippets, list):
+                        snippets = [snippets]
+                    for snippet in snippets:
+                        if self.contains(html, snippet):
+                            self.add_app(app_name, app_spec)
+                            break
+
+            if isinstance(html, bytes):
+                try:
+                    html = html.decode('utf8', 'ignore')
+                except UnicodeDecodeError:
+                    html = html.decode('gbk', 'ignore')
+            metas = dict(re.compile(
+                '<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>',
+                re.IGNORECASE).findall(html))
+            for app_name, app_spec in self.apps['apps'].items():
+                for name, content in app_spec.get('meta', {}).items():
+                    if name in metas:
+                        if self.contains(metas[name], content):
+                            self.add_app(app_name, app_spec)
+                            break
+
+    def add_app(self, app_name, app_spec):
+        """Record this app under each of its categories, then follow implied apps."""
+        for category in self.get_categories(app_spec):
+            if category not in self.wapp_tech:
+                self.wapp_tech[category] = []
+            if app_name not in self.wapp_tech[category]:
+                self.wapp_tech[category].append(app_name)
+        implies = app_spec.get('implies', [])
+        if not isinstance(implies, list):
+            implies = [implies]
+        for implied in implies:
+            # drop any "\;" metadata suffix to get the bare app name
+            implied = implied.split('\\;')[0]
+            self.add_app(implied, self.apps['apps'][implied])
+
+
+    def get_categories(self, app_spec):
+        """Return category names for this app_spec."""
+        return [self.apps['categories'][str(c_id)] for c_id in app_spec['cats']]
+
+
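+    # wappalyzer rule strings may append metadata after "\;" (version
+    # captures, confidence scores); only the part before the first "\;"
+    # is the actual regex.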
+    def contains(self, v, regex):
+        """Strip metadata from regex, then search v with a 10-second cap."""
+        if isinstance(v, bytes):
+            try:
+                v = v.decode('utf8', 'ignore')
+            except UnicodeDecodeError:
+                v = v.decode('gbk', 'ignore')
+        pattern = re.compile(regex.split('\\;')[0], re.I)
+        # some rules backtrack badly on large pages; give up after 10s
+        eventlet.monkey_patch()
+        with eventlet.Timeout(10, False):
+            return pattern.search(v)
+        return None
+
+
+    def contains_dict(self, d1, d2):
+        """Return True if d1 matches every header rule in d2."""
+        for k2, v2 in d2.items():
+            v1 = d1.get(k2)
+            if v1:
+                if not self.contains(v1, v2):
+                    return False
+            else:
+                return False
+        return True
+
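+# Example usage (urls.txt is a hypothetical file with one domain per line):
+#   python fingerprint.py -u example.com
+#   python fingerprint.py -l urls.txt -s result.txt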
+if __name__ == '__main__':
+    opt = optparse.OptionParser(usage="%prog [-u] or %prog [-l]", version="%prog 1.0")
+    opt.add_option('-u', '--url',
+                   dest='_url',
+                   help='Detect the technologies used by a single website',
+                   default=None,
+                   type='string')
+    opt.add_option('-l', '--list',
+                   dest='_list',
+                   help='Detect the technologies used by a group of websites',
+                   default=None,
+                   type='string')
+    opt.add_option('-s', '--save',
+                   dest='_save',
+                   help='Append each result to the given file',
+                   default=None,
+                   type='string')
+    (options, args) = opt.parse_args()
+
+    sp = Spider()
+    # _save defaults to None, so passing it straight through covers both the
+    # save and no-save cases that were previously duplicated branches
+    if options._url:
+        sp.one_query(options._url, options._save)
+    if options._list:
+        sp.group_query(options._list, options._save)