| author | 马泽宇 <[email protected]> | 2019-04-04 14:37:25 +0800 |
|---|---|---|
| committer | 马泽宇 <[email protected]> | 2019-04-04 14:37:25 +0800 |
| commit | 425137e03fe73d3d65aa94164f1fdfee3d7c530d | |
| tree | 539cf7343ba8cb79cbb39b8bc4cf83ebd97e4688 | |
| parent | c2e8089398fbefcca30b2e208d5abc67d55de2e8 | |
Upload New File
| -rw-r--r-- | fingerprint.py | 279 |
1 file changed, 279 insertions(+), 0 deletions(-)
diff --git a/fingerprint.py b/fingerprint.py
new file mode 100644
index 0000000..49b01e5
--- /dev/null
+++ b/fingerprint.py
@@ -0,0 +1,279 @@
+# -*- coding: UTF-8 -*-
+
+import optparse
+import urllib.request
+import ssl
+import re
+import os
+import pymysql
+import json
+import eventlet
+from lxml import etree
+
+# eventlet backs the regex timeout in contains(); patch once at import
+# time instead of on every call.
+eventlet.monkey_patch()
+
+
+class Spider():
+
+    def __init__(self):
+        # builtwith, similartech and wappalyzer results
+        self.b_tech = []
+        self.s_tech = []
+        self.wapp_tech = {}
+        self.sum_tech = []
+        # final result, grouped by category
+        self.output = {}
+        # url list for batch mode
+        self.urls = []
+        # load the Wappalyzer-style fingerprint database
+        self.apps = self.load_apps()
+
+    def part_init(self):
+        """Reset per-site state between queries."""
+        self.b_tech = []
+        self.s_tech = []
+        self.wapp_tech = {}
+        self.sum_tech = []
+        self.output = {}
+
+    def req(self, url, tag):
+        """Download url and dispatch the response by tag:
+        1 = builtwith.com, 0 = similartech.com, 2 = the target itself."""
+        header = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
+        }
+        context = ssl._create_unverified_context()
+        request = urllib.request.Request(url, headers=header)
+        try:
+            response = urllib.request.urlopen(request, timeout=30, context=context)
+            html = response.read()
+            headers = response.headers
+        except Exception:
+            return
+        if tag == 2:
+            self.wapp(url, headers, html)
+        else:
+            self.extract(html, tag)
+
+    def extract(self, html, tag):
+        """Extract technology names from a builtwith or similartech page."""
+        html = etree.HTML(html)
+        if tag:
+            self.b_tech = html.xpath('//div[@class="row mb-2 mt-2"]/div[@class="col-12"]/h2/a[@class="text-dark"]/text()')
+            self.b_tech = [t.strip(' ') for t in self.b_tech]
+        else:
+            self.s_tech = html.xpath('//div[@class="col-md-9"]/div[@class="item-title"]/a[@class="tech-name"]/text()')
+            self.s_tech = [t.strip(' ') for t in self.s_tech]
+
+    def one_query(self, url, save_tag):
+        self.req("https://builtwith.com/" + url, 1)
+        self.req("https://www.similartech.com/websites/" + url, 0)
+        self.req("http://" + url, 2)
+        self.database(url, save_tag)
+
+    def group_query(self, filename, save_tag):
+        self.loadlist(filename)
+        for url in self.urls:
+            self.one_query(url, save_tag)
+
+    def sum(self):
+        """Merge the three result sets, deduplicated, order preserved."""
+        self.output['othertech'] = []
+        sum_tech = self.b_tech + self.s_tech
+        for key in self.wapp_tech:
+            for tech in self.wapp_tech[key]:
+                sum_tech.append(tech)
+        self.sum_tech = sorted(set(sum_tech), key=sum_tech.index)
+
+    def database(self, url, save_tag):
+        """Look each detected technology up in MySQL and group results
+        by subcategory, falling back to category."""
+        self.sum()
+        config = {
+            "host": "localhost",
+            "user": "root",
+            "password": "xiaoshitou",
+            "database": "Fingerprint"
+        }
+        try:
+            db = pymysql.connect(**config)
+        except pymysql.Error:
+            print("connection failed")
+            return
+        cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
+        sql = "SELECT * FROM TECH_1 WHERE name=%s"
+        for tech in self.sum_tech:
+            cursor.execute(sql, (tech,))
+            techs = cursor.fetchone()
+            if techs:
+                key = techs['subcategory'] or techs['category']
+                if key not in self.output:
+                    self.output[key] = [techs['name']]
+                else:
+                    self.output[key].append(techs['name'])
+            else:
+                self.output['othertech'].append(tech)
+        print(url, ':', self.output)
+        self.save_result(url, save_tag)
+        cursor.close()
+        db.close()
+
+    def loadlist(self, filename):
+        with open(filename, 'r') as f:
+            self.urls = [u.strip() for u in f.read().split('\n') if u.strip()]
+
+    def save_result(self, url, save_tag):
+        if save_tag:
+            with open(save_tag, 'a') as f:
+                f.write(url + '\n')
+                f.write(str(self.output) + '\n')
+        self.part_init()
+
+    def load_apps(self, filename='apps.json.py'):
+        filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
+        with open(filename) as f:
+            return json.load(f)
+
+    def wapp(self, url, headers, html):
+        """Wappalyzer-style detection: match the fingerprint database
+        against the URL, the response headers and the HTML body."""
+        for app_name, app_spec in self.apps['apps'].items():
+            if 'url' in app_spec:
+                if self.contains(url, app_spec['url']):
+                    self.add_app(app_name, app_spec)
+
+        if headers:
+            for app_name, app_spec in self.apps['apps'].items():
+                if 'headers' in app_spec:
+                    if self.contains_dict(headers, app_spec['headers']):
+                        self.add_app(app_name, app_spec)
+
+        if html:
+            for app_name, app_spec in self.apps['apps'].items():
+                for key in 'html', 'script':
+                    snippets = app_spec.get(key, [])
+                    if not isinstance(snippets, list):
+                        snippets = [snippets]
+                    for snippet in snippets:
+                        if self.contains(html, snippet):
+                            self.add_app(app_name, app_spec)
+                            break
+
+            if isinstance(html, bytes):
+                try:
+                    html = html.decode('utf8')
+                except UnicodeDecodeError:
+                    html = html.decode('gbk', 'ignore')
+            metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE).findall(html))
+            for app_name, app_spec in self.apps['apps'].items():
+                for name, content in app_spec.get('meta', {}).items():
+                    if name in metas:
+                        if self.contains(metas[name], content):
+                            self.add_app(app_name, app_spec)
+                            break
+
+    def add_app(self, app_name, app_spec):
+        """Record this app under each of its categories, then recurse
+        into any technologies it implies."""
+        for category in self.get_categories(app_spec):
+            if category not in self.wapp_tech:
+                self.wapp_tech[category] = []
+            if app_name not in self.wapp_tech[category]:
+                self.wapp_tech[category].append(app_name)
+                implies = app_spec.get('implies', [])
+                if not isinstance(implies, list):
+                    implies = [implies]
+                for implied in implies:
+                    implied = implied.split('\\;')[0]
+                    if implied in self.apps['apps']:
+                        self.add_app(implied, self.apps['apps'][implied])
+
+    def get_categories(self, app_spec):
+        """Return category names for this app_spec."""
+        return [self.apps['categories'][str(c_id)] for c_id in app_spec['cats']]
+
+    def contains(self, v, regex):
+        """Strip the version metadata ('\\;...') from regex, then search
+        v, abandoning the search after a 10-second timeout."""
+        if isinstance(v, bytes):
+            try:
+                v = v.decode('utf8')
+            except UnicodeDecodeError:
+                v = v.decode('gbk', 'ignore')
+        pattern = re.compile(regex.split('\\;')[0], re.I)
+        with eventlet.Timeout(10, False):
+            return pattern.search(v)
+        return None
+
+    def contains_dict(self, d1, d2):
+        """Return True if d1 regex-matches every item in d2."""
+        for k2, v2 in d2.items():
+            v1 = d1.get(k2)
+            if v1:
+                if not self.contains(v1, v2):
+                    return False
+            else:
+                return False
+        return True
+
+
+if __name__ == '__main__':
+    opt = optparse.OptionParser(usage="%prog [-u] or %prog [-l]", version="%prog 1.0")
+    opt.add_option('-u', '--url',
+                   dest='_url',
+                   help='Detect the technologies used by a single website',
+                   default=None,
+                   type='string')
+    opt.add_option('-l', '--list',
+                   dest='_list',
+                   help='Detect the technologies used by a group of websites',
+                   default=None,
+                   type='string')
+    opt.add_option('-s', '--save',
+                   dest='_save',
+                   help='Save results to this file',
+                   default=None,
+                   type='string')
+    (options, args) = opt.parse_args()
+
+    sp = Spider()
+    # options._save defaults to None, so it can be passed straight through.
+    if options._url:
+        sp.one_query(options._url, options._save)
+    if options._list:
+        sp.group_query(options._list, options._save)
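For reference, a minimal sketch of driving the scanner from Python rather than the command line. It is an illustration only: `example.com`, `urls.txt` and `results.txt` are placeholder names, and it assumes the setup the script itself expects, namely an `apps.json.py` fingerprint file beside the script and a reachable local MySQL database `Fingerprint` with a `TECH_1` table (columns `name`, `category`, `subcategory`, as implied by the queries in `database()`).

```python
# Hypothetical usage sketch; apps.json.py and the Fingerprint MySQL
# database must exist, and the domain/filenames below are placeholders.
from fingerprint import Spider

spider = Spider()

# Fingerprint one site and append the grouped result to results.txt.
# Equivalent CLI: python fingerprint.py -u example.com -s results.txt
spider.one_query('example.com', 'results.txt')

# Fingerprint every domain listed one per line in urls.txt, printing
# each result; save_tag=None skips writing the output file.
# spider.group_query('urls.txt', None)
```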
