Diffstat (limited to 'getWebInfo.py')
-rw-r--r--  getWebInfo.py  176
1 file changed, 176 insertions(+), 0 deletions(-)
diff --git a/getWebInfo.py b/getWebInfo.py
new file mode 100644
index 0000000..eef05a6
--- /dev/null
+++ b/getWebInfo.py
@@ -0,0 +1,176 @@
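+"""Crawl a website: fetch its homepage, follow the second- and third-level links
+that stay on the same domain (or a subdomain of it), record each link's status
+code, load time and HTML source, and dump the result to ./result/<domain>.json."""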
+import json
+import time
+from multiprocessing import Process
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
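+# Optional: requests.get() is called with verify=False below, which makes urllib3
+# emit an InsecureRequestWarning for every request. If that is unwanted, it can
+# be silenced with:
+#     import urllib3
+#     urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)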
+
+
+def get_url_info(url):
+    # Fetch a URL and return (status_code, html, load_time_in_seconds).
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
+    try:
+        start = time.perf_counter()
+        r = requests.get(url, headers=headers, verify=False, timeout=10)
+        status = r.status_code
+        end = time.perf_counter()
+        delay = end - start  # page load time
+        html = r.text
+    except Exception:
+        # Any failure (timeout, DNS error, connection reset, ...) is recorded as
+        # a 404 with an empty page source and a sentinel load time of 300 s.
+        status = 404
+        html = ''
+        delay = 300.0
+
+    return status, html, delay
+
+
+def get_url_info_dict(url):
+    status, html, delay = get_url_info(url)
+    url_info = {
+        'status_code': status,
+        'load_time': delay,
+        'pageSource': html,
+    }
+    return url_info
+
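+# For example, get_url_info_dict('http://fgw.shandong.gov.cn') returns a dict of
+# the form {'status_code': 200, 'load_time': 0.8, 'pageSource': '<html>...'}
+# (values illustrative).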
+
+def get_page_info(sub_url, current_url, homepage_domain, next_links):
+    # Record one next-level link found on current_url into next_links.
+    href = sub_url.get("href")  # next-level link
+    print(href)
+    if str(href).endswith(('.exe', '.pdf', '.xls')):
+        pass  # skip file downloads
+    elif str(href).startswith('http'):
+        domain = urlparse(href).netloc.split(':', 1)[0]
+        if domain.endswith(homepage_domain):  # same site or a subdomain
+            if domain not in next_links['domain']:
+                next_links['domain'].append(domain)
+            if str(href) not in next_links['urls']:
+                # record the link's info: status code, latency and HTML source
+                next_links['urls'][str(href)] = get_url_info_dict(str(href))
+        if domain not in next_links['AllDomain']:  # record every domain seen
+            next_links['AllDomain'].append(domain)
+    elif str(href).startswith('/'):  # relative path, so it stays on the current site
+        domain = urlparse(current_url).netloc  # current domain (homepage or second-level URL)
+        total_href = 'http://' + domain + str(href)
+        if str(total_href) not in next_links['urls']:
+            # record the link's info: status code, latency and HTML source
+            next_links['urls'][str(total_href)] = get_url_info_dict(str(total_href))
+        if domain not in next_links['AllDomain']:
+            next_links['AllDomain'].append(domain)
+
+    return next_links
+
+
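+# Shape of the second_links/third_links dicts built via get_page_info():
+#   {'domain':    [domains matching the homepage domain],
+#    'AllDomain': [every domain encountered],
+#    'urls':      {url: {'status_code': ..., 'load_time': ..., 'pageSource': ...}}}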
+def get_third_link(homepage_domain, second_links):
+    third_links = {}
+    third_links['domain'] = []  # domains of third-level links matching the homepage or its subdomains
+    third_links['AllDomain'] = []  # every domain seen among third-level links
+    third_links['urls'] = {}
+    for second_url in second_links['urls'].keys():
+        # The hrefs extracted from second_links['urls'][url]['pageSource'] are the
+        # third-level links; visit each one and record its url_info.
+        page = second_links['urls'][second_url]['pageSource']
+        pageSource = BeautifulSoup(page, 'html.parser')
+        urls = pageSource.find_all('a', attrs={'href': True})
+        for url in urls:  # every third-level link inside this second-level page
+            third_links = get_page_info(url, second_url, homepage_domain, third_links)
+    return third_links
+
+
+def crawl_website(homepage):
+    # homepage = 'http://www.shandong.gov.cn/art/2022/3/14/art_116651_527850.html'
+    website_info = {}
+    website_info['homepage'] = homepage
+    status, htm, delay = get_url_info(homepage)
+    website_info['status_code'] = status
+    print(status)
+    if int(status) < 400:  # only go one level deeper when the status code is below 400
+        pageSource = BeautifulSoup(htm, 'html.parser')
+        urls = pageSource.find_all('a', attrs={'href': True})
+        second_links = {}
+        second_links['domain'] = []  # domains of second-level links matching the homepage or its subdomains
+        second_links['AllDomain'] = []  # every domain seen among second-level links
+        second_links['urls'] = {}  # second-level links and their info
+        # Homepage domain: a second-level link whose domain endswith() this value
+        # is either the same site or one of its subdomains.
+        homepage_domain = urlparse(homepage).netloc.replace('www.', '')
+        # Pick the next-level links out of the hrefs and record their info.
+        for url in urls:
+            second_links = get_page_info(url, homepage, homepage_domain, second_links)
+        # website_info['load_time'] = delay
+        # website_info['pageSource'] = htm
+        website_info['second_links'] = second_links
+        website_info['third_links'] = get_third_link(homepage_domain, second_links)
+    else:
+        # website_info['load_time'] = float(300)
+        # website_info['pageSource'] = ''
+        website_info['second_links'] = {}
+        website_info['third_links'] = {}
+    # (commented out) drop the stored page sources before saving:
+    '''
+    for url in website_info['second_links']['urls'].keys():
+        if 'pageSource' in website_info['second_links']['urls'][url].keys():
+            website_info['second_links']['urls'][url]['pageSource'] = ''
+
+    for url in website_info['third_links']['urls'].keys():
+        if 'pageSource' in website_info['third_links']['urls'][url].keys():
+            website_info['third_links']['urls'][url]['pageSource'] = ''
+    '''
+    # Output path: a JSON file named after the site's domain.
+    name = './result/' + str(urlparse(homepage).netloc) + '.json'
+    with open(name, 'w', encoding='utf-8') as f:
+        json.dump(website_info, f, indent=1)
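+# The dump above writes e.g. ./result/fgw.shandong.gov.cn.json; the ./result/
+# directory must already exist, since open() will not create it.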
+
+
+def get_webs():
+    # Read the list of sites to check: one homepage URL or domain per line.
+    webs = []
+    with open('./data/websites.txt', 'r') as f:
+        for line in f:
+            line = line.strip('\n')
+            if not line.startswith('http'):
+                line = 'http://' + line
+            webs.append(line)
+    return webs
+
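+# websites.txt format, one entry per line, e.g.:
+#     fgw.shandong.gov.cn
+#     http://www.shandong.gov.cn
+# Entries without a scheme are prefixed with 'http://' by get_webs().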
+
+def crawl_websites(start, end, webs):
+    for i in range(start, end):
+        web = webs[i]
+        crawl_website(web)
+
+
+if __name__ == '__main__':
+    # Crawl a single site directly; the commented-out block below crawls a list
+    # of sites with multiple processes instead.
+    crawl_website('http://fgw.shandong.gov.cn')
+    '''
+    webs = get_webs()  # list of sites to check; change the input folder as needed
+    p1 = Process(target=crawl_websites, args=(0, 10, webs))
+    p2 = Process(target=crawl_websites, args=(10, 20, webs))
+    p3 = Process(target=crawl_websites, args=(20, 30, webs))
+    p4 = Process(target=crawl_websites, args=(30, 40, webs))
+    p5 = Process(target=crawl_websites, args=(40, len(webs), webs))
+
+    p1.start()
+    p2.start()
+    p3.start()
+    p4.start()
+    p5.start()
+
+    p1.join()
+    p2.join()
+    p3.join()
+    p4.join()
+    p5.join()
+    '''
\ No newline at end of file