| author | lyf <[email protected]> | 2023-03-17 16:01:04 +0800 |
|---|---|---|
| committer | lyf <[email protected]> | 2023-03-17 16:01:04 +0800 |
| commit | 202f58dc4e3800328a22bfee400c3c6322798998 (patch) | |
| tree | f1bbdf279ff67c48aa2a1ca93ae572f0a76e84c0 /getWebInfo.py | |
Diffstat (limited to 'getWebInfo.py')
| -rw-r--r-- | getWebInfo.py | 176 |
1 file changed, 176 insertions, 0 deletions
diff --git a/getWebInfo.py b/getWebInfo.py
new file mode 100644
index 0000000..eef05a6
--- /dev/null
+++ b/getWebInfo.py
@@ -0,0 +1,176 @@
+import time
+import requests
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+from multiprocessing import Process
+
+
+def get_url_info(url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
+    try:
+        start = time.perf_counter()
+        r = requests.get(url, headers=headers, verify=False, timeout=10)
+        # print(r.status_code)
+        status = r.status_code
+        end = time.perf_counter()
+        delay = end - start  # page load time
+        html = r.text
+        # html = ''
+    except Exception as e:
+        status = 404
+        html = ''
+        delay = float(300)
+    # print(gov)
+
+    return status, html, delay
+
+
+def get_url_info_dict(url):
+    status, html, delay = get_url_info(url)
+    url_info = {}
+    url_info['status_code'] = status
+    url_info['load_time'] = delay
+    url_info['pageSource'] = html
+    return url_info
+
+
+def get_page_info(sub_url, current_url, homepage_domain, next_links):
+    url = sub_url  # the next-level link (an <a> tag)
+    href = url.get("href")
+    print(href)
+    if str(href).endswith('.exe') or str(href).endswith('.pdf') or str(href).endswith('.xls'):
+        pass
+    else:
+        if str(href).startswith('http'):
+            # print(url.get_attribute("href"))
+            domain = urlparse(href).netloc.split(':', 1)[0]
+            if domain.endswith(homepage_domain):  # a subdomain of (or equal to) the homepage domain
+                if domain not in next_links['domain']:
+                    next_links['domain'].append(domain)
+                if str(href) not in next_links['urls'].keys():
+                    next_links['urls'][str(href)] = {}  # info for this link: status code, delay, html source
+                    next_links['urls'][str(href)] = get_url_info_dict(str(href))
+            if domain not in next_links['AllDomain']:  # record every domain that appears
+                next_links['AllDomain'].append(domain)
+        elif str(href).startswith('/'):  # a relative link, so it stays on the current domain
+            domain = urlparse(current_url).netloc  # current domain: the homepage at level 2, the second-level url at level 3
+            total_href = 'http://' + domain + str(href)  # the full url
+            if str(total_href) not in next_links['urls'].keys():
+                next_links['urls'][str(total_href)] = {}  # info for this link: status code, delay, html source
+                next_links['urls'][str(total_href)] = get_url_info_dict(str(total_href))
+            if domain not in next_links['AllDomain']:
+                next_links['AllDomain'].append(domain)
+            # print(href, domain, gov)
+        else:
+            pass
+
+    return next_links
+
+
+def get_third_link(homepage_domain, second_links):
+    third_links = {}
+    third_links['domain'] = []  # domains of third-level links: same as the homepage or one of its subdomains
+    third_links['AllDomain'] = []  # all domains seen among third-level links
+    third_links['urls'] = {}
+    for second_url in second_links['urls'].keys():
+        # the hrefs in second_links['urls'][<url>]['pageSource'] are the third-level links; visit each and record its url_info
+        page = second_links['urls'][second_url]['pageSource']
+        pageSource = BeautifulSoup(page, 'html.parser')
+        # print(pageSource)
+        urls = pageSource.find_all('a', attrs={'href': True})
+        for url in urls:  # every third-level link inside this second-level page
+            third_links = get_page_info(url, second_url, homepage_domain, third_links)
+    # website_info['third_links'] = third_links
+    return third_links
+
+
+def crawl_website(homepage):
+    # homepage = 'http://www.shandong.gov.cn/art/2022/3/14/art_116651_527850.html'
+    website_info = {}
+    website_info['homepage'] = homepage
+    status, htm, delay = get_url_info(homepage)
+    website_info['status_code'] = status
+    print(status)
+    if int(status) < 400:  # only crawl the next level when the response code is below 400
+        pageSource = BeautifulSoup(htm, 'html.parser')
+        urls = pageSource.find_all('a', attrs={'href': True})
+        # print(urls)
+        second_links = {}
+        second_links['domain'] = []  # domains of second-level links: same as the homepage or one of its subdomains
+        second_links['AllDomain'] = []  # all domains seen among second-level links
+        second_links['urls'] = {}  # second-level links and their info
+        homepage_domain = urlparse(homepage).netloc.replace('www.', '')  # homepage domain; a second-level link whose domain ends with it is on a subdomain or the same domain
+        # filter the next-level links out of the hrefs and record their info
+        for url in urls:
+            second_links = get_page_info(url, homepage, homepage_domain, second_links)
+        # website_info['load_time'] = delay
+        # website_info['pageSource'] = htm
+        website_info['second_links'] = second_links
+        website_info['third_links'] = get_third_link(homepage_domain, second_links)
+    else:
+        # website_info['load_time'] = float(300)
+        # website_info['pageSource'] = ''
+        website_info['second_links'] = {}
+        website_info['third_links'] = {}
+    '''
+    for url in website_info['second_links']['urls'].keys():
+        if 'pageSource' in website_info['second_links']['urls'][url].keys():
+            website_info['second_links']['urls'][url]['pageSource'] = ''
+
+    for url in website_info['third_links']['urls'].keys():
+        if 'pageSource' in website_info['third_links']['urls'][url].keys():
+            website_info['third_links']['urls'][url]['pageSource'] = ''
+    '''
+    import json
+    name = './result/' + str(urlparse(homepage).netloc) + '.json'  # output path: a json file named after the site's domain
+    with open(name, 'w', encoding='utf-8') as f:
+        json.dump(website_info, f, indent=1)
+
+
+def get_webs():
+    # list of sites to check
+    webs = []
+    f = open('./data/websites.txt', 'r')  # website list file
+    # file format: one homepage url or domain per line
+    for line in f:
+        line = line.strip('\n')
+        if line.startswith('http'):
+            pass
+        else:
+            line = 'http://' + line
+        webs.append(line)
+    f.close()
+    return webs
+
+
+def crawl_websites(start, end, webs):
+    for i in range(start, end):
+        web = webs[i]
+        crawl_website(web)
+
+
+if __name__ == '__main__':
+    # crawl a single site directly; the commented-out block uses multiple processes to crawl many sites
+    crawl_website('http://fgw.shandong.gov.cn')
+    '''
+    webs = get_webs()  # get the list of sites to check; change the input file as needed
+    p1 = Process(target=crawl_websites, args=(0, 10, webs))
+    p2 = Process(target=crawl_websites, args=(10, 20, webs))
+    p3 = Process(target=crawl_websites, args=(20, 30, webs))
+    p4 = Process(target=crawl_websites, args=(30, 40, webs))
+    p5 = Process(target=crawl_websites, args=(40, len(webs), webs))
+
+    p1.start()
+    p2.start()
+    p3.start()
+    p4.start()
+    p5.start()
+
+
+    p1.join()
+    p2.join()
+    p3.join()
+    p4.join()
+    p5.join()
+    '''
\ No newline at end of file
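
crawl_website() serializes everything it collects into ./result/<domain>.json, keyed by 'homepage', 'status_code', 'second_links', and 'third_links', where each entry under ...['urls'] holds 'status_code', 'load_time', and 'pageSource'. Below is a minimal sketch of reading that file back, assuming the script above has already been run for http://fgw.shandong.gov.cn and the ./result directory exists; the variable names here are illustrative only and are not part of the committed code.

```python
import json
from urllib.parse import urlparse

homepage = 'http://fgw.shandong.gov.cn'  # the site crawled in __main__ above
path = './result/' + urlparse(homepage).netloc + '.json'  # matches the naming used in crawl_website()

with open(path, 'r', encoding='utf-8') as f:
    info = json.load(f)

print('homepage:', info['homepage'], 'status:', info['status_code'])

# second_links['urls'] maps each second-level link to {'status_code', 'load_time', 'pageSource'}
second = info.get('second_links', {}).get('urls', {})
broken = [u for u, d in second.items() if d['status_code'] >= 400]
print('second-level links:', len(second), 'broken:', len(broken))
if second:
    avg = sum(d['load_time'] for d in second.values()) / len(second)
    print('average load time: %.2f s' % avg)
```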
