| author | lyf <[email protected]> | 2023-03-17 16:01:04 +0800 |
|---|---|---|
| committer | lyf <[email protected]> | 2023-03-17 16:01:04 +0800 |
| commit | 202f58dc4e3800328a22bfee400c3c6322798998 (patch) | |
| tree | f1bbdf279ff67c48aa2a1ca93ae572f0a76e84c0 /getWebInfo.py | |
Diffstat (limited to 'getWebInfo.py')
| -rw-r--r-- | getWebInfo.py | 176 |
1 file changed, 176 insertions, 0 deletions
diff --git a/getWebInfo.py b/getWebInfo.py
new file mode 100644
index 0000000..eef05a6
--- /dev/null
+++ b/getWebInfo.py
@@ -0,0 +1,176 @@
+import time
+import requests
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+from multiprocessing import Process
+
+
+def get_url_info(url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
+    try:
+        start = time.perf_counter()
+        r = requests.get(url, headers=headers, verify=False, timeout=10)
+        # print(r.status_code)
+        status = r.status_code
+        end = time.perf_counter()
+        delay = end - start  # page load time
+        html = r.text
+        # html = ''
+    except Exception as e:
+        status = 404
+        html = ''
+        delay = float(300)
+    # print(gov)
+
+    return status, html, delay
+
+
+def get_url_info_dict(url):
+    status, html, delay = get_url_info(url)
+    url_info = {}
+    url_info['status_code'] = status
+    url_info['load_time'] = delay
+    url_info['pageSource'] = html
+    return url_info
+
+
+def get_page_info(sub_url, current_url, homepage_domain, next_links):
+    url = sub_url  # the next-level link (an <a> tag)
+    href = url.get("href")
+    print(href)
+    if str(href).endswith('.exe') or str(href).endswith('.pdf') or str(href).endswith('.xls'):
+        pass
+    else:
+        if str(href).startswith('http'):
+            # print(url.get_attribute("href"))
+            domain = urlparse(href).netloc.split(':', 1)[0]
+            if domain.endswith(homepage_domain):  # a subdomain of (or equal to) the homepage domain
+                if domain not in next_links['domain']:
+                    next_links['domain'].append(domain)
+                if str(href) not in next_links['urls'].keys():
+                    next_links['urls'][str(href)] = {}  # info for this link: status code, delay, html source
+                    next_links['urls'][str(href)] = get_url_info_dict(str(href))
+            if domain not in next_links['AllDomain']:  # record every domain that appears
+                next_links['AllDomain'].append(domain)
+        elif str(href).startswith('/'):  # a relative link, so it stays on the current domain
+            domain = urlparse(current_url).netloc  # current domain: the homepage at level 2, the second-level url at level 3
+            total_href = 'http://' + domain + str(href)  # the full url
+            if str(total_href) not in next_links['urls'].keys():
+                next_links['urls'][str(total_href)] = {}  # info for this link: status code, delay, html source
+                next_links['urls'][str(total_href)] = get_url_info_dict(str(total_href))
+            if domain not in next_links['AllDomain']:
+                next_links['AllDomain'].append(domain)
+            # print(href, domain, gov)
+        else:
+            pass
+
+    return next_links
+
+
+def get_third_link(homepage_domain, second_links):
+    third_links = {}
+    third_links['domain'] = []  # domains of third-level links: same as the homepage or one of its subdomains
+    third_links['AllDomain'] = []  # all domains seen among third-level links
+    third_links['urls'] = {}
+    for second_url in second_links['urls'].keys():
+        # the hrefs in second_links['urls'][<url>]['pageSource'] are the third-level links; visit each and record its url_info
+        page = second_links['urls'][second_url]['pageSource']
+        pageSource = BeautifulSoup(page, 'html.parser')
+        # print(pageSource)
+        urls = pageSource.find_all('a', attrs={'href': True})
+        for url in urls:  # every third-level link inside this second-level page
+            third_links = get_page_info(url, second_url, homepage_domain, third_links)
+    # website_info['third_links'] = third_links
+    return third_links
+
+
+def crawl_website(homepage):
+    # homepage = 'http://www.shandong.gov.cn/art/2022/3/14/art_116651_527850.html'
+    website_info = {}
+    website_info['homepage'] = homepage
+    status, htm, delay = get_url_info(homepage)
+    website_info['status_code'] = status
+    print(status)
+    if int(status) < 400:  # only crawl the next level when the response code is below 400
+        pageSource = BeautifulSoup(htm, 'html.parser')
+        urls = pageSource.find_all('a', attrs={'href': True})
+        # print(urls)
+        second_links = {}
+        second_links['domain'] = []  # domains of second-level links: same as the homepage or one of its subdomains
+        second_links['AllDomain'] = []  # all domains seen among second-level links
+        second_links['urls'] = {}  # second-level links and their info
+        homepage_domain = urlparse(homepage).netloc.replace('www.', '')  # homepage domain; a second-level link whose domain ends with it is on a subdomain or the same domain
+        # filter the next-level links out of the hrefs and record their info
+        for url in urls:
+            second_links = get_page_info(url, homepage, homepage_domain, second_links)
+        # website_info['load_time'] = delay
+        # website_info['pageSource'] = htm
+        website_info['second_links'] = second_links
+        website_info['third_links'] = get_third_link(homepage_domain, second_links)
+    else:
+        # website_info['load_time'] = float(300)
+        # website_info['pageSource'] = ''
+        website_info['second_links'] = {}
+        website_info['third_links'] = {}
+    '''
+    for url in website_info['second_links']['urls'].keys():
+        if 'pageSource' in website_info['second_links']['urls'][url].keys():
+            website_info['second_links']['urls'][url]['pageSource'] = ''
+
+    for url in website_info['third_links']['urls'].keys():
+        if 'pageSource' in website_info['third_links']['urls'][url].keys():
+            website_info['third_links']['urls'][url]['pageSource'] = ''
+    '''
+    import json
+    name = './result/' + str(urlparse(homepage).netloc) + '.json'  # output path: a json file named after the site's domain
+    with open(name, 'w', encoding='utf-8') as f:
+        json.dump(website_info, f, indent=1)
+
+
+def get_webs():
+    # list of sites to check
+    webs = []
+    f = open('./data/websites.txt', 'r')  # website list file
+    # file format: one homepage url or domain per line
+    for line in f:
+        line = line.strip('\n')
+        if line.startswith('http'):
+            pass
+        else:
+            line = 'http://' + line
+        webs.append(line)
+    f.close()
+    return webs
+
+
+def crawl_websites(start, end, webs):
+    for i in range(start, end):
+        web = webs[i]
+        crawl_website(web)
+
+
+if __name__ == '__main__':
+    # crawl a single site directly; the commented-out block uses multiple processes to crawl many sites
+    crawl_website('http://fgw.shandong.gov.cn')
+    '''
+    webs = get_webs()  # get the list of sites to check; change the input file as needed
+    p1 = Process(target=crawl_websites, args=(0, 10, webs))
+    p2 = Process(target=crawl_websites, args=(10, 20, webs))
+    p3 = Process(target=crawl_websites, args=(20, 30, webs))
+    p4 = Process(target=crawl_websites, args=(30, 40, webs))
+    p5 = Process(target=crawl_websites, args=(40, len(webs), webs))
+
+    p1.start()
+    p2.start()
+    p3.start()
+    p4.start()
+    p5.start()
+
+
+    p1.join()
+    p2.join()
+    p3.join()
+    p4.join()
+    p5.join()
+    '''
\ No newline at end of file
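
crawl_website() serializes everything it collects into ./result/<domain>.json, keyed by 'homepage', 'status_code', 'second_links', and 'third_links', where each entry under ...['urls'] holds 'status_code', 'load_time', and 'pageSource'. Below is a minimal sketch of reading that file back, assuming the script above has already been run for http://fgw.shandong.gov.cn and the ./result directory exists; the variable names here are illustrative only and are not part of the committed code.

```python
import json
from urllib.parse import urlparse

homepage = 'http://fgw.shandong.gov.cn'  # the site crawled in __main__ above
path = './result/' + urlparse(homepage).netloc + '.json'  # matches the naming used in crawl_website()

with open(path, 'r', encoding='utf-8') as f:
    info = json.load(f)

print('homepage:', info['homepage'], 'status:', info['status_code'])

# second_links['urls'] maps each second-level link to {'status_code', 'load_time', 'pageSource'}
second = info.get('second_links', {}).get('urls', {})
broken = [u for u, d in second.items() if d['status_code'] >= 400]
print('second-level links:', len(second), 'broken:', len(broken))
if second:
    avg = sum(d['load_time'] for d in second.values()) / len(second)
    print('average load time: %.2f s' % avg)
```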
