# getWebInfo.py
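"""Crawl a website two levels deep and record basic availability information.

For a given homepage this script collects the second-level links found on it
and the third-level links found on those pages, fetches each one to record its
status code, load time, and HTML source, and finally writes everything to
./result/<domain>.json.
"""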
import json
import os
import time
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from multiprocessing import Process
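
# Optional: verify=False below makes every request emit urllib3's
# InsecureRequestWarning; silencing it keeps the console output readable.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)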


def get_url_info(url):
    """Fetch a URL and return its status code, HTML source, and load time."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
    try:
        start = time.perf_counter()
        r = requests.get(url, headers=headers, verify=False, timeout=10)
        status = r.status_code
        end = time.perf_counter()
        delay = end - start  # page load time
        html = r.text
    except Exception:
        # Treat any request failure as a 404 with an empty page and a sentinel delay.
        status = 404
        html = ''
        delay = 300.0

    return status, html, delay


def get_url_info_dict(url):
    """Return a URL's status code, load time, and page source as a dict."""
    status, html, delay = get_url_info(url)
    return {
        'status_code': status,
        'load_time': delay,
        'pageSource': html,
    }


def get_page_info(sub_url, current_url, homepage_domain, next_links):
    """Inspect one <a> tag found on current_url and record it in next_links.

    next_links has three keys: 'domain' (domains belonging to the homepage's
    site), 'AllDomain' (every domain seen), and 'urls' (link -> status code,
    load time, and page source).
    """
    href = str(sub_url.get("href"))  # the next-level link
    print(href)
    if href.endswith(('.exe', '.pdf', '.xls')):
        # Skip file downloads.
        return next_links
    if href.startswith('http'):
        domain = urlparse(href).netloc.split(':', 1)[0]
        if domain.endswith(homepage_domain):  # same site or one of its subdomains
            if domain not in next_links['domain']:
                next_links['domain'].append(domain)
            if href not in next_links['urls']:
                # Record status code, load time, and HTML source for this link.
                next_links['urls'][href] = get_url_info_dict(href)
        if domain not in next_links['AllDomain']:  # record every domain seen
            next_links['AllDomain'].append(domain)
    elif href.startswith('/'):  # relative path, so it belongs to the current site
        domain = urlparse(current_url).netloc  # domain of the page being parsed
        total_href = 'http://' + domain + href
        if total_href not in next_links['urls']:
            # Record status code, load time, and HTML source for this link.
            next_links['urls'][total_href] = get_url_info_dict(total_href)
        if domain not in next_links['AllDomain']:
            next_links['AllDomain'].append(domain)

    return next_links


def get_third_link(homepage_domain, second_links):
    """Parse every second-level page and collect the third-level links it contains."""
    third_links = {}
    third_links['domain'] = []  # domains of third-level links: the homepage domain or a subdomain of it
    third_links['AllDomain'] = []  # every domain that appears among the third-level links
    third_links['urls'] = {}
    for second_url in second_links['urls'].keys():
        # The hrefs inside each second-level page's source are the third-level
        # links; fetch each one and record its url_info.
        page = second_links['urls'][second_url]['pageSource']
        pageSource = BeautifulSoup(page, 'html.parser')
        urls = pageSource.find_all('a', attrs={'href': True})
        for url in urls:  # every third-level link found on this second-level page
            third_links = get_page_info(url, second_url, homepage_domain, third_links)
    return third_links


def crawl_website(homepage):
    """Crawl a homepage two levels deep and dump the results to a JSON file."""
    website_info = {}
    website_info['homepage'] = homepage
    status, htm, delay = get_url_info(homepage)
    website_info['status_code'] = status
    print(status)
    if int(status) < 400:  # only go a level deeper if the status code is below 400
        pageSource = BeautifulSoup(htm, 'html.parser')
        urls = pageSource.find_all('a', attrs={'href': True})
        second_links = {}
        second_links['domain'] = []  # domains of second-level links: the homepage domain or a subdomain of it
        second_links['AllDomain'] = []  # every domain that appears among the second-level links
        second_links['urls'] = {}  # second-level links and their info
        # Homepage domain; a second-level link whose domain ends with it is on the
        # same site or one of its subdomains.
        homepage_domain = urlparse(homepage).netloc.replace('www.', '')
        # Filter the next-level links out of the hrefs and record their info.
        for url in urls:
            second_links = get_page_info(url, homepage, homepage_domain, second_links)
        website_info['second_links'] = second_links
        website_info['third_links'] = get_third_link(homepage_domain, second_links)
    else:
        website_info['second_links'] = {}
        website_info['third_links'] = {}
    # Optionally drop the page sources before saving to keep the JSON files small:
    '''
    for url in website_info['second_links']['urls'].keys():
        if 'pageSource' in website_info['second_links']['urls'][url].keys():
            website_info['second_links']['urls'][url]['pageSource'] = ''

    for url in website_info['third_links']['urls'].keys():
        if 'pageSource' in website_info['third_links']['urls'][url].keys():
            website_info['third_links']['urls'][url]['pageSource'] = ''
    '''
    os.makedirs('./result', exist_ok=True)  # make sure the output directory exists
    name = './result/' + str(urlparse(homepage).netloc) + '.json'  # one JSON file per site, named after its domain
    with open(name, 'w', encoding='utf-8') as f:
        json.dump(website_info, f, indent=1)
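
# For reference, the JSON written for each site has roughly this shape
# (values below are illustrative, not taken from a real crawl):
# {
#     "homepage": "http://example.gov.cn",
#     "status_code": 200,
#     "second_links": {
#         "domain": ["example.gov.cn"],
#         "AllDomain": ["example.gov.cn", "other.example.com"],
#         "urls": {
#             "http://example.gov.cn/news": {
#                 "status_code": 200,
#                 "load_time": 0.42,
#                 "pageSource": "<html>...</html>"
#             }
#         }
#     },
#     "third_links": { ...same structure as second_links... }
# }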


def get_webs():
    """Read the list of websites to check from ./data/websites.txt."""
    # File format: one homepage URL or bare domain per line.
    webs = []
    with open('./data/websites.txt', 'r') as f:
        for line in f:
            line = line.strip('\n')
            if not line.startswith('http'):
                line = 'http://' + line
            webs.append(line)
    return webs


def crawl_websites(start, end, webs):
    """Crawl webs[start:end]; used to split the site list across processes."""
    for i in range(start, end):
        crawl_website(webs[i])


if __name__ == '__main__':
    # Crawl a single site directly; the commented block below splits a site
    # list across multiple processes.
    crawl_website('http://fgw.shandong.gov.cn')
    '''
    webs = get_webs()  # load the list of sites to check; adjust the input path as needed
    p1 = Process(target=crawl_websites, args=(0, 10, webs))
    p2 = Process(target=crawl_websites, args=(10, 20, webs))
    p3 = Process(target=crawl_websites, args=(20, 30, webs))
    p4 = Process(target=crawl_websites, args=(30, 40, webs))
    p5 = Process(target=crawl_websites, args=(40, len(webs), webs))

    p1.start()
    p2.start()
    p3.start()
    p4.start()
    p5.start()

    p1.join()
    p2.join()
    p3.join()
    p4.join()
    p5.join()
    '''