# utils/downloadMTADataset.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

"""
Date: 2022-03-01
Author: [email protected]
Desc: download all the pcaps from https://malware-traffic-analysis.net and unzip them
"""
import os
import re
import subprocess

import requests
from lxml import etree


def getAllUrls():
    """
    Desc: get all the pcaps url from target website
    :return: a list of the data url (links pointing at isc.sans.edu are skipped)
    :raises ConnectionError: if the index page does not answer with HTTP 200
    """
    # Different target urls in different years
    targetUrl = "https://www.malware-traffic-analysis.net/2022/index.html"
    response = requests.get(targetUrl)
    if response.status_code != 200:
        # Raising a plain string is a TypeError in Python 3; raise a real exception.
        raise ConnectionError("Connection Error")
    html = etree.HTML(response.text)
    # The year is the path component right before "index.html"; every detail
    # page link on the index is relative to that year directory.
    baseUrl = "https://malware-traffic-analysis.net/" + targetUrl.split("/")[-2] + "/"
    # One query returns the first anchor's href of every <li> in every <ul>,
    # in document order; <li> items without an anchor are simply not matched.
    hrefs = html.xpath('//*[@id="main_content"]/div[1]/ul/li/a[1]/@href')
    return [baseUrl + href for href in hrefs if 'isc.sans.edu' not in href]


def getAPcap(url):
    """
    get pcap file url, use wget to download the file and unzip it
    :param url: pcap file web url
    :return: None
    :raises ConnectionError: if the page does not answer with HTTP 200
    :raises OSError: if the wget or unzip executable cannot be started
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Raising a plain string is a TypeError in Python 3; raise a real exception.
        raise ConnectionError("Connection Error")
    html = response.text
    pcapLinks = re.findall(r'<a\sclass="menu_link"\shref="([a-zA-Z0-9\-]+)\.pcap\.zip">.*?pcap\.zip</a>', html, re.S)
    saveDir = "/home/sunhanwu/datasets/MTA/pcaps2/"
    # The archives are linked relative to the page, so strip the page name once.
    url_ = "/".join(url.split("/")[:-1])
    for item in pcapLinks:
        fileName = item + ".pcap.zip"
        downloadLink = url_ + "/" + fileName
        zipPath = saveDir + fileName
        # Skip archives that are already present in the target directory
        if os.path.exists(zipPath):
            continue

        # wget download file; an argument list (no shell) keeps a scraped URL
        # from being interpreted as shell syntax.
        wget = ["wget", "-P", saveDir, downloadLink]
        print(" ".join(wget))
        if subprocess.run(wget).returncode != 0:
            # Nothing usable was fetched, so there is nothing to unzip.
            continue

        # unzip file with the password "infected"
        unzip = ["unzip", "-P", "infected", zipPath, "-d", saveDir]
        print(" ".join(unzip))
        subprocess.run(unzip)


if __name__ == '__main__':
    # Walk every detail page listed on the index, echoing each page url
    # before fetching the pcap archives linked from it.
    for pageUrl in getAllUrls():
        print(pageUrl)
        getAPcap(pageUrl)