path: root/main.py
# Input: a series of pcap files and their corresponding labels; output: a CSV in DataFrame format.
from time import time
import pandas as pd
import sys
import os
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import re

lock = Lock()
# Parse the label fields (domain, DoH resolver, browser, capture time) out of the pcap filename;
# accept both "/" and "\" path separators so Windows paths built by os.path.join also match.
pcap_name_compiler = re.compile(r".*[\\/](?P<domain>.*)_(?P<doh_uri>.*)_(?P<browser>.*)_(?P<time>.*)\.pcap")
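# E.g. a (hypothetical) capture ".../example.com_cloudflare-dns.com_chrome_1604275200.pcap"
# yields domain="example.com", doh_uri="cloudflare-dns.com", browser="chrome", time="1604275200".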


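# ---------------------------------------------------------------------------
# trans_pcap_to_row and extract_statistic_feature are called below but are not
# defined or imported in this file; the real implementations live elsewhere in
# the project. The two functions here are only a minimal, hypothetical sketch
# (assuming scapy is installed and using "TCP source port 443" as a crude
# direction heuristic) so the script can run standalone; swap in the project's
# own versions where available.
# ---------------------------------------------------------------------------
from scapy.all import rdpcap, TCP  # assumed dependency, only for this sketch


def trans_pcap_to_row(pcap_filename):
    """Return per-packet lengths, inter-arrival times and direction signs (+1/-1)."""
    packets = rdpcap(pcap_filename)
    lengths, time_lags, directions = [], [], []
    prev_time = None
    for pkt in packets:
        lengths.append(len(pkt))
        pkt_time = float(pkt.time)
        time_lags.append(0.0 if prev_time is None else pkt_time - prev_time)
        prev_time = pkt_time
        # Heuristic: packets sent from TCP port 443 are treated as inbound (-1).
        if pkt.haslayer(TCP) and pkt[TCP].sport == 443:
            directions.append(-1)
        else:
            directions.append(1)
    return lengths, time_lags, directions


def extract_statistic_feature(lengths, time_lags, directions):
    """Toy statistical summary of one flow; the project's real feature set may differ."""
    if not lengths:
        return []
    return [
        len(lengths),                           # packet count
        sum(lengths) / len(lengths),            # mean packet length
        max(lengths),
        min(lengths),
        sum(time_lags) / len(time_lags),        # mean inter-arrival time
        directions.count(1) / len(directions),  # share of outbound packets
    ]

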
def multi_write(input_filename, row_id):
    """Parse one pcap, pull its labels from the filename, and write the row into the shared DataFrame."""
    match_result = pcap_name_compiler.match(input_filename)
    if match_result:
        domain = match_result["domain"]
        doh_uri = match_result["doh_uri"]
        browser = match_result["browser"]
    else:
        domain = None
        doh_uri = None
        browser = None
    lengths, time_lags, directions = trans_pcap_to_row(input_filename)
    # Skip empty captures and flows whose total duration is effectively zero.
    if len(lengths) == 0 or sum(time_lags) < 1e-7:
        return
    # Signed series: each packet's length / time lag multiplied by its direction sign.
    flow_serial = [length * direction for length, direction in zip(lengths, directions)]
    time_serial = [time_lag * direction for time_lag, direction in zip(time_lags, directions)]
    features = extract_statistic_feature(lengths, time_lags, directions)

    with lock:
        df.loc[row_id] = [domain, doh_uri, browser, flow_serial, time_serial, features]


def run(input_filepath):
    files = os.listdir(input_filepath)
    with ThreadPoolExecutor(max_workers=30) as thread_pool:
        row_id = 0
        futures = []
        for file in files:
            full_file = os.path.join(input_filepath, file)
            if not full_file.endswith(".pcap"):
                continue
            # multi_write(full_file, row_id)  # serial fallback for debugging
            futures.append(thread_pool.submit(multi_write, full_file, row_id))
            row_id += 1
            print(row_id)
        # Surface any exception raised inside a worker instead of silently dropping it.
        for future in futures:
            future.result()


if __name__ == "__main__":
    # input_filepath = sys.argv[1]
    # print(input_filepath)
    # output_filename = sys.argv[2]
    input_filepath = "E:\\doh&web\\201102\\win10\\web_traffic_after_predeal"
    output_filename = "./result/web_feature_2.csv"
    start_time = time()
    df = pd.DataFrame(columns=["domain", "doh_uri", "browser", "flow_serial", "time_serial", "features"])
    run(input_filepath)
    df.to_csv(output_filename)
    end_time = time()
    print("总耗时:", end_time - start_time)