1 files changed, 64 insertions, 0 deletions
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..58ab148
--- /dev/null
+++ b/main.py
@@ -0,0 +1,64 @@
+# Input: a set of pcap files whose labels are encoded in their names; output: a CSV dump of a DataFrame.
+from time import time
+from configparser import ConfigParser
+from multiprocessing import cpu_count
+import pandas as pd
+import sys
+import os
+from concurrent.futures import ThreadPoolExecutor
+from threading import Lock
+import re
+
+# Guards writes to the shared result DataFrame made from the worker threads.
+lock = Lock()
+# Expected path shape: <dir>/<domain>_<doh_uri>_<browser>_<time>.pcap
+# Written as a raw string: "\." in a plain literal is an invalid escape sequence.
+pcap_name_complier = re.compile(r".*/(?P<domain>.*)_(?P<doh_uri>.*)_(?P<browser>.*)_(?P<time>.*)\.pcap")
+
+
+def multi_write(input_filename, id):
+    """Extract one pcap's labels and features and store them as row ``id`` of ``df``.
+
+    The labels (domain, doh_uri, browser) are parsed from the file name with
+    ``pcap_name_complier``; all three are ``None`` when the name does not match.
+    Nothing is written when the capture yields no packet lengths or when its
+    time lags sum to less than 1e-7.
+
+    Args:
+        input_filename: Path of the pcap file to process.
+        id: Row label under which the result is stored in the module-level
+            ``df``. The name shadows the builtin but is kept for callers.
+
+    NOTE(review): ``trans_pcap_to_row`` and ``extract_statistic_feature`` are
+    neither defined nor imported in the visible part of this file - confirm
+    they are in scope at runtime. ``df`` is only created under the
+    ``__main__`` guard, so this function works only when run as a script.
+    """
+    # The pattern only recognises "/" as the directory separator, so normalise
+    # the platform separator first; otherwise Windows paths never match and
+    # every label silently ends up as None.
+    match_result = pcap_name_complier.match(input_filename.replace(os.sep, "/"))
+    if match_result:
+        domain, doh_uri, browser = match_result.group("domain", "doh_uri", "browser")
+    else:
+        domain = doh_uri = browser = None
+
+    lengths, time_lags, directions = trans_pcap_to_row(input_filename)
+    # len() rather than truthiness: the container type returned by the helper
+    # is not visible here, and array-like objects reject bool().
+    if len(lengths) == 0 or sum(time_lags) < 1e-7:
+        return
+
+    # Multiply every length / time lag by the direction value of the same
+    # packet (presumably +1 / -1 - TODO confirm against trans_pcap_to_row).
+    # The three sequences are assumed to be the same length.
+    flow_serial = [length * direction for length, direction in zip(lengths, directions)]
+    time_serial = [lag * direction for lag, direction in zip(time_lags, directions)]
+    features = extract_statistic_feature(lengths, time_lags, directions)
+
+    # "with" releases the lock even if the assignment raises; a bare
+    # acquire()/release() pair would leave every other worker blocked forever.
+    with lock:
+        df.loc[id] = [domain, doh_uri, browser, flow_serial, time_serial, features]
+
+
+def run(input_filepath, max_workers=30):
+    """Submit every ``.pcap`` file in a directory to ``multi_write`` on a thread pool.
+
+    Files are numbered 0, 1, 2, ... in ``os.listdir`` order; entries that do
+    not end in ``.pcap`` are skipped and not numbered. That number is passed to
+    ``multi_write`` as the row id. The function returns once all submitted
+    tasks have finished.
+
+    Args:
+        input_filepath: Directory to scan (not recursive).
+        max_workers: Size of the thread pool; defaults to the previously
+            hard-coded value of 30.
+    """
+    submitted = []
+    with ThreadPoolExecutor(max_workers=max_workers) as thread_pool:
+        row_id = 0
+        for entry in os.listdir(input_filepath):
+            full_file = os.path.join(input_filepath, entry)
+            if not full_file.endswith(".pcap"):
+                continue
+            future = thread_pool.submit(multi_write, full_file, row_id)
+            submitted.append((full_file, future))
+            row_id += 1
+            print(row_id)  # progress: number of files submitted so far
+
+    # Leaving the "with" block waits for all workers. An exception raised in a
+    # worker is stored on its future and would otherwise vanish silently, so
+    # report each failure instead of dropping the file unnoticed.
+    for full_file, future in submitted:
+        error = future.exception()
+        if error is not None:
+            print(f"failed to process {full_file}: {error!r}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    # Usage: main.py [input_dir [output_csv]]
+    # Without arguments the original hard-coded locations are used.
+    if len(sys.argv) > 1:
+        input_filepath = sys.argv[1]
+    else:
+        input_filepath = "E:\\doh&web\\201102\\win10\\web_traffic_after_predeal"
+    if len(sys.argv) > 2:
+        output_filename = sys.argv[2]
+    else:
+        output_filename = "./result/web_feature_2.csv"
+
+    # Create the output directory before the extraction starts, so a missing
+    # directory cannot discard the finished work when to_csv() opens the file.
+    os.makedirs(os.path.dirname(output_filename) or ".", exist_ok=True)
+
+    start_time = time()
+    # Module-level on purpose: multi_write() fills this frame through the
+    # global name ``df``.
+    df = pd.DataFrame(columns=["domain", "doh_uri", "browser", "flow_serial", "time_serial", "features"])
+    run(input_filepath)
+    df.to_csv(output_filename)
+    end_time = time()
+    print("总耗时：", end_time - start_time)