script/analyze_packet_trace.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456

#!/usr/bin/env python

import os
from scapy.all import *
from scapy.error import Scapy_Exception
from utils import MyPacket
from urllib.parse import urlparse, parse_qs
import re
# Error code table
# Negative integer codes that functions in this file return instead of a
# normal result when a lookup or a pcap check fails.
ERR_NO_TCP_PKT = -2
ERR_PCAP_MALFORMAT = -3
ERR_NO_SK_STATE_FOUND = -4
ERR_INVALID_SK_STATE = -5
ERR_NUM_PKTS_MISMATCH = -6
ERR_UNKNOWN_ERR = -7
ERR_NO_PCAP_ID_FOUND = -8
ERR_NO_TWO_WAY_COMM = -9
ERR_NO_ESTABLISHED = -10
ERR_REPLAY = -11
ERR_MORE_THAN_TWO_ENDPOINTS = -12

# Marker strings that are compared against entries of the state mapping.
TCP_STATE_SYN_NOT_DETECTED = "SYN_NOT_DETECTED"
TCP_STATE_NOT_TCP_PKT = "NOT_TCP_PKT"
TCP_STATE_UNKNOWN_ERR = "ERR_UNKNOWN"
# NOTE: the constant name is misspelled ("ESTABLIAHSED"); it is kept as-is
# because other modules may import it under this name.
TCP_STATE_ESTABLIAHSED = "ESTABLISHED"
TCP_SUBSTATE_EMPTY = "EMPTY_STATE"
REPLAY_ERR = "ERR_REPLAY"

# Running sets of every TCP / IP option name seen so far by parse_pcap_file.
ALL_TCP_OPT_SET = set()
ALL_IP_OPT_SET = set()

# Return the file names found under the given path; time_order=False means the
# listing is returned without being sorted chronologically.
def get_filenames(path, time_order=False):
    """List the entries of ``path``, optionally ordered by capture time.

    When ``time_order`` is true every name is expected to end in
    ``<date>_<time>_<num_packet>[.pcap]`` with purely numeric fields; the
    names are then sorted by ``(date, time, num_packet)``.

    Args:
        path: Directory to list.
        time_order: Sort chronologically when true; otherwise return the
            ``os.listdir`` result unchanged.

    Returns:
        A list of file names (not full paths).

    Raises:
        ValueError: If ``time_order`` is true and a name does not carry three
            numeric ``_``-separated trailing fields.
    """
    filenames = os.listdir(path)
    if not time_order:
        return filenames

    suffix = ".pcap"

    def capture_key(fname):
        # Remove the extension as a suffix. str.strip(".pcap") would instead
        # strip any of the characters '.', 'p', 'c', 'a' from both ends.
        stem = fname[:-len(suffix)] if fname.endswith(suffix) else fname
        date, time_of_day, num_packet = stem.split("_")[-3:]
        # Compare the fields separately: concatenating them into one integer
        # lets a shorter packet count make a later capture sort earlier.
        return int(date), int(time_of_day), int(num_packet)

    return sorted(filenames, key=capture_key)

# Parse the SymTCP TCP-state mapping file into a nested dict: each connection
# key maps to a sub-dict whose keys identify a packet and whose values are the
# recorded TCP state.
def read_symtcp_tcp_state_mapping_file(fpath):
    """Load the SymTCP state mapping stored at ``fpath``.

    Every line is a comma-separated record. The first four fields
    (src ip, src port, dst ip, dst port) form the connection key, the five
    fields before the last one (dataoff, flags, window, chksum, urgptr) form
    the packet key, and the last field is the state. Each record is stored
    under both the forward and the reversed connection key.

    Returns:
        ``{connection_key: {packet_key: state}}`` with comma-joined keys.
    """
    with open(fpath) as fin:
        rows = fin.readlines()

    sk_mapping = {}
    for row in rows:
        fields = row.strip("\n").split(",")
        src_ip, src_port, dst_ip, dst_port, _seq, _ack = fields[:6]
        dataoff, flags, window, chksum, urgptr = fields[-6:-1]
        sk_state = fields[-1]

        packet_key = ','.join((dataoff, flags, window, chksum, urgptr))
        forward_key = ','.join((src_ip, src_port, dst_ip, dst_port))
        backward_key = ','.join((dst_ip, dst_port, src_ip, src_port))

        # Register the packet under both directions of the connection.
        for conn_key in (forward_key, backward_key):
            sk_mapping.setdefault(conn_key, {})[packet_key] = sk_state
    return sk_mapping

# Parse the WAMI TCP-state mapping file into a dict: the key is the pcap id
# and the value is the list of TCP states recorded for that pcap.
def read_wami_tcp_state_mapping_file(fpath):
    """Load the WAMI state mapping stored at ``fpath``.

    Every line is ``<pcap_id>,<state>,<state>,...``; the first field becomes
    the key and the remaining fields, in order, become the value.

    Returns:
        ``{pcap_id: [state, ...]}``.
    """
    with open(fpath) as fin:
        rows = fin.readlines()  # read every line up front

    split_rows = (row.strip("\n").split(',') for row in rows)
    # First field is the pcap id, the rest are the states of that pcap.
    return {fields[0]: fields[1:] for fields in split_rows}


def find_sk_state(attack_id, sk_mapping, debug=False):
    """Look up the entry stored for ``attack_id`` in ``sk_mapping``.

    Args:
        attack_id: Key to look up.
        sk_mapping: Mapping from attack id to its state entry.
        debug: When true, print the id and pause on ``input()`` after every
            mapping item and after the lookup result (interactive use only).

    Returns:
        ``sk_mapping[attack_id]``, or ``ERR_NO_SK_STATE_FOUND`` when the key
        is missing.
    """
    if debug:
        print(attack_id)
        for k, v in sk_mapping.items():
            input(str(k) + str(v))
    try:
        sk_state = sk_mapping[attack_id]
    except KeyError:
        if debug:
            input("[ERROR] attack_id %s not found!" % str(attack_id))
        return ERR_NO_SK_STATE_FOUND
    if debug:
        # Both values must be inside the format tuple; passing the second
        # one as a separate argument to input() raised TypeError.
        input("[DEBUG] Attack ID %s is in sk_state %s" %
              (str(attack_id), str(sk_state)))
    return sk_state


def find_wami_tcp_pkt_state(attack_id, sk_mapping, debug=False):
    """Return the state recorded for one packet of a WAMI pcap.

    Args:
        attack_id: String of the form ``"<pcap_id>,<packet_index>"``.
        sk_mapping: Mapping from pcap id to its sequence of per-packet states.
        debug: When true, print the id and pause on ``input()`` after every
            mapping item and after the lookup result (interactive use only).

    Returns:
        The state at ``packet_index`` for ``pcap_id``, or
        ``ERR_NO_SK_STATE_FOUND`` when the lookup raises ``KeyError``.
    """
    if debug:
        print(attack_id)
        for k, v in sk_mapping.items():
            input(str(k) + str(v))
    pcap_id, pkt_idx = attack_id.split(',')
    pkt_idx = int(pkt_idx)
    try:
        state = sk_mapping[pcap_id][pkt_idx]
    except KeyError:
        if debug:
            input("[ERROR] attack_id %s not found!" % str(attack_id))
        return ERR_NO_SK_STATE_FOUND
    if debug:
        # Both values must be inside the format tuple; passing the second
        # one as a separate argument to input() raised TypeError.
        input("[DEBUG] Attack ID %s has sk_state %s" %
              (str(attack_id), str(state)))
    return state


def verify_if_tcp_checksum_is_correct(pkt, debug=False):
    """Check the TCP checksum carried by ``pkt`` against a rebuilt packet.

    The checksum field is removed, the packet is re-serialised and re-parsed
    with its own class, and the checksum of the rebuilt packet is compared
    with the original value. The caller's packet is left with its original
    checksum value in place.

    Args:
        pkt: scapy packet containing a TCP layer.
        debug: When true, pause on ``input()`` showing both checksum values.

    Returns:
        True when both checksums are equal, False otherwise.
    """
    original_chksum = pkt[TCP].chksum
    # With the field deleted, rebuilding the packet yields a fresh checksum.
    del pkt[TCP].chksum
    try:
        scapy_chksum = pkt.__class__(bytes(pkt))[TCP].chksum
    finally:
        # Restore the caller's packet itself. Rebinding ``pkt`` to the
        # rebuilt copy before restoring left the original without a checksum.
        pkt[TCP].chksum = original_chksum
    if debug:
        input("Original: %s; recalculated: %s" %
              (str(original_chksum), str(scapy_chksum)))
    if original_chksum == scapy_chksum:
        return True
    # Report with print(): an unconditional input() here stalls batch runs
    # and fails when stdin is not interactive.
    print("%s, %s" % (original_chksum, scapy_chksum))
    print("[INFO] Wrong chceksum!")
    return False


def verify_if_ip_checksum_is_correct(pkt, debug=False):
    """Check the IP checksum carried by ``pkt`` against a rebuilt packet.

    The checksum field is removed, the packet is re-serialised and re-parsed
    with its own class, and the checksum of the rebuilt packet is compared
    with the original value. The caller's packet is left with its original
    checksum value in place.

    Args:
        pkt: scapy packet containing an IP layer.
        debug: When true, pause on ``input()`` showing both checksum values.

    Returns:
        True when both checksums are equal, False otherwise.
    """
    original_chksum = pkt[IP].chksum
    # With the field deleted, rebuilding the packet yields a fresh checksum.
    del pkt[IP].chksum
    try:
        scapy_chksum = pkt.__class__(bytes(pkt))[IP].chksum
    finally:
        # Restore the caller's packet itself. Rebinding ``pkt`` to the
        # rebuilt copy before restoring left the original without a checksum.
        pkt[IP].chksum = original_chksum
    if debug:
        input("Original: %s; recalculated: %s" %
              (str(original_chksum), str(scapy_chksum)))
    if original_chksum == scapy_chksum:
        return True
    # Report with print(): an unconditional input() here stalls batch runs
    # and fails when stdin is not interactive.
    print("[INFO] Wrong IP chceksum!")
    return False


# Substrings that cause an HTTP line to be left out of the recorded header text.
_IGNORED_HTTP_HEADER_MARKERS = (
    'User-Agent', 'Accept', 'Cache-Control', 'Connection',
    'Content-Length', 'Upgrade', 'X-Online-Host', 'Content-Type',
)


def _extract_http_fields(http_payload):
    """Split a decoded HTTP payload into (method, version, path, param, header).

    ``method``, ``version``, ``path`` and ``param`` are lists holding at most
    one element each, taken from the first line when it contains a ``/``.
    ``header`` is the concatenation of the remaining lines that contain none
    of ``_IGNORED_HTTP_HEADER_MARKERS``, or an empty list when no such line
    exists.
    """
    request_line = None
    kept_lines = []
    for i, line in enumerate(http_payload.strip().split("\r\n")):
        if i == 0 and '/' in line:
            request_line = line
        elif any(marker in line for marker in _IGNORED_HTTP_HEADER_MARKERS):
            continue
        else:
            kept_lines.append(line)

    method, version, path, param = [], [], [], []
    if request_line is not None:
        request = request_line.split(" ")
        if len(request) == 4:
            # An extra space produced a fourth token; the space is attributed
            # to the method and the URL is taken from the third token.
            method.append(request[0] + " ")
            param.append(urlparse(request[2]).query)
            path.append(urlparse(request[2]).path)
            version.append(request[3])
        else:
            method.append(request[0])
            if len(request) >= 2:
                path.append(urlparse(request[1]).path)
                param.append(parse_qs(urlparse(request[1]).query))
            if len(request) >= 3:
                version.append(request[2])

    header = "".join(kept_lines) if kept_lines else []
    return method, version, path, param, header


def parse_pcap_file(pcap_file_path, sk_mapping, pcap_fname_type="symtcp", debug=False):
    """Read one connection pcap and extract per-packet features.

    Args:
        pcap_file_path: Path of the pcap file to read.
        sk_mapping: State mapping as returned by the ``read_*_mapping_file``
            functions. For ``"wami"`` a falsy value disables the lookup and
            every packet gets the state ``"DUMMY"``.
        pcap_fname_type: ``"symtcp"`` (file name ends in
            ``<date>_<time>_<num_packet>``) or ``"wami"``.
        debug: When true, print the attack id and each parsed packet.

    Returns:
        A list of ``MyPacket`` objects, one per IP/TCP packet that was kept,
        or a negative integer error code (one of the ``ERR_*`` constants, or
        ``-99`` for a single-packet pcap / ``-50`` for more than 200 packets
        in the WAMI-with-mapping case).
    """
    global ALL_IP_OPT_SET, ALL_TCP_OPT_SET
    try:
        packets = rdpcap(pcap_file_path)
    except Exception as err:
        print("[ERROR] Pcap reading error: %s" % err)
        return ERR_PCAP_MALFORMAT

    pkt_trace = []

    pcap_fname = os.path.basename(pcap_file_path).replace(".pcap", "")

    # A bunch of checks to make sure the pcap is good
    if pcap_fname_type == "symtcp":
        date, time_of_day, _num_packet = pcap_fname.split("_")[-3:]
        timestamp = date + time_of_day
    if pcap_fname_type == "wami":
        if sk_mapping:
            pcap_id = pcap_fname
            timestamp = pcap_id
            if pcap_id not in sk_mapping:
                return ERR_NO_PCAP_ID_FOUND
            pcap_states = sk_mapping[pcap_id]
            if len(packets) != len(pcap_states):
                return ERR_NUM_PKTS_MISMATCH
            state_set = set(pcap_states)
            if TCP_STATE_NOT_TCP_PKT in state_set:
                return ERR_NO_TCP_PKT
            if TCP_STATE_UNKNOWN_ERR in state_set:
                return ERR_UNKNOWN_ERR
            if REPLAY_ERR in state_set:
                return ERR_REPLAY
            if TCP_SUBSTATE_EMPTY in state_set:
                return ERR_INVALID_SK_STATE
            if len(packets) == 1:
                return -99
            if len(packets) > 200:
                return -50  # TODO: change to error code
        else:
            # The ".pcap" extension was already removed above. A further
            # str.strip('.pcap') would also eat leading/trailing 'p', 'c',
            # 'a' and '.' characters that belong to the id.
            pcap_id = pcap_fname.split('_')[-1]
            timestamp = pcap_id  # the pcap id stands in for the timestamp

    # Let's then check if this pcap does contain two-way communications.
    # If not, we should skip this pcap.
    src_tuple_set = set()
    dst_tuple_set = set()
    tuples_set = set()
    for pkt in packets:
        # Same layer guard as the main loop below, so a stray non-IP/TCP
        # packet is ignored here instead of raising on pkt[IP] / pkt[TCP].
        if IP not in pkt or TCP not in pkt:
            continue
        src_tuple = ':'.join([pkt[IP].src, str(pkt[TCP].sport)])
        dst_tuple = ':'.join([pkt[IP].dst, str(pkt[TCP].dport)])
        src_tuple_set.add(src_tuple)
        dst_tuple_set.add(dst_tuple)
        tuples_set.add(src_tuple)
        tuples_set.add(dst_tuple)
    if len(src_tuple_set) == len(dst_tuple_set) == 1:
        return ERR_NO_TWO_WAY_COMM
    if len(tuples_set) != 2:
        return ERR_MORE_THAN_TWO_ENDPOINTS

    for pkt_idx, pkt in enumerate(packets):
        if IP not in pkt or TCP not in pkt:
            continue

        # Let's process each packet
        src_ip = pkt[IP].src
        src_port = pkt[TCP].sport
        dst_ip = pkt[IP].dst
        dst_port = pkt[TCP].dport
        seq = pkt[TCP].seq
        ack = pkt[TCP].ack
        dataoff = pkt[TCP].dataofs*4
        flags = pkt[TCP].flags
        window = pkt[TCP].window
        arrival_timestamp = pkt.time
        tcp_timestamp = pkt[TCP].time
        ip_len = pkt[IP].len
        ip_ttl = pkt[IP].ttl
        ip_ihl = pkt[IP].ihl*4
        ip_id = pkt[IP].id
        ip_version = pkt[IP].version
        ip_tos = pkt[IP].tos
        ip_opt_non_standard = '0'
        # NOTE(review): dict() assumes the options are (name, value) pairs;
        # confirm this also holds for non-empty IP options.
        ip_options = dict(pkt[IP].options)
        tcp_options = dict(pkt[TCP].options)
        ALL_IP_OPT_SET = ALL_IP_OPT_SET.union(ip_options.keys())
        ALL_TCP_OPT_SET = ALL_TCP_OPT_SET.union(tcp_options.keys())
        tcp_opt_mss = tcp_options.get('MSS', '-1')
        tcp_opt_tsval = tcp_options['Timestamp'][0] if 'Timestamp' in tcp_options else '-1'
        tcp_opt_tsecr = tcp_options['Timestamp'][1] if 'Timestamp' in tcp_options else '-1'
        tcp_opt_wscale = tcp_options.get('WScale', '-1')
        tcp_opt_uto = tcp_options.get('UserTimeout', '-1')
        # assuming this is correct
        tcp_opt_md5header = '0' if 'MD5header' in tcp_options else '-1'
        tcp_opt_non_standard = '0'
        # Kitsune-related features (for baseline)
        kitsune_frame_time_epoch = pkt.time
        kitsune_frame_len = len(pkt)
        kitsune_eth_src = pkt.src
        kitsune_eth_dst = pkt.dst
        kitsune_ip_src = pkt[IP].src
        kitsune_ip_dst = pkt[IP].dst
        kitsune_tcp_sport = pkt[TCP].sport
        kitsune_tcp_dport = pkt[TCP].dport
        ip_chksum = '0' if verify_if_ip_checksum_is_correct(pkt) else '1'

        # The TCP checksum is not verified here; the feature is fixed at '0'.
        chksum = '0'

        # Payload size = IP total length minus the IP and TCP header lengths.
        payload_len = ip_len - ip_ihl - dataoff

        urgptr = pkt[TCP].urgptr

        if pcap_fname_type == "symtcp":
            attack_id = ','.join(
                [src_ip, str(src_port), dst_ip, str(dst_port)])
        if pcap_fname_type == "wami":
            attack_id = ','.join([pcap_id, str(pkt_idx)])
        if debug:
            print("[DEBUG] attack_id: %s" % attack_id)

        if pcap_fname_type == "symtcp":
            sk_state = find_sk_state(
                attack_id, sk_mapping, debug=False)
        if pcap_fname_type == "wami":
            if sk_mapping:
                sk_state = find_wami_tcp_pkt_state(
                    attack_id, sk_mapping, debug=False)
                if sk_state == TCP_STATE_SYN_NOT_DETECTED:
                    continue
                if sk_state == TCP_SUBSTATE_EMPTY:
                    return ERR_INVALID_SK_STATE
            else:
                sk_state = "DUMMY"

        packet_args = [
            src_ip, src_port, dst_ip,
            dst_port, seq, ack, dataoff,
            flags, window, chksum, urgptr,
            timestamp, payload_len, sk_state,
            pcap_fname, ip_len, ip_ttl, ip_ihl,
            ip_chksum, ip_version, ip_tos, ip_id, ip_opt_non_standard,
            tcp_opt_mss, tcp_opt_tsval, tcp_opt_tsecr,
            tcp_opt_wscale, tcp_opt_uto, tcp_opt_md5header, tcp_opt_non_standard,
            tcp_timestamp, arrival_timestamp,
            kitsune_frame_time_epoch, kitsune_frame_len,
            kitsune_eth_src, kitsune_eth_dst, kitsune_ip_src, kitsune_ip_dst,
            kitsune_tcp_sport, kitsune_tcp_dport,
        ]
        # A packet with a Raw layer going to port 80 is treated as an HTTP
        # request; its parsed fields are passed as extra trailing arguments
        # (method, version, path, param, header).
        if pkt.haslayer('Raw') and (pkt[TCP].dport == 80):
            http_payload = pkt['Raw'].load.decode('utf-8', 'ignore')
            packet_args.extend(_extract_http_fields(http_payload))
        curr_pkt = MyPacket(*packet_args)
        if debug:
            curr_pkt.print_debug()

        pkt_trace.append(curr_pkt)

    if len(pkt_trace) == 0:
        print("No TCP packet found!")
        return ERR_NO_TCP_PKT
    return pkt_trace


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description='Use this script to pre-process the raw datasets.')
    parser.add_argument('--pcap-dir', type=str,
                        help='Directory to collect packet trace files.')
    parser.add_argument('--sk-mapping-path', type=str,
                        help='Filename of sk_state mapping file.')
    parser.add_argument('--dataset-fpath', type=str, default='dummy_path',
                        help='Filename of dumped dataset file.')
    parser.add_argument('--kitsune-dataset-fpath', type=str, default='dummy_path',
                        help='Filename of dumped dataset file.')
    parser.add_argument('--dataset-type', type=str, default="symtcp",
                        help='Type of pcap dataset.')
    parser.add_argument('--debug', action='store_true',
                        help='Whether to dump the debugging information.')
    parser.add_argument('--use-dummy-state', action='store_true',
                        help='Whether to use dummy states.')
    parser.add_argument('--use-small-dataset', action='store_true')
    args = parser.parse_args()

    if args.dataset_type == "symtcp":
        pcap_filenames = get_filenames(args.pcap_dir, time_order=True)
    if args.dataset_type == "wami":
        pcap_filenames = get_filenames(args.pcap_dir, time_order=False)
    print("[INFO] Total number of pcap files: %d" % len(pcap_filenames))

    if args.use_dummy_state:
        sk_mapping = None
    else:
        if args.dataset_type == "symtcp":
            sk_mapping = read_symtcp_tcp_state_mapping_file(
                args.sk_mapping_path)
        elif args.dataset_type == "wami":
            sk_mapping = read_wami_tcp_state_mapping_file(args.sk_mapping_path)
            pcap_filenames = []
            for pcap_id, _ in sk_mapping.items():
                pcap_filenames.append('%s.pcap' % pcap_id)
        print("[INFO] Size of sk_state mapping: %d" % len(sk_mapping))

    pcap_filenames = sorted(pcap_filenames)
    attack_corpus = []
    cnt = 0

    if args.dataset_type == "wami":
        with open(args.dataset_fpath, 'w') as fout, open(args.kitsune_dataset_fpath, 'w') as fout2:
            fout.write(
                "ATTACK_ID;;PACKET_ID;;SRC_IP;;SRC_PORT;;DST_IP;;DST_PORT;;SEQ;;ACK;;DATAOFF;;FLAGS;;WINDOW;;CHKSUM;;URGPTR;;SK_STATE;;PAYLOAD_LEN;;PCAP_ID;;IP_LEN;;IP_TTL;;IP_IHL;;IP_CHKSUM;;IP_VERSION;;IP_TOS;;IP_ID;;IP_OPT_NON_STANDARD;;TCP_OPT_MSS;;TCP_OPT_TSVAL;;TCP_OPT_TSECR;;TCP_OPT_WSCALE;;TCP_OPT_UTO;;TCP_OPT_MD5HEADER;;TCP_OPT_NON_STANDARD;;TCP_TIMESTAMP;;ARRIVAL_TIMESTAMP;;HTTP_Method;;HTTP_Version;;HTTP_Path;;HTTP_Header;;HTTP_Params\n")
            fout2.write('\t'.join(['attack_id', 'packet_id', 'frame.time_epoch', 'frame.len', 'eth.src', 'eth.dst', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'udp.srcport',
                                   'udp.dstport', 'icmp.type', 'icmp.code', 'arp.opcode', 'arp.src.hw_mac', 'arp.src.proto_ipv4', 'arp.dst.hw_mac', 'arp.dst.proto_ipv4', 'ipv6.src', 'ipv6.dst\n']))
            attack_count = 0
            for pcap_fname in pcap_filenames:
                cnt += 1
                print("[INFO] Processing pcap #%d w/ filename %s..." %
                      (cnt, pcap_fname))
                pcap_fpath = '/'.join([args.pcap_dir, pcap_fname])#按顺序读取每一个文件夹中的pcap文件
                pkt_trace = parse_pcap_file(
                    pcap_fpath, sk_mapping, pcap_fname_type='wami', debug=args.debug)
                if not isinstance(pkt_trace, int):
                    attack_packet_count = 0
                    for pkt in pkt_trace:
                        curr_pkt_str = pkt.get_data_str(
                            attack_count, attack_packet_count)
                        fout.write(curr_pkt_str + '\n')
                        fout2.write(pkt.get_kitsune_str(
                            attack_count, attack_packet_count) + '\n')
                        attack_packet_count += 1
                    attack_count += 1