diff options
Diffstat (limited to 'src/dataset_build/dataset_build.py')
| -rw-r--r-- | src/dataset_build/dataset_build.py | 144 |
1 files changed, 144 insertions, 0 deletions
"""Turn raw ';'-separated records into a comma-separated feature file.

Source path per the diff header: src/dataset_build/dataset_build.py.

Settings are read from ``dataset_build.conf``.  Every line of
``raw_file_address`` is split on ';', passed through
``data_value.get_feature`` and written to ``ripe_file_address`` as
``<field 0>,<19 feature values>``.
"""
import re
try:
    import ConfigParser
except ImportError:  # Python 3 renamed the module to ``configparser``
    import configparser as ConfigParser
import bisect
import random
import ctypes
import hashlib
import zlib
import binascii
import json
import datetime
import time

# Named predicates over raw field values.
term = {'td_len': (lambda x: len(x) == 32),
        'data_num': (lambda x: len(x) == 21),
        # The original subscripted the bound method (``x.find['NUll']``),
        # which raises TypeError on every call; it is now a real call.
        # NOTE(review): ``find`` returns an index (-1, which is truthy, when
        # the text is absent) -- confirm the intended predicate and the
        # spelling of the 'NUll' marker.
        'url': (lambda x: x.find('NUll')),
        'sfh_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100)),
        'not_null': (lambda x: len(x) != 0),
        'ysp_len': (lambda x: int(x) != 0),
        'line_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100))}

config = ConfigParser.RawConfigParser()
config.read("dataset_build.conf")
raw_file_address = config.get("file", "raw_file_address")
ripe_file_address = config.get("file", "ripe_file_address")
base_sfh_sets = config.get("file", "base_sfh_sets")
breakpoints = [int(i) for i in config.get("output", "breakpoints").split(",")]
feature_list = [i for i in config.get("feature", "feature_name").split(",")]
ll = ctypes.cdll.LoadLibrary
lib = ll("libmaatframe.so")
lost = dict()

# Labels that delimit the sub-fields packed into field 5 of a record.
# Splitting yields: [prefix, URL, ServerIP, MediaType, MediaLen, Etag,
# LastModify].  Compiled once instead of on every record.
_FIELD_SPLIT_RE = re.compile(
    r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:")

# Indices of the raw fields that are reduced to a CRC-32 value.
_CRC_FIELD_INDICES = (7, 9, 11, 13, 15, 17)


def _crc32(text):
    """Return the CRC-32 of *text* as a signed 32-bit integer.

    Python 2's ``binascii.crc32`` returns a signed value while Python 3's
    returns an unsigned one and only accepts bytes; normalising here keeps
    the generated features identical on both interpreters.
    """
    if not isinstance(text, bytes):
        # NOTE(review): assumes the input file is UTF-8 when run on Python 3.
        text = text.encode('utf-8')
    crc = binascii.crc32(text) & 0xffffffff
    return crc - 0x100000000 if crc >= 0x80000000 else crc


class data_value(object):
    """Namespace for the per-record feature extraction."""

    @staticmethod
    def get_feature(data):
        """Extract the 19 feature values of one record.

        Args:
            data: list of the ';'-separated fields of one input line.
                Fields 1, 4, 5, 7, 9, 11, 13, 15 and 17 are read.

        Returns:
            A list of 19 values, in this order:
              * 4 time-difference values (seconds as a string, minutes,
                hours, days) between field 1 shifted by +8 hours and the
                LastModify sub-field, or four ``-1`` when LastModify is
                empty or either timestamp cannot be parsed;
              * field 4 as an integer;
              * length of the URL sub-field (``-1`` when empty);
              * the four dot-separated parts of ServerIP (four ``-1`` when
                empty);
              * MediaType as an integer (``-1`` when empty);
              * CRC-32 of Etag and of LastModify (``-1`` when empty);
              * CRC-32 of fields 7, 9, 11, 13, 15 and 17.

        Raises:
            IndexError: the record or its field 5 has too few parts.
            ValueError: field 4 or MediaType is not an integer.
        """
        not_null = term['not_null']
        data_set = _FIELD_SPLIT_RE.split(data[5])
        last_modify = data_set[6]

        time_features = [-1, -1, -1, -1]
        if not_null(last_modify):
            try:
                time1 = (datetime.datetime.strptime(data[1],
                                                    '%Y-%m-%d %H:%M:%S')
                         + datetime.timedelta(hours=8))
                # Only the first 25 characters are parsed.  As in the
                # original, the truncated text is also what gets hashed
                # further down, and only when ``time1`` parsed successfully.
                last_modify = last_modify[0:25]
                time2 = datetime.datetime.strptime(last_modify,
                                                   '%a, %d %b %Y %H:%M:%S')
            except (ValueError, TypeError, OverflowError):
                # Unparseable timestamps are expected in raw data; keep the
                # -1 placeholders rather than aborting the whole run.
                pass
            else:
                delta = time1 - time2
                # ``seconds`` is the within-day remainder; ``days`` is
                # reported separately.  ``//`` keeps integer results on
                # Python 3 as ``/`` did on Python 2.
                time_features = [str(delta.seconds),
                                 delta.seconds // 60,
                                 delta.seconds // 3600,
                                 delta.days]

        return_data = list(time_features)

        # ``int`` promotes to an arbitrary-precision value when needed, so
        # it matches the former ``long`` conversion.
        return_data.append(int(data[4]))

        url = data_set[1]
        return_data.append(len(url) if not_null(url) else -1)

        server_ip = data_set[2]
        if not_null(server_ip):
            ip_set = server_ip.split('.')
            return_data.extend([ip_set[0], ip_set[1], ip_set[2], ip_set[3]])
        else:
            return_data.extend([-1, -1, -1, -1])

        media_type = data_set[3]
        return_data.append(int(media_type) if not_null(media_type) else -1)

        etag = data_set[5]
        return_data.append(_crc32(etag) if not_null(etag) else -1)
        return_data.append(_crc32(last_modify) if not_null(last_modify)
                           else -1)

        for index in _CRC_FIELD_INDICES:
            return_data.append(_crc32(data[index]))
        return return_data


i = 0
sfh_set = list()
with open(raw_file_address, 'r') as infile:
    with open(ripe_file_address, 'w') as outfile:
        for i, line in enumerate(infile, 1):
            if i % 10000 == 0:
                # Progress indicator every 10000 input lines.
                print(i)
            line_return = line.split(';')
            # Extract first and write the row in one call so a failing
            # record never leaves a truncated row in the output file.
            return_data = data_value.get_feature(line_return)
            row = [str(line_return[0])] + [str(value) for value in return_data]
            outfile.write(','.join(row) + '\n')
