summaryrefslogtreecommitdiff
path: root/src/dataset_build/dataset_build.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/dataset_build/dataset_build.py')
-rw-r--r--src/dataset_build/dataset_build.py144
1 files changed, 144 insertions, 0 deletions
diff --git a/src/dataset_build/dataset_build.py b/src/dataset_build/dataset_build.py
new file mode 100644
index 0000000..a832072
--- /dev/null
+++ b/src/dataset_build/dataset_build.py
@@ -0,0 +1,144 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+import json
+import datetime
+import time
+
# Field-validation predicates keyed by rule name. Each entry takes a single
# value and returns a truthy result when the value passes the rule.
term = {
    # Value has exactly 32 elements/characters.
    'td_len': (lambda x: len(x) == 32),
    # Value has exactly 21 elements/characters.
    'data_num': (lambda x: len(x) == 21),
    # str.find is a method, so it has to be called, not subscripted.
    # The raw find() result is returned: -1 (truthy) when 'NUll' is absent,
    # 0 (falsy) when the value starts with 'NUll', and a positive index
    # (truthy) when it occurs later in the string.
    # NOTE(review): presumably this is meant to reject placeholder URLs;
    # confirm whether a mid-string 'NUll' should also be rejected.
    'url': (lambda x: x.find('NUll')),
    # Length strictly between 20 and 10*1024-100.
    'sfh_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
    # Value is non-empty.
    'not_null': (lambda x: len(x) != 0),
    # Value converts to a non-zero integer (int() raises ValueError otherwise).
    'ysp_len': (lambda x: int(x) != 0),
    # Same bounds as 'sfh_len'.
    'line_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
}
+
# Read the run-time settings from dataset_build.conf (looked up relative to
# the current working directory). RawConfigParser.read() skips files it
# cannot open, so a missing file surfaces on the first config.get() below.
config = ConfigParser.RawConfigParser()
config.read("dataset_build.conf")

# [file] section: input path, output path and the base SFH set location.
raw_file_address = config.get("file", "raw_file_address")
ripe_file_address = config.get("file", "ripe_file_address")
base_sfh_sets = config.get("file", "base_sfh_sets")

# [output] breakpoints: comma-separated integers.
breakpoints = list(map(int, config.get("output", "breakpoints").split(",")))
# [feature] feature_name: comma-separated names, kept as strings.
feature_list = config.get("feature", "feature_name").split(",")

# Load the native libmaatframe shared library through ctypes.
ll = ctypes.cdll.LoadLibrary
lib = ll("libmaatframe.so")
lost = {}
+
+
class data_value(object):
    """Namespace for converting one split raw record into a feature row."""

    @staticmethod
    def _crc32(text):
        """Return binascii.crc32 of *text*, UTF-8 encoding it if it is not bytes."""
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return binascii.crc32(text)

    @staticmethod
    def get_feature(data):
        """Build the 19-element feature list for one record.

        Args:
            data: indexable sequence of string fields. Indices 1, 4, 5, 7, 9,
                11, 13, 15 and 17 are read. Field 5 is split on the labels
                ``URL:``, ``ServerIP:``, ``MediaType:``, ``MediaLen:``,
                ``Etag:`` and ``LastModify:``.

        Returns:
            A list of 19 values, in this order:
              0-3   seconds, minutes, hours (each taken from the ``seconds``
                    component of the time difference) and days between
                    field 1 shifted by +8 hours and the LastModify time;
                    all four are -1 when LastModify is empty or either
                    timestamp does not parse
              4     int(data[4])
              5     length of the URL part, or -1 when empty
              6-9   the first four dot-separated parts of the ServerIP part
                    (as strings), or four -1 when empty
              10    int() of the MediaType part, or -1 when empty
              11    crc32 of the Etag part, or -1 when empty
              12    crc32 of the LastModify part, or -1 when empty
              13-18 crc32 of data[7], data[9], data[11], data[13], data[15]
                    and data[17]

        Raises:
            IndexError: if ``data`` is too short, field 5 yields fewer than
                seven parts, or ServerIP has fewer than four parts.
            ValueError: if data[4] or a non-empty MediaType is not an integer.
        """
        data_set = re.split(
            r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:", data[5])
        url = data_set[1]
        server_ip = data_set[2]
        media_type = data_set[3]
        etag = data_set[5]
        last_modify = data_set[6]

        return_data = []

        # Time-difference features.
        # NOTE(review): the +8 hours looks like a timezone adjustment; confirm.
        delta = None
        if last_modify:
            try:
                time1 = (datetime.datetime.strptime(data[1], '%Y-%m-%d %H:%M:%S')
                         + datetime.timedelta(hours=8))
                # Only the first 25 characters are parsed. The truncated value
                # is also what gets hashed for feature 12 below, but only when
                # the record timestamp above parsed successfully.
                last_modify = last_modify[0:25]
                time2 = datetime.datetime.strptime(
                    last_modify, '%a, %d %b %Y %H:%M:%S')
            except (ValueError, TypeError, OverflowError):
                delta = None
            else:
                delta = time1 - time2
        if delta is None:
            return_data.extend([-1, -1, -1, -1])
        else:
            # timedelta.seconds is the sub-day remainder (0..86399), not the
            # total; floor division keeps the derived values integral.
            return_data.extend([delta.seconds,
                                delta.seconds // 60,
                                delta.seconds // 3600,
                                delta.days])

        return_data.append(int(data[4]))

        return_data.append(len(url) if url else -1)

        if server_ip:
            ip_set = server_ip.split('.')
            return_data.extend([ip_set[0], ip_set[1], ip_set[2], ip_set[3]])
        else:
            return_data.extend([-1, -1, -1, -1])

        return_data.append(int(media_type) if media_type else -1)
        return_data.append(data_value._crc32(etag) if etag else -1)
        return_data.append(
            data_value._crc32(last_modify) if last_modify else -1)

        # Odd-indexed fields 7..17 are hashed unconditionally.
        for index in (7, 9, 11, 13, 15, 17):
            return_data.append(data_value._crc32(data[index]))

        return return_data
+
+
# Convert the raw ';'-separated file into a ','-separated feature file:
# one output line per input line, holding the first raw field followed by
# the 19 features returned by data_value.get_feature().
i = 0
sfh_set = []
with open(raw_file_address, 'r') as infile:
    with open(ripe_file_address, 'w') as outfile:
        for i, line in enumerate(infile, 1):
            # Progress indicator every 10000 input lines.
            if i % 10000 == 0:
                print(i)
            line_return = line.split(';')
            outfile.write(str(line_return[0]) + ',')
            return_data = data_value.get_feature(line_return)
            feature_text = ','.join(str(return_data[x]) for x in range(19))
            outfile.write(feature_text + '\n')