summary refs log tree commit diff
path: root/src/dataset_build/cal_information.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/dataset_build/cal_information.py')
-rw-r--r-- src/dataset_build/cal_information.py 133
1 files changed, 133 insertions, 0 deletions
diff --git a/src/dataset_build/cal_information.py b/src/dataset_build/cal_information.py
new file mode 100644
index 0000000..19cd95c
--- /dev/null
+++ b/src/dataset_build/cal_information.py
@@ -0,0 +1,133 @@
+import re
+import numpy
+import ConfigParser
+import binascii
# Named checks applied to raw string fields.  Every entry takes one string.
# All entries except 'url' return a bool; 'url' returns the integer produced
# by str.find (see the note on that entry).
term = {
    # True when the field is exactly 32 characters long.
    'td_len': (lambda x: len(x) == 32),
    # True when the field is exactly 4 characters long.
    'data_num': (lambda x: len(x) == 4),
    # str.find has to be *called*: the previous `x.find['NUll']` subscripted
    # the bound method and raised TypeError on every use.  The result is the
    # index of the first 'NUll' in x, or -1 when it is absent, so it is falsy
    # only when x starts with 'NUll'.
    # NOTE(review): confirm that this truthiness is what callers expect.
    'url': (lambda x: x.find('NUll')),
    # True when the field is longer than 20 characters.
    'sfh_len': (lambda x: len(x) > 20),
    # True when the field is not empty.
    'not_null': (lambda x: len(x) != 0),
}
+
class calculation(object):
    """Holder for the entropy helper used on the feature columns."""

    def __init__(self, arg):
        super(calculation, self).__init__()
        self.arg = arg

    @staticmethod
    def cal_ent(x):
        """Return the Shannon entropy (base 2) of the non-zero values in *x*.

        Zero entries are ignored entirely: they are excluded both from the
        set of distinct values and from the denominator used to turn counts
        into probabilities.

        Args:
            x: one-dimensional array-like of numbers (a NumPy array or
                anything ``numpy.asarray`` accepts).

        Returns:
            The entropy in bits.  ``0.0`` when *x* is empty or contains only
            zeros.
        """
        values = numpy.asarray(x)
        nonzero = values[values != 0]
        total = nonzero.shape[0]
        if total == 0:
            return 0.0
        # One sort-based pass yields every distinct value's count; the former
        # implementation rescanned the whole array once per distinct value.
        _, counts = numpy.unique(nonzero, return_counts=True)
        p = counts / float(total)
        # Written as a subtraction from 0.0 so a single-valued column yields
        # 0.0 rather than -0.0.
        return 0.0 - numpy.sum(p * numpy.log2(p))
+
class data_value(object):
    """Holder for the parser of the labelled metadata field of a record."""

    def __init__(self, arg):
        super(data_value, self).__init__()
        self.arg = arg

    @staticmethod
    def get_data_values(data):
        """Turn a labelled metadata string into six integers.

        *data* is split on the labels ``URL:``, ``ServerIP:``, ``MediaType:``,
        ``MediaLen:``, ``Etag:`` and ``LastModify:``.  Piece 0 is whatever
        precedes the first label; pieces 1..6 fill slots 0..5 of the result:

        * slots 0, 1, 4, 5 hold ``binascii.crc32`` of the piece, or 0 when
          the piece is empty;
        * slots 2, 3 hold ``long(piece)``.

        Only the first ``len(feature_list)`` slots (at most six) are filled;
        the rest stay 0.  ``feature_list`` and ``term`` are module globals.

        Raises:
            IndexError: a needed piece is missing from the split result.
            ValueError: a numeric piece cannot be parsed by ``long``.
        """
        pieces = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:", data)
        parsed = [long(0)] * 6
        numeric_slots = (2, 3)
        # Positions past the sixth have nothing to fill, so cap the walk.
        for slot in xrange(min(len(feature_list), 6)):
            raw = pieces[slot + 1]
            if slot in numeric_slots:
                parsed[slot] = long(raw)
            elif term['not_null'](raw):
                parsed[slot] = binascii.crc32(raw)
            else:
                parsed[slot] = 0
        return parsed
+
# Script settings come from "cal_information.conf" in the current directory.
config = ConfigParser.RawConfigParser()
config.read("cal_information.conf")

raw_file_address = config.get("file", "raw_file_address")
ripe_file_address = config.get("file", "ripe_file_address")
feature_list = config.get("feature", "feature_name").split(",")

# Read the raw file and turn each ';'-separated line into a row of 20
# integers: the 6 values parsed from field 5, the CRC32 of fields 6..18, and
# the CRC32 of field 0.
#
# Rows are gathered in a list and converted to a matrix once at the end.
# Stacking onto the array for every line recopied the whole matrix each time,
# left `a` one-dimensional when only one line was read (so `a[:, i]` failed),
# and left `a` undefined when the file was empty.
rows = []
i = 0
with open(raw_file_address, 'r') as infile:
    for line in infile:
        i += 1
        if i % 10000 == 0:
            print(i)
        # The check comes before the line is processed, so line 50000 itself
        # is not included (unchanged from the previous behaviour).
        if i == 50000:
            break
        line_split = line.split(";")
        data_value_temp = data_value.get_data_values(line_split[5])
        data_value_temp.extend([binascii.crc32(line_split[j]) for j in range(6, 19)])
        data_value_temp.append(binascii.crc32(line_split[0]))
        rows.append(data_value_temp)

# Always two-dimensional: (number of rows, 20), even for zero or one row.
a = numpy.array(rows) if rows else numpy.empty((0, 20))
+
# Print the entropy of each of the 20 columns of `a`, one "label:value" line
# per column, in column order.
column_labels = (
    "URL:",
    "ServerIP:",
    "MediaType:",
    "MediaLen:",
    "Etag:",
    "LastModify:",
    "td_0k:",
    "td_data_md5_1k:",
    "td_1k:",
    "td_data_md5_2k:",
    "td_2k:",
    "td_data_md5_4k:",
    "td_4k:",
    "td_data_md5_8k:",
    "td_8k:",
    "td_data_md5_16k:",
    "td_16k:",
    "td_data_md5_32k:",
    "td_32k:",
    "id:",
)
for i, label in enumerate(column_labels):
    print(label + str(calculation.cal_ent(a[:, i])))
+