diff options
Diffstat (limited to 'src/dataset_build/cal_information.py')
| -rw-r--r-- | src/dataset_build/cal_information.py | 133 |
1 files changed, 133 insertions, 0 deletions
# Extracted from the diff that adds src/dataset_build/cal_information.py
# (new file, index 0000000..19cd95c).
"""Estimate the per-column entropy of a ';'-separated raw record file.

Every record is turned into 20 integer columns (parsed numbers or CRC32
hashes of the textual fields) and the base-2 entropy of each column is
printed. File locations and the feature list come from
``cal_information.conf``.
"""
import re
import numpy
try:
    import ConfigParser  # Python 2 module name
except ImportError:  # Python 3 renamed the module
    import configparser as ConfigParser
import binascii

try:
    _long = long  # Python 2: keep the original ``long`` element type
except NameError:
    _long = int  # Python 3: ``int`` already has arbitrary precision

# Named predicates over a single field string.
term = {'td_len': (lambda x: len(x) == 32),
        'data_num': (lambda x: len(x) == 4),
        # Was ``x.find['NUll']`` (subscripting the method -> TypeError).
        # NOTE(review): the result is the match index (-1 when absent), not a
        # boolean; confirm the intended test before using it as a filter.
        'url': (lambda x: x.find('NUll')),
        'sfh_len': (lambda x: len(x) > 20),
        'not_null': (lambda x: len(x) != 0)}

# Keys that introduce the six sub-fields inside the packed column.
_FIELD_SPLIT_RE = re.compile(
    r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:")

# 1-based positions (after splitting) that hold integers instead of text.
_NUMERIC_POSITIONS = (3, 4)

# Output label of each of the 20 columns, in column order.
COLUMN_NAMES = [
    "URL", "ServerIP", "MediaType", "MediaLen", "Etag", "LastModify",
    "td_0k",
    "td_data_md5_1k", "td_1k",
    "td_data_md5_2k", "td_2k",
    "td_data_md5_4k", "td_4k",
    "td_data_md5_8k", "td_8k",
    "td_data_md5_16k", "td_16k",
    "td_data_md5_32k", "td_32k",
    "id",
]

# Feature names read from the config file; populated by ``main()``.
feature_list = None


def _crc32(text):
    """Return ``binascii.crc32`` of *text*, encoding unicode text as UTF-8."""
    if not isinstance(text, bytes):
        text = text.encode("utf-8")
    return binascii.crc32(text)


class calculation(object):
    """Namespace for the statistics helpers used by this script."""

    def __init__(self, arg):
        super(calculation, self).__init__()
        self.arg = arg

    @staticmethod
    def cal_ent(x):
        """Return the base-2 entropy of the non-zero values in *x*.

        Zero entries are excluded both from the value set and from the
        total used to compute probabilities.

        Args:
            x: one-dimensional array (or sequence) of numbers.

        Returns:
            The entropy in bits as a float; ``0.0`` when *x* holds no
            non-zero value.
        """
        column = numpy.asarray(x)
        # One pass for all frequencies instead of one scan per distinct value.
        values, counts = numpy.unique(column, return_counts=True)
        counts = counts[values != 0]
        total = counts.sum()
        if total == 0:
            return 0.0
        p = counts / float(total)
        # ``0.0 - ...`` (not unary minus) so a single-valued column yields
        # 0.0 rather than -0.0, matching an accumulate-by-subtraction loop.
        return float(0.0 - numpy.sum(p * numpy.log2(p)))


class data_value(object):
    """Namespace for converting the packed sub-field column to integers."""

    def __init__(self, arg):
        super(data_value, self).__init__()
        self.arg = arg

    @staticmethod
    def get_data_values(data, feature_count=None):
        """Convert the packed ``URL:...LastModify:...`` text to six integers.

        *data* is split on the six field keys. Positions 3 and 4
        (MediaType, MediaLen) are parsed as integers; the other positions are
        replaced by the CRC32 of their text, or 0 when the text is empty.

        Args:
            data: the packed field string.
            feature_count: how many leading fields to convert; defaults to
                ``len(feature_list)`` (the module-level list loaded from the
                config file). Fields beyond the sixth are ignored.

        Returns:
            A list of six integers; unconverted positions stay 0.

        Raises:
            RuntimeError: *feature_count* is omitted and ``feature_list`` has
                not been loaded.
            IndexError: *data* has fewer fields than requested.
            ValueError: a numeric field is not a valid integer.
        """
        if feature_count is None:
            if feature_list is None:
                raise RuntimeError(
                    "feature_list is not loaded; pass feature_count")
            feature_count = len(feature_list)
        # fields[0] is whatever precedes the first key; fields[1] is the URL.
        fields = _FIELD_SPLIT_RE.split(data)
        values = [_long(0)] * 6
        for position in range(1, min(feature_count, 6) + 1):
            raw = fields[position]
            if position in _NUMERIC_POSITIONS:
                values[position - 1] = _long(raw)
            elif term['not_null'](raw):
                values[position - 1] = _crc32(raw)
            else:
                values[position - 1] = 0
        return values


def _load_rows(path, limit=50000, progress_every=10000):
    """Read records from *path* and return one 20-integer list per record.

    The 1-based line number is printed every *progress_every* lines, and
    reading stops (without converting that line) once line *limit* is reached.
    """
    rows = []
    with open(path, 'r') as infile:
        for line_no, line in enumerate(infile, 1):
            if line_no % progress_every == 0:
                print(line_no)
            if line_no == limit:
                break
            parts = line.split(";")
            row = data_value.get_data_values(parts[5])
            row.extend(_crc32(parts[j]) for j in range(6, 19))
            row.append(_crc32(parts[0]))
            rows.append(row)
    return rows


def main():
    """Load the config, read the raw file and print each column's entropy."""
    global feature_list

    config = ConfigParser.RawConfigParser()
    config.read("cal_information.conf")

    raw_file_address = config.get("file", "raw_file_address")
    # Not used below, but still read so a missing option keeps failing early.
    ripe_file_address = config.get("file", "ripe_file_address")
    feature_list = config.get("feature", "feature_name").split(",")

    rows = _load_rows(raw_file_address)
    if not rows:
        raise ValueError("no data rows read from %s" % raw_file_address)
    # Build the matrix once; stacking inside the read loop copies the whole
    # array per record and leaves a single record one-dimensional.
    a = numpy.array(rows)

    for index, name in enumerate(COLUMN_NAMES):
        print(name + ":" + str(calculation.cal_ent(a[:, index])))


if __name__ == "__main__":
    main()
