"""Report the entropy of every feature column found in a raw log file.

The script reads ``cal_information.conf``, loads the raw file named there,
turns each ';'-separated line into a row of 20 integers and prints the
Shannon entropy (base 2) of every column.  A value of 0 marks a missing
entry and is left out of the entropy estimate.
"""
import re
import numpy
try:
    import ConfigParser  # Python 2 module name
except ImportError:  # Python 3 renamed the module
    import configparser as ConfigParser
import binascii

# Named predicates over strings.
term = {'td_len': (lambda x: len(x) == 32),
        'data_num': (lambda x: len(x) == 4),
        # NOTE(review): the original subscripted the method (x.find['NUll'])
        # and raised TypeError when called.  It is now a call, which returns
        # the match index (-1 when absent); confirm the intended predicate.
        'url': (lambda x: x.find('NUll')),
        'sfh_len': (lambda x: len(x) > 20),
        'not_null': (lambda x: len(x) != 0)}

# Markers that separate the six values inside the data field of a line.
_FIELD_MARKERS = re.compile(
    r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:")
# Number of values get_data_values() returns.
_DATA_FIELD_TOTAL = 6
# Zero-based positions (MediaType, MediaLen) parsed as integers, not hashed.
_NUMERIC_POSITIONS = (2, 3)
# Report labels, in column order of the matrix built by main().
_COLUMN_NAMES = ("URL", "ServerIP", "MediaType", "MediaLen", "Etag",
                 "LastModify", "td_0k", "td_data_md5_1k", "td_1k",
                 "td_data_md5_2k", "td_2k", "td_data_md5_4k", "td_4k",
                 "td_data_md5_8k", "td_8k", "td_data_md5_16k", "td_16k",
                 "td_data_md5_32k", "td_32k", "id")
_PROGRESS_STEP = 10000
_LINE_LIMIT = 50000


def _crc32(text):
    """Return binascii.crc32 of *text*, encoding it to bytes when needed."""
    if not isinstance(text, bytes):
        # Python 3 str: crc32 only accepts bytes-like objects there.
        text = text.encode("utf-8")
    return binascii.crc32(text)


class calculation(object):
    """Namespace for the entropy computation."""

    def __init__(self, arg):
        super(calculation, self).__init__()
        self.arg = arg

    @staticmethod
    def cal_ent(x):
        """Return the base-2 Shannon entropy of the non-zero entries of *x*.

        Entries equal to 0 are ignored: they are neither counted as a value
        nor included in the total used for the probabilities.  The result
        is 0.0 when *x* is empty or holds only zeros.
        """
        column = numpy.asarray(x)
        present = column[column != 0]
        # One pass yields the count of every distinct value.
        _, counts = numpy.unique(present, return_counts=True)
        p = counts / float(counts.sum())
        # Subtract from 0.0 (instead of negating) so a single-valued column
        # yields 0.0 rather than -0.0.
        return 0.0 - numpy.sum(p * numpy.log2(p))


class data_value(object):
    """Namespace for parsing the data field of a raw line."""

    def __init__(self, arg):
        super(data_value, self).__init__()
        self.arg = arg

    @staticmethod
    def get_data_values(data, field_count=None):
        """Convert the data field of a line into a list of six integers.

        *data* is split on the markers ``URL:``, ``ServerIP:``,
        ``MediaType:``, ``MediaLen:``, ``Etag:`` and ``LastModify:``; text
        before the first marker is discarded.  The MediaType and MediaLen
        pieces are parsed as integers.  Every other piece is replaced by
        its CRC-32, or by 0 when the piece is empty.

        *field_count* limits how many leading pieces are converted; the
        remaining entries stay 0.  ``None`` (the default) converts all six.

        Raises IndexError when *data* has fewer pieces than requested and
        ValueError when a numeric piece is not an integer.
        """
        pieces = _FIELD_MARKERS.split(data)
        values = [0] * _DATA_FIELD_TOTAL
        if field_count is None:
            field_count = _DATA_FIELD_TOTAL
        for position in range(min(field_count, _DATA_FIELD_TOTAL)):
            text = pieces[position + 1]  # pieces[0] precedes "URL:"
            if position in _NUMERIC_POSITIONS:
                values[position] = int(text)
            elif term['not_null'](text):
                values[position] = _crc32(text)
        return values


def _build_row(line, field_count):
    """Return the 20 integer features of one ';'-separated raw line."""
    fields = line.split(";")
    row = data_value.get_data_values(fields[5], field_count)
    row.extend(_crc32(fields[j]) for j in range(6, 19))
    row.append(_crc32(fields[0]))
    return row


def _load_rows(path, field_count):
    """Read *path* and return one feature row per processed line.

    The line number is printed every 10000 lines.  Reading stops when
    line 50000 is reached; that line itself is not processed.
    """
    rows = []
    with open(path, 'r') as infile:
        for line_number, line in enumerate(infile, 1):
            if line_number % _PROGRESS_STEP == 0:
                print(line_number)
            if line_number == _LINE_LIMIT:
                break
            rows.append(_build_row(line, field_count))
    return rows


def main():
    """Load the configured raw file and print the entropy of each column."""
    config = ConfigParser.RawConfigParser()
    config.read("cal_information.conf")
    raw_file_address = config.get("file", "raw_file_address")
    # Not used below; still read so that a config file lacking this option
    # fails here exactly as it did before.
    config.get("file", "ripe_file_address")
    feature_list = config.get("feature", "feature_name").split(",")

    rows = _load_rows(raw_file_address, len(feature_list))
    # Build the matrix once: stacking row by row re-copies the whole matrix
    # for every line, and yields a 1-D array when there is a single row.
    if rows:
        matrix = numpy.array(rows)
    else:
        matrix = numpy.empty((0, len(_COLUMN_NAMES)))
    for column, name in enumerate(_COLUMN_NAMES):
        print(name + ":" + str(calculation.cal_ent(matrix[:, column])))


if __name__ == "__main__":
    main()