Diffstat (limited to 'src/dataset_build/feature_statistics.py')
| -rw-r--r-- | src/dataset_build/feature_statistics.py | 164 |
1 file changed, 164 insertions, 0 deletions
diff --git a/src/dataset_build/feature_statistics.py b/src/dataset_build/feature_statistics.py
new file mode 100644
index 0000000..52ae8e0
--- /dev/null
+++ b/src/dataset_build/feature_statistics.py
@@ -0,0 +1,164 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+
+# field-validation predicates keyed by check name
+term = {'td_len':(lambda x : len(x)==32),
+        'data_num':(lambda x : len(x)==4),
+        'url':(lambda x : x.find('NULL')),
+        'sfh_len':(lambda x : len(x)>20),
+        'not_null':(lambda x : len(x)!=0)}
+
+class data_line(object):
+    """validation helpers for one raw ';'-separated data line"""
+
+    @staticmethod
+    def if_error(data_line_str):
+        # split the raw line and keep it only if every field check passes
+        data_line_val = re.split(r';',data_line_str)
+        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+        if(term['data_num'](data_line_val) and term['sfh_len'](data_line_val[19]) and term['td_len'](data_line_val[9])\
+            and term['td_len'](data_line_val[2]) and term['td_len'](data_line_val[13]) and term['td_len'](data_line_val[15])\
+            and term['td_len'](data_line_val[17]) and term['not_null'](data_line_val[18]) and term['not_null'](data_line_val[19])\
+            and hashed_len/float(data_line_val[3])>0.8):
+            return data_line_val
+        else:
+            return -1
+
+
+class feature_statistics(object):
+    """YSP feature statistics accumulators"""
+    def __init__(self):
+        super(feature_statistics, self).__init__()
+        self.meida_len_statistics_set = [0,0,0,0,0,0,0]
+        self.lost_dict = dict()
+
+    def meida_len_statistics(self,meida_len):
+        # bucket the media length into the configured breakpoint bins
+        j = bisect.bisect(breakpoints,meida_len)
+        self.meida_len_statistics_set[j-1] += 1
+
+    def data_value_statistics(self,data_value_dic,data_value):
+        # key the line by its feature values and count lost/total per key (field 3 ~ lost flag)
+        data_value_str = str()
+        for x in xrange(0,len(feature_list)):
+            data_value_str = data_value_str+str(data_value_dic[feature_list[x]])+','
+
+        if(data_value_str not in self.lost_dict):
+            self.lost_dict[data_value_str] = [0,1,0.]
+        else:
+            if(int(data_value[3])==1):
+                self.lost_dict[data_value_str][0] += 1
+                self.lost_dict[data_value_str][1] += 1
+            else:
+                self.lost_dict[data_value_str][1] += 1
+
+
+class sfh_fingerprint(object):
+
+    def __init__(self,sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        # average the [start:end] segment lengths embedded in the fingerprint
+        p = r"\[+\d+?:+\d+?\]"
+        pattern = re.compile(p)
+        hashed_len_set = pattern.findall(sfh)
+        if (term['not_null'](hashed_len_set)):
+            hashed_len = 0
+            for x in xrange(0,len(hashed_len_set)):
+                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+            return hashed_len/len(hashed_len_set)
+        else:
+            return -1
+
+    @staticmethod
+    def get_base_sfh(data_set):
+        base_sfh = list()
+        for x in xrange(0,10):
+            base_sfh.append(data_set[x])
+        return base_sfh
+
+
+class data_value(object):
+
+    @staticmethod
+    def get_data_values(data):
+        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+        # data_set[0] is empty, data_set[1] is the URL
+        data_value_dic = dict()
+        for x in xrange(1,len(feature_list)+1):
+            if(x==1):
+                data_value_dic[feature_list[x-1]] = 1 if term['not_null'](data_set[x]) else 0
+            elif(x==2):
+                data_value_dic[feature_list[x-1]] = 1 if term['not_null'](data_set[x]) else 0
+            elif(x==3):
+                data_value_dic[feature_list[x-1]] = data_set[x]
+            elif(x==4):
+                data_value_dic[feature_list[x-1]] = bisect.bisect(breakpoints,int(data_set[x]))
+            elif(x==5):
+                data_value_dic[feature_list[x-1]] = 1 if term['not_null'](data_set[x]) else 0
+            elif(x==6):
+                data_value_dic[feature_list[x-1]] = 1 if term['not_null'](data_set[x]) else 0
+        return data_value_dic
+
+config = ConfigParser.RawConfigParser()
+config.read("feature_statistics.conf")
+
+feature_statistics_type = config.get("feature","type")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+
+if(feature_statistics_type=="meida_len_statistics"):
+    breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+elif(feature_statistics_type=="data_value_statistics"):
+    feature_list = [i for i in config.get("feature","feature_name").split(",")]
+# ll = ctypes.cdll.LoadLibrary
+# lib = ll("libmaatframe.so")
+
+i = 0
+sfh_set = list()
+statistic = feature_statistics()
+with open(raw_file_address,'r') as infile:
+    for line in infile:
+        i += 1
+        line_return = data_line.if_error(line)
+        if(line_return != -1):
+            if(feature_statistics_type=="meida_len_statistics"):
+                statistic.meida_len_statistics(int(line_return[3]))
+            elif(feature_statistics_type=="data_value_statistics"):
+                data_value_dic = data_value.get_data_values(line)
+                statistic.data_value_statistics(data_value_dic,line_return)
+
+if(feature_statistics_type=="data_value_statistics"):
+    lost_list = list()
+    for key in statistic.lost_dict:
+        statistic.lost_dict[key][2] = float(statistic.lost_dict[key][0])/statistic.lost_dict[key][1]
+        tmp = (key,int(statistic.lost_dict[key][0]),int(statistic.lost_dict[key][1]),float(statistic.lost_dict[key][2]))
+        lost_list.append(tmp)
+    print sorted(lost_list,cmp=lambda x,y:cmp(x[2],y[2]))
+#             if(x == len(feature_list)-1):
+#                 outfile.write(data_value_dic[feature_list[x]]+'\n')
+#             else:
+#                 print lost
+#                 outfile.write(str(data_value_dic[feature_list[x]])+',')
+#             outfile.write(result[3])
+#         sfh_dot = list()
+#         for x in xrange(0,10):
+#             # transform sfh to dot
+#             sfh_dot.append(lib.GIE_sfh_similiarity(result[19],len(result[19]),sfh_set[x],len(sfh_set[x])))
+#         if(len(data_set)==7):
+#             outfile.write(str(data_set[0])+','+str(data_set[1])+','+str(data_set[2])\
+#                 +','+str(data_set[3])+','+str(data_set[4])+','+str(data_set[5])+','+result[5]\
+#                 +','+result[7]+','+result[9]+','+result[11]+','+result[13]+','+result[15]+result[17]\
+#                 +','+result[19]+'\n')
+
+# with open(ripe_file_address,'w') as outfile:
+#     outfile.write(str(lost))
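The script reads its settings from feature_statistics.conf at startup. The section and key names below are taken directly from the config.get() calls in the code; the values are placeholders (the paths, feature names, and breakpoint values are assumptions for illustration, not part of the commit). A minimal sketch:

    [file]
    raw_file_address = raw_data.txt
    ripe_file_address = ripe_data.txt

    [feature]
    ; selects the mode: meida_len_statistics or data_value_statistics
    type = meida_len_statistics
    ; comma-separated feature names, one per URL/ServerIP/MediaType/MediaLen/Etag/LastModify field
    feature_name = url,server_ip,media_type,media_len,etag,last_modify

    [output]
    ; six breakpoints give the seven bins that meida_len_statistics_set counts
    breakpoints = 1024,10240,102400,1048576,10485760,104857600

Note that breakpoints is only read in the meida_len_statistics mode and feature_name only in the data_value_statistics mode, so a config only needs the keys for the branch it runs.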

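For reference, sfh_fingerprint.get_hashed_len averages the lengths of the [start:end] segments that its regex finds inside an SFH string. A quick check with a made-up fingerprint (the offsets are illustrative only), in the same Python 2 style as the file:

    # hypothetical SFH with two hashed segments of 512 bytes each
    sfh = "[0:512][512:1024]deadbeef"
    print sfh_fingerprint.get_hashed_len(sfh)       # ((512-0)+(1024-512))/2 = 512
    # a string with no [start:end] segments yields the -1 error value
    print sfh_fingerprint.get_hashed_len("deadbeef")    # -1

This average segment length is what if_error divides by the media length (field 3) to require that at least 80% of the media was covered by the fingerprint.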