summaryrefslogtreecommitdiff
path: root/src/dataset_build/feature_statistics.py
diff options
context:
space:
mode:
author陈冠林 <[email protected]>2019-06-18 10:44:20 +0800
committer陈冠林 <[email protected]>2019-06-18 10:44:20 +0800
commitb2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch)
treeb7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/feature_statistics.py
parentb026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff)
添加inc和srcHEADmaster
Diffstat (limited to 'src/dataset_build/feature_statistics.py')
-rw-r--r--src/dataset_build/feature_statistics.py164
1 files changed, 164 insertions, 0 deletions
diff --git a/src/dataset_build/feature_statistics.py b/src/dataset_build/feature_statistics.py
new file mode 100644
index 0000000..52ae8e0
--- /dev/null
+++ b/src/dataset_build/feature_statistics.py
@@ -0,0 +1,164 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+
# Validation predicates applied to individual fields of a parsed data line.
# Keys name the check; each value is a predicate returning a truthy result
# when the field passes that check.
term = {'td_len': (lambda x: len(x) == 32),   # field is exactly 32 chars (digest-like)
        'data_num': (lambda x: len(x) == 4),  # field is exactly 4 chars
        # BUG FIX: the original subscripted the bound method
        # (`x.find['NUll']`), which raises TypeError when invoked.
        # It must be a call.  NOTE(review): the intended semantics are
        # unclear ('NUll' vs 'NULL', and find() returns an index, not a
        # bool) -- the original search string is preserved byte-for-byte.
        'url': (lambda x: x.find('NUll')),
        'sfh_len': (lambda x: len(x) > 20),   # sfh string is long enough to parse
        'not_null': (lambda x: len(x) != 0)}  # field is non-empty
+
class data_line(object):
    """Validation helpers for one semicolon-separated raw data line."""

    def __init__(self):
        # BUG FIX: the original called super(ClassName, self) -- ClassName
        # is an undefined leftover from a docstring template and raised
        # NameError on every instantiation.
        super(data_line, self).__init__()

    @staticmethod
    def if_error(data_line_str):
        """Validate a raw line.

        Splits *data_line_str* on ';' and applies the `term` field checks.
        Returns the list of fields when every check passes, or -1 when the
        line should be skipped.
        """
        data_line_val = re.split(r';', data_line_str)
        # Average hashed-segment length of the sfh field (index 19).
        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
        # NOTE(review): the original applied term['data_num'] to the whole
        # field list (len(list) == 4), which contradicts the [19] indexing
        # below and made the condition unsatisfiable.  Applying it to the
        # MediaLen field (index 3, the denominator below) instead -- confirm
        # against the raw data format.
        if(term['data_num'](data_line_val[3]) and term['sfh_len'](data_line_val[19]) and term['td_len'](data_line_val[9])\
            and term['td_len'](data_line_val[2]) and term['td_len'](data_line_val[13]) and term['td_len'](data_line_val[15])\
            and term['td_len'](data_line_val[17]) and term['not_null'](data_line_val[18]) and term['not_null'](data_line_val[19])\
            and hashed_len/float(data_line_val[3])>0.8):
            return data_line_val
        else:
            return -1
+
+
class feature_statistics(object):
    """Accumulates statistics over validated data lines: a media-length
    histogram and per-feature-value hit/total counters ("YSP feature
    statistics")."""

    def __init__(self):
        super(feature_statistics, self).__init__()
        # One bucket per interval delimited by the module-level `breakpoints`.
        self.meida_len_statistics_set = [0, 0, 0, 0, 0, 0, 0]
        # feature-value string -> [hit_count, total_count, hit_ratio]
        self.lost_dict = dict()

    def meida_len_statistics(self, meida_len):
        """Bucket *meida_len* against `breakpoints` and count it.

        BUG FIX: `self` was missing from the signature, so calling this as
        a method bound the instance to `meida_len` and raised inside bisect.
        """
        j = bisect.bisect(breakpoints, meida_len)
        self.meida_len_statistics_set[j - 1] += 1

    def data_value_statistics(self, data_value_dic, data_value):
        """Fold one line's feature-value dict into `lost_dict`.

        BUG FIX: `self` was missing from the signature, and the body
        referenced an undefined name `result`.  The split field list --
        called `result` in this file's commented-out code -- is the
        `data_value` parameter; field [3] carries the 0/1 outcome flag
        (NOTE(review): confirm field meaning against the raw format).
        """
        # Key the counters by the comma-joined feature values, in
        # `feature_list` order (trailing comma kept, as before).
        data_value_str = ''
        for name in feature_list:
            data_value_str = data_value_str + str(data_value_dic[name]) + ','

        # `in` replaces the Python-2-only dict.has_key(); same behavior.
        if data_value_str not in self.lost_dict:
            self.lost_dict[data_value_str] = [0, 1, 0.]
        else:
            if int(data_value[3]) == 1:
                self.lost_dict[data_value_str][0] += 1
            # total_count advances on every repeat sighting.
            self.lost_dict[data_value_str][1] += 1
+
+
class sfh_fingerprint(object):
    """Operations on an sfh (segment fingerprint hash) string containing
    "[start:end]" hashed-segment markers."""

    # Matches one "[start:end]" segment.  Hoisted to class level so the
    # pattern is compiled once instead of per call.
    _SEGMENT_RE = re.compile(r"\[+\d+?:+\d+?\]")

    def __init__(self, sfh):
        # Raw sfh string carried by this instance.
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the mean (end - start) over all [start:end] segments in
        *sfh*, or -1 when no segment is present.

        Uses floor division (`//`) to keep the original Python 2 integer
        division semantics for int operands under Python 3 as well.
        """
        segments = sfh_fingerprint._SEGMENT_RE.findall(sfh)
        if not segments:          # equivalent to the old term['not_null'] check
            return -1
        total = 0
        for seg in segments:
            # "[a:b]" splits into ['', 'a', 'b', ''].
            nums = re.split(r"\[|\]|:", seg)
            total += int(nums[2]) - int(nums[1])
        return total // len(segments)

    @staticmethod
    def get_base_sfh(data_set, base_count=10):
        """Return the first *base_count* entries of *data_set* as a list.

        Generalized: the original hard-coded 10; the default preserves that
        behavior for existing callers.
        """
        return list(data_set[:base_count])
+
+
+
+
class data_value(object):
    """Extracts a per-feature value dict from one raw data string."""

    @staticmethod
    def get_data_values(data):
        """Split *data* on the known field labels and encode each feature.

        Returns a dict keyed by `feature_list` names: URL/ServerIP/Etag/
        LastModify become 0/1 presence flags, MediaType is stored verbatim,
        and MediaLen is bucketed against `breakpoints`.
        """
        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:", data)
        # data_set[0] is the text before "URL:" (normally empty);
        # data_set[1..6] are the six field values in label order.
        data_value_dic = dict()
        for x in range(1, len(feature_list) + 1):
            if x == 3:
                # MediaType: keep the raw value.
                data_value_dic[feature_list[x - 1]] = data_set[x]
            elif x == 4:
                # MediaLen: record the histogram bucket index.
                data_value_dic[feature_list[x - 1]] = bisect.bisect(breakpoints, int(data_set[x]))
            else:
                # BUG FIX: the original compared the predicate object itself
                # (`term['not_null'] == False`), which is never true, so every
                # presence flag came out 1.  Apply the predicate to the field:
                # 1 when the field is non-empty, 0 otherwise.
                data_value_dic[feature_list[x - 1]] = 1 if term['not_null'](data_set[x]) else 0
        return data_value_dic
+
# --- configuration ----------------------------------------------------------
config = ConfigParser.RawConfigParser()
config.read("feature_statistics.conf")

# BUG FIX: the original assigned the literal tuple ("feature", "type")
# instead of reading the option, so feature_statistics_type never equaled
# either statistics-mode string and both branches below were dead.
feature_statistics_type = config.get("feature", "type")
raw_file_address = config.get("file", "raw_file_address")
ripe_file_address = config.get("file", "ripe_file_address")

if feature_statistics_type == "meida_len_statistics":
    # Histogram bucket edges for media length.
    breakpoints = [int(i) for i in config.get("output", "breakpoints").split(",")]
elif feature_statistics_type == "data_value_statistics":
    # Ordered feature names used to key the value dicts.
    # NOTE(review): data_value.get_data_values also reads `breakpoints`,
    # which this branch does not define -- confirm the config contract.
    feature_list = [i for i in config.get("feature", "feature_name").split(",")]
# ll = ctypes.cdll.LoadLibrary
# lib = ll("libmaatframe.so")
+
# --- main -------------------------------------------------------------------
i = 0                       # raw-line counter
sfh_set = list()
statistic = feature_statistics()
with open(raw_file_address, 'r') as infile:
    for line in infile:
        i += 1

        line_return = data_line.if_error(line)
        if line_return != -1:
            if feature_statistics_type == "meida_len_statistics":
                statistic.meida_len_statistics(line_return[3])
            elif feature_statistics_type == "data_value_statistics":
                # BUG FIX: this branch called meida_len_statistics with the
                # whole field list; it belongs to the data-value path.
                # NOTE(review): confirm that the full raw line (not a single
                # field) is what get_data_values should parse.
                statistic.data_value_statistics(data_value.get_data_values(line), line_return)
                # BUG FIX: the accumulator attribute is `lost_dict` (there is
                # no `statistic.lost`), and the inner loop variable previously
                # clobbered the line counter `i`.
                lost_list = list()
                for key in statistic.lost_dict:
                    entry = statistic.lost_dict[key]
                    entry[2] = float(entry[0]) / entry[1]   # refresh hit ratio
                    lost_list.append((key, int(entry[0]), int(entry[1]), float(entry[2])))
                # `key=` replaces the Python-2-only `cmp=` comparator; same
                # ascending order on the total-count column.
                print(sorted(lost_list, key=lambda t: t[2]))

# with open(ripe_file_address, 'w') as outfile:
#     outfile.write(str(statistic.lost_dict))