| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
|---|---|---|
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build | |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'src/dataset_build')
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | src/dataset_build/CMakeLists.txt | 11 |
| -rw-r--r-- | src/dataset_build/based_sfh.conf | 3 |
| -rw-r--r-- | src/dataset_build/based_sfh.py | 44 |
| -rw-r--r-- | src/dataset_build/cal_information.conf | 5 |
| -rw-r--r-- | src/dataset_build/cal_information.py | 133 |
| -rw-r--r-- | src/dataset_build/dataset_build.conf | 8 |
| -rw-r--r-- | src/dataset_build/dataset_build.py | 144 |
| -rw-r--r-- | src/dataset_build/feature_statistics.conf | 8 |
| -rw-r--r-- | src/dataset_build/feature_statistics.py | 164 |
| -rw-r--r-- | src/dataset_build/file_digest.py | 96 |
| -rw-r--r-- | src/dataset_build/get_lost.c | 116 |
| -rw-r--r-- | src/dataset_build/grain.conf | 5 |
| -rw-r--r-- | src/dataset_build/td_classification.py | 5 |
| -rw-r--r-- | src/dataset_build/vedio_id_build.c | 171 |
14 files changed, 913 insertions, 0 deletions
```diff
diff --git a/src/dataset_build/CMakeLists.txt b/src/dataset_build/CMakeLists.txt
new file mode 100644
index 0000000..8840a74
--- /dev/null
+++ b/src/dataset_build/CMakeLists.txt
@@ -0,0 +1,11 @@
+PROJECT (CALCULATE)
+SET (SRC_LIST get_lost.c)
+SET(CMAKE_BUILD_TYPE "Debug")
+SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g -ggdb")
+SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
+MESSAGE(STATUS "This is BINARY dir" ${CALCULATE_BINARY_DIR})
+MESSAGE(STATUS "This is SOURCE dir" ${CALCULATE_SOURCE_DIR})
+#INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../include/)
+#LINK_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../lib/)
+ADD_EXECUTABLE(get_lost ${SRC_LIST})
+TARGET_LINK_LIBRARIES(get_lost maatframe libMESA_htable.so pthread m)
diff --git a/src/dataset_build/based_sfh.conf b/src/dataset_build/based_sfh.conf
new file mode 100644
index 0000000..cdcf4cf
--- /dev/null
+++ b/src/dataset_build/based_sfh.conf
@@ -0,0 +1,3 @@
+[file]
+raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest_nots
+ripe_file_address = ../../data/td_data_set/td_data_20171207/base_sfh_set
\ No newline at end of file
```
```diff
diff --git a/src/dataset_build/based_sfh.py b/src/dataset_build/based_sfh.py
new file mode 100644
index 0000000..b3281ce
--- /dev/null
+++ b/src/dataset_build/based_sfh.py
@@ -0,0 +1,44 @@
+import re
+import ConfigParser
+import bisect
+import random
+
+term = {'not_null':(lambda x : len(x)!=0)}
+
+config = ConfigParser.RawConfigParser()
+config.read("based_sfh.conf")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+
+class sfh_fingerprint(object):
+
+    def __init__(self,sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        p = r"\[+\d+?:+\d+?\]"
+        pattern = re.compile(p)
+        hashed_len_set = pattern.findall(sfh)
+        if(term['not_null'](hashed_len_set)):
+            hashed_len = 0
+            for x in xrange(0,len(hashed_len_set)):
+                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+            return hashed_len/len(hashed_len_set)
+        else:
+            return -1
+
+i=0
+sfh_set = list()
+with open(raw_file_address,'r') as infile:
+    with open(ripe_file_address,'w') as outfile:
+        for line in infile:
+            i+=1
+            if(i%100000==0):
+                print i
+            result = re.split(r';',line)
+            if(term['not_null'](result[3]) and term['not_null'](result[19])):
+                hashed_len = sfh_fingerprint.get_hashed_len(result[19])
+                if(hashed_len/float(result[3])>0.8):
+                    outfile.write(result[19]+'\n')
\ No newline at end of file
```
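`based_sfh.py` keeps only fingerprints whose hashed `[start:end]` spans cover enough of the file: it averages the span lengths and requires the result to exceed 0.8 of the total length carried in field 3. A minimal Python 3 sketch of that coverage test (the span encoding and the 0.8 threshold come from the script; the sample fingerprint string is invented):

```python
import re

def sfh_coverage(sfh, total_len):
    """Average [start:end] span length over the total file length,
    mirroring sfh_fingerprint.get_hashed_len in based_sfh.py."""
    spans = re.findall(r"\[(\d+):(\d+)\]", sfh)
    if not spans:
        return -1.0  # the script returns -1 when no span is present
    hashed = sum(int(end) - int(start) for start, end in spans)
    return hashed / len(spans) / total_len

# Made-up fingerprint: two spans averaging 900 hashed bytes of a 1000-byte file.
print(sfh_coverage("v1[0:900][100:1000]", 1000) > 0.8)  # True
```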
```diff
diff --git a/src/dataset_build/cal_information.conf b/src/dataset_build/cal_information.conf
new file mode 100644
index 0000000..1571b8b
--- /dev/null
+++ b/src/dataset_build/cal_information.conf
@@ -0,0 +1,5 @@
+[file]
+raw_file_address = ../../data/ripe_data/td_data_20171207/video_id.txt
+ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
+[feature]
+feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
```
```diff
diff --git a/src/dataset_build/cal_information.py b/src/dataset_build/cal_information.py
new file mode 100644
index 0000000..19cd95c
--- /dev/null
+++ b/src/dataset_build/cal_information.py
@@ -0,0 +1,133 @@
+import re
+import numpy
+import ConfigParser
+import binascii
+term = {'td_len':(lambda x : len(x)==32),
+        'data_num':(lambda x : len(x)==4),
+        'url':(lambda x : x.find('NUll')),
+        'sfh_len':(lambda x : len(x)>20),
+        'not_null':(lambda x : len(x)!=0)}
+
+class calculation(object):
+    """docstring for calculation"""
+    def __init__(self, arg):
+        super(calculation, self).__init__()
+        self.arg = arg
+
+    @staticmethod
+    def cal_ent(x):
+        x_value_list = set([x[i] for i in range(x.shape[0])])
+        ent = 0.0
+        num_0 = x[x == 0].shape[0]
+        for x_value in x_value_list:
+            if(x_value==0):
+                continue
+            p = float(x[x == x_value].shape[0])/(x.shape[0]-num_0)
+            logp = numpy.log2(p)
+            ent -= p*logp
+        return ent
+
+class data_value(object):
+    """docstring for data_value"""
+    def __init__(self, arg):
+        super(data_value, self).__init__()
+        self.arg = arg
+
+    @staticmethod
+    def get_data_values(data):
+        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+        #data_set[0]=null,data_set[1]=url
+        data_value_dic = [long(0)]*6
+        for x in xrange(1,len(feature_list)+1):
+            if(x==1):
+                if(term['not_null'](data_set[x])==True):
+                    data_value_dic[0] = binascii.crc32(data_set[x])
+                else:
+                    data_value_dic[0] = 0
+            elif(x==2):
+                if(term['not_null'](data_set[x])==True):
+                    data_value_dic[1] = binascii.crc32(data_set[x])
+                else:
+                    data_value_dic[1] = 0
+            elif(x==3):
+                data_value_dic[2] = long(data_set[x])
+            elif(x==4):
+                data_value_dic[3] = long(data_set[x])
+            elif(x==5):
+                if(term['not_null'](data_set[x])==True):
+                    data_value_dic[4] = binascii.crc32(data_set[x])
+                else:
+                    data_value_dic[4] = 0
+            elif(x==6):
+                if(term['not_null'](data_set[x])==True):
+                    data_value_dic[5] = binascii.crc32(data_set[x])
+                else:
+                    data_value_dic[5] = 0
+        return data_value_dic
+
+config = ConfigParser.RawConfigParser()
+config.read("cal_information.conf")
+
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+feature_list = [i for i in config.get("feature","feature_name").split(",")]
+
+i=0
+with open(raw_file_address,'r') as infile:
+    for line in infile:
+        i+=1
+        if(i%10000==0):
+            print i
+        if(i==50000):
+            break
+        line_split = re.split(";",line)
+        data_value_temp = data_value.get_data_values(line_split[5])
+        data_value_temp.extend([binascii.crc32(line_split[j]) for j in range(6,19)])
+        data_value_temp.append(binascii.crc32(line_split[0]))
+        if(i==1):
+            a=numpy.array(data_value_temp)
+        else:
+            a=numpy.row_stack((a,numpy.array(data_value_temp)))
+
+for i in range(20):
+    if(i==0):
+        print "URL:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==1):
+        print "ServerIP:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==2):
+        print "MediaType:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==3):
+        print "MediaLen:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==4):
+        print "Etag:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==5):
+        print "LastModify:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==6):
+        print "td_0k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==7):
+        print "td_data_md5_1k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==8):
+        print "td_1k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==9):
+        print "td_data_md5_2k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==10):
+        print "td_2k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==11):
+        print "td_data_md5_4k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==12):
+        print "td_4k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==13):
+        print "td_data_md5_8k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==14):
+        print "td_8k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==15):
+        print "td_data_md5_16k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==16):
+        print "td_16k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==17):
+        print "td_data_md5_32k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==18):
+        print "td_32k:"+str(calculation.cal_ent(a[:,i]))
+    elif(i==19):
+        print "id:"+str(calculation.cal_ent(a[:,i]))
+
diff --git a/src/dataset_build/dataset_build.conf b/src/dataset_build/dataset_build.conf
new file mode 100644
index 0000000..400e160
--- /dev/null
+++ b/src/dataset_build/dataset_build.conf
@@ -0,0 +1,8 @@
+[file]
+raw_file_address = ../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level
+ripe_file_address = ../../data/td_data_set/td_data_20171207/td_dataset
+base_sfh_sets = ../../data/td_data_set/td_data_20171207/base_sfh_set
+[output]
+breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
+[feature]
+feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
```
```diff
diff --git a/src/dataset_build/dataset_build.py b/src/dataset_build/dataset_build.py
new file mode 100644
index 0000000..a832072
--- /dev/null
+++ b/src/dataset_build/dataset_build.py
@@ -0,0 +1,144 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+import json
+import datetime
+import time
+
+term = {'td_len':(lambda x : len(x)==32),
+        'data_num':(lambda x : len(x)==21),
+        'url':(lambda x : x.find('NUll')),
+        'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
+        'not_null':(lambda x : len(x)!=0),
+        'ysp_len':(lambda x : int(x)!=0),
+        'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
+
+config = ConfigParser.RawConfigParser()
+config.read("dataset_build.conf")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+base_sfh_sets = config.get("file","base_sfh_sets")
+breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+feature_list = [i for i in config.get("feature","feature_name").split(",")]
+ll=ctypes.cdll.LoadLibrary
+lib = ll("libmaatframe.so")
+lost = dict()
+
+
+class data_value(object):
+
+    @staticmethod
+    def get_feature(data):
+        return_data=list()
+        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data[5])
+        for x in xrange(1,21):
+            if(x==1):
+                if(term['not_null'](data_set[6])):
+                    try:
+                        time1=datetime.datetime.strptime(data[1],'%Y-%m-%d %H:%M:%S')+datetime.timedelta(hours=int(8))
+                        data_set[6]=data_set[6][0:25]
+                        time2=datetime.datetime.strptime(data_set[6],'%a, %d %b %Y %H:%M:%S')
+                    except Exception, e:
+                        return_data.append(-1)
+                        return_data.append(-1)
+                        return_data.append(-1)
+                        return_data.append(-1)
+                    else:
+                        return_data.append(str((time1-time2).seconds))
+                        return_data.append(((time1-time2).seconds)/60)
+                        return_data.append(((time1-time2).seconds)/3600)
+                        return_data.append((time1-time2).days)
+                else:
+                    return_data.append(-1)
+                    return_data.append(-1)
+                    return_data.append(-1)
+                    return_data.append(-1)
+            elif(x==2):
+                continue
+            elif(x==3):
+                continue
+            elif(x==4):
+                return_data.append(long(data[4]))
+            elif(x==5):
+                if(term['not_null'](data_set[1])):
+                    return_data.append(len(data_set[1]))
+                else:
+                    return_data.append(-1)
+                if(term['not_null'](data_set[2])):
+                    ip_set=re.split(r'\.',data_set[2])
+                    return_data.append(ip_set[0])
+                    return_data.append(ip_set[1])
+                    return_data.append(ip_set[2])
+                    return_data.append(ip_set[3])
+                else:
+                    return_data.append(-1)
+                    return_data.append(-1)
+                    return_data.append(-1)
+                    return_data.append(-1)
+                if(term['not_null'](data_set[3])):
+                    return_data.append(int(data_set[3]))
+                else:
+                    return_data.append(-1)
+                if(term['not_null'](data_set[5])):
+                    return_data.append(binascii.crc32(data_set[5]))
+                else:
+                    return_data.append(-1)
+                if(term['not_null'](data_set[6])):
+                    return_data.append(binascii.crc32(data_set[6]))
+                else:
+                    return_data.append(-1)
+            elif(x==7):
+                return_data.append(binascii.crc32(data[7]))
+            elif(x==9):
+                return_data.append(binascii.crc32(data[9]))
+            elif(x==11):
+                return_data.append(binascii.crc32(data[11]))
+            elif(x==13):
+                return_data.append(binascii.crc32(data[13]))
+            elif(x==15):
+                return_data.append(binascii.crc32(data[15]))
+            elif(x==17):
+                return_data.append(binascii.crc32(data[17]))
+        return return_data
+        # data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+        # #data_set[0]=null,data_set[1]=url
+        # data_value_dic = dict()
+        # for x in xrange(1,len(feature_list)+1):
+        #     if(x==1):
+        #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+        #     elif(x==2):
+        #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+        #     elif(x==3):
+        #         data_value_dic[feature_list[x-1]] = data_set[x]
+        #     elif(x==4):
+        #         data_value_dic[feature_list[x-1]] = data_set[x]
+        #     elif(x==5):
+        #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+        #     elif(x==6):
+        #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+        # return data_value_dic
+
+
+i=0
+sfh_set = list()
+with open(raw_file_address,'r') as infile:
+    with open(ripe_file_address,'w') as outfile:
+        for line in infile:
+            i+=1
+            if(i%10000==0):
+                print i
+            line_return = re.split(r';',line)
+            # if(int(line_return[0])==0):
+            #     print 'td is right'
+            outfile.write(str(line_return[0])+',')
+            return_data=data_value.get_feature(line_return)
+            for x in range(19):
+                if(x==18):
+                    outfile.write(str(return_data[18])+'\n')
+                else:
+                    outfile.write(str(return_data[x])+',')
diff --git a/src/dataset_build/feature_statistics.conf b/src/dataset_build/feature_statistics.conf
new file mode 100644
index 0000000..12cf089
--- /dev/null
+++ b/src/dataset_build/feature_statistics.conf
@@ -0,0 +1,8 @@
+[file]
+raw_file_address = ../../data/td_data_set/td_data_20171207/td.txt
+ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
+[output]
+breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
+[feature]
+type = data_value_statistics
+feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
```
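The first feature block in `dataset_build.py` is an age signal: the capture timestamp (shifted to UTC+8) minus the HTTP `LastModify` value, emitted as seconds, minutes, hours, and days, with -1 for missing or unparseable headers. Note that the script uses `timedelta.seconds` (the sub-day remainder), not total seconds. A Python 3 sketch of the same computation (the two timestamp formats are the ones in the script; the sample record is invented):

```python
from datetime import datetime, timedelta

def age_features(capture, last_modify):
    """(seconds-remainder, minutes, hours, days) between capture time
    and Last-Modified, as in data_value.get_feature; -1s on failure."""
    try:
        t1 = datetime.strptime(capture, '%Y-%m-%d %H:%M:%S') + timedelta(hours=8)
        t2 = datetime.strptime(last_modify[:25], '%a, %d %b %Y %H:%M:%S')
    except ValueError:
        return [-1, -1, -1, -1]
    d = t1 - t2
    return [d.seconds, d.seconds // 60, d.seconds // 3600, d.days]

# Invented record: captured 2017-12-07 10:00 UTC, modified exactly a day before.
print(age_features('2017-12-07 10:00:00', 'Wed, 06 Dec 2017 18:00:00 GMT'))
# [0, 0, 0, 1] -- one full day, zero seconds left over
```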
```diff
diff --git a/src/dataset_build/feature_statistics.py b/src/dataset_build/feature_statistics.py
new file mode 100644
index 0000000..52ae8e0
--- /dev/null
+++ b/src/dataset_build/feature_statistics.py
@@ -0,0 +1,164 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+
+term = {'td_len':(lambda x : len(x)==32),
+        'data_num':(lambda x : len(x)==4),
+        'url':(lambda x : x.find('NUll')),
+        'sfh_len':(lambda x : len(x)>20),
+        'not_null':(lambda x : len(x)!=0)}
+
+class data_line(object):
+    """docstring for data_line"""
+    def __init__(self):
+        super(data_line, self).__init__()
+
+    @staticmethod
+    def if_error(data_line_str):
+        data_line_val = re.split(r';',data_line_str)
+        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+        if(term['data_num'](data_line_val) and term['sfh_len'](data_line_val[19]) and term['td_len'](data_line_val[9])\
+           and term['td_len'](data_line_val[2]) and term['td_len'](data_line_val[13]) and term['td_len'](data_line_val[15])\
+           and term['td_len'](data_line_val[17]) and term['not_null'](data_line_val[18]) and term['not_null'](data_line_val[19])\
+           and hashed_len/float(data_line_val[3])>0.8):
+            return data_line_val
+        else:
+            return -1
+
+
+class feature_statistics(object):
+    """YSP feature_statistics"""
+    def __init__(self):
+        super(feature_statistics, self).__init__()
+        self.meida_len_statistics_set = [0,0,0,0,0,0,0]
+        self.lost_dict = dict()
+
+    def meida_len_statistics(self,meida_len):
+        j = bisect.bisect(breakpoints,meida_len)
+        self.meida_len_statistics_set[j-1]+=1
+
+    def data_value_statistics(self,data_value_dic,data_value):
+        data_value_str = str()
+        for x in xrange(0,len(feature_list)):
+            data_value_str = data_value_str+str(data_value_dic[feature_list[x]])+','
+
+        if(self.lost_dict.has_key(data_value_str)==False):
+            self.lost_dict[data_value_str]=[0,1,0.]
+        else:
+            if(int(data_value)==1):
+                self.lost_dict[data_value_str][0] += 1
+                self.lost_dict[data_value_str][1] += 1
+            else:
+                self.lost_dict[data_value_str][1] += 1
+
+
+class sfh_fingerprint(object):
+
+    def __init__(self,sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        p = r"\[+\d+?:+\d+?\]"
+        pattern = re.compile(p)
+        hashed_len_set = pattern.findall(sfh)
+        if(term['not_null'](hashed_len_set)):
+            hashed_len = 0
+            for x in xrange(0,len(hashed_len_set)):
+                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+            return hashed_len/len(hashed_len_set)
+        else:
+            return -1
+
+    @staticmethod
+    def get_base_sfh(data_set):
+        base_sfh = list()
+        for x in xrange(0,10):
+            base_sfh.append(data_set[x])
+        return base_sfh
+
+
+
+
+class data_value(object):
+
+    @staticmethod
+    def get_data_values(data):
+        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+        #data_set[0]=null,data_set[1]=url
+        data_value_dic = dict()
+        for x in xrange(1,len(feature_list)+1):
+            if(x==1):
+                data_value_dic[feature_list[x-1]] = 0 if(term['not_null'](data_set[x])==False) else 1
+            elif(x==2):
+                data_value_dic[feature_list[x-1]] = 0 if(term['not_null'](data_set[x])==False) else 1
+            elif(x==3):
+                data_value_dic[feature_list[x-1]] = data_set[x]
+            elif(x==4):
+                data_value_dic[feature_list[x-1]] = bisect.bisect(breakpoints,int(data_set[x]))
+            elif(x==5):
+                data_value_dic[feature_list[x-1]] = 0 if(term['not_null'](data_set[x])==False) else 1
+            elif(x==6):
+                data_value_dic[feature_list[x-1]] = 0 if(term['not_null'](data_set[x])==False) else 1
+        return data_value_dic
+
+config = ConfigParser.RawConfigParser()
+config.read("feature_statistics.conf")
+
+feature_statistics_type = config.get("feature","type")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+
+if(feature_statistics_type=="meida_len_statistics"):
+    breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+elif(feature_statistics_type=="data_value_statistics"):
+    feature_list = [i for i in config.get("feature","feature_name").split(",")]
+# ll=ctypes.cdll.LoadLibrary
+# lib = ll("libmaatframe.so")
+
+i=0
+sfh_set = list()
+statistic = feature_statistics()
+with open(raw_file_address,'r') as infile:
+    for line in infile:
+        i+=1
+
+
+
+
+        line_return = data_line.if_error(line)
+        if(line_return != -1):
+            if(feature_statistics_type=="meida_len_statistics"):
+                statistic.meida_len_statistics(line_return[3])
+            elif(feature_statistics_type=="data_value_statistics"):
+                lost_list = list()
+                statistic.meida_len_statistics(line_return)
+for i in statistic.lost_dict:
+    (statistic.lost_dict[i])[2] = float((statistic.lost_dict[i])[0])/(statistic.lost_dict[i])[1]
+    tmp = (i,int((statistic.lost_dict[i])[0]),int((statistic.lost_dict[i])[1]),float((statistic.lost_dict[i])[2]))
+    lost_list.append(tmp)
+print sorted(lost_list,cmp=lambda x,y:cmp(x[2],y[2]))
+# if(x == len(feature_list)-1):
+#     outfile.write(data_value_dic[feature_list[x]]+'\n')
+# else:
+#     print lost
+#     outfile.write(str(data_value_dic[feature_list[x]])+',')
+# outfile.write(result[3])
+# sfh_dot=list()
+# for x in xrange(0,10):
+#     #transform sfh to dot
+#     sfh_dot.append(lib.GIE_sfh_similiarity(result[19],len(result[19]),sfh_set[x],len(sfh_set[x])))
+# if(len(data_set)==7):
+#     outfile.write(str(data_set[0])+','+str(data_set[1])+','+str(data_set[2])\
+#     +','+str(data_set[3])+','+str(data_set[4])+','+str(data_set[5])+','+result[5]\
+#     +','+result[7]+','+result[9]+','+result[11]+','+result[13]+','+result[15]+result[17]\
+#     +','+result[19]+'\n')
+
+# with open(ripe_file_address,'w') as outfile:
+#     outfile.write(str(lost))
diff --git a/src/dataset_build/file_digest.py b/src/dataset_build/file_digest.py
new file mode 100644
index 0000000..590e059
--- /dev/null
+++ b/src/dataset_build/file_digest.py
@@ -0,0 +1,96 @@
+#-*-coding:utf-8-*-
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
+class data_line(object):
+    """docstring for data_line"""
+    def __init__(self):
+        super(data_line, self).__init__()
+
+    @staticmethod
+    def if_error(data_line_str):
+        data_line_val = re.split(r';',data_line_str)
+        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+        if(term['data_num'](data_line_val) and \
+           term['not_null'](data_line_val[0]) and \
+           term['ysp_len'](data_line_val[3]) and \
+           term['not_null'](data_line_val[4]) and \
+           term['td_len'](data_line_val[6]) and \
+           term['td_len'](data_line_val[8]) and \
+           term['td_len'](data_line_val[10]) and \
+           term['td_len'](data_line_val[12]) and \
+           term['td_len'](data_line_val[14]) and \
+           term['td_len'](data_line_val[16]) and \
+           term['not_null'](data_line_val[18]) and \
+           term['sfh_len'](data_line_val[19]) and \
+           term['not_null'](data_line_val[20]) and \
+           hashed_len/float(data_line_val[3])>=0.8):
+            return data_line_val
+        else:
+            return -1
+
+class TD_fingerprint(object):
+    def __init__(self,td,td_string):
+        self.td = td
+        self.td_string = td_string
+    @staticmethod
+    def td_generate(td_string):
+        td_val = hashlib.md5(td_string.encode('utf-8')).hexdigest()
+
+class sfh_fingerprint(object):
+
+    def __init__(self,sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        p = r"\[+\d+?:+\d+?\]"
+        pattern = re.compile(p)
+        hashed_len_set = pattern.findall(sfh)
+        if(term['not_null'](hashed_len_set)):
+            hashed_len = 0
+            for x in xrange(0,len(hashed_len_set)):
+                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+            return hashed_len/len(hashed_len_set)
+        else:
+            return -1
+
+term = {'td_len':(lambda x : len(x)==32),
+        'data_num':(lambda x : len(x)==21),
+        'url':(lambda x : x.find('NUll')),
+        'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
+        'not_null':(lambda x : len(x)!=0),
+        'ysp_len':(lambda x : int(x)!=0),
+        'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
+
+grain="./get_lost"
+ripe_files=[]
+config = ConfigParser.RawConfigParser()
+config.read("grain.conf")
+raw_file_address=config.get("file","raw_file_address")
+ripe_files_address=config.get("file","ripe_files_address")
+print ("%s %s" %(raw_file_address,ripe_files_address))
+num = [0,0,0,0,0,0,0]
+breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+# i=0
+# for i in xrange(0,ripe_file_num):
+#     outfile=open(ripe_files_address+str(i)+'.txt','w')
+#     ripe_files.append(outfile)
+
+i=0
+with open(raw_file_address,'r') as infile:
+# with open('./ripe_data/mistake_td_sfh1_sfh2_sim_rate_len_url_unequal','r') as infile:
+    with open(ripe_files_address,'w') as outfile:
+        for line in infile:
+            i+=1
+            if(i%10000==0):
+                print i
+            line_return = data_line.if_error(line)
+            if(line_return != -1):
+                outfile.write(str(line))
\ No newline at end of file
```
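`feature_statistics.py` discretizes `MediaLen` by binary-searching the `breakpoints` list from the `.conf` with `bisect.bisect`, so each record lands in one interval (below 1 MiB, one of the interior buckets, or past the last breakpoint). A short illustration with the breakpoints from `dataset_build.conf`:

```python
import bisect

# Breakpoints as listed in dataset_build.conf (bytes, 1 MiB .. 4 MiB).
breakpoints = [1048576, 1310720, 1572864, 1835008, 2097152, 3145728, 4194304]

def media_len_bucket(media_len):
    """Interval index: 0 = below the first breakpoint,
    len(breakpoints) = above the last one."""
    return bisect.bisect(breakpoints, media_len)

print(media_len_bucket(2000000))   # 4: between 1835008 and 2097152
print(media_len_bucket(5000000))   # 7: past the last breakpoint
```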
```diff
diff --git a/src/dataset_build/get_lost.c b/src/dataset_build/get_lost.c
new file mode 100644
index 0000000..0e6c452
--- /dev/null
+++ b/src/dataset_build/get_lost.c
@@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define HTABLE_SIZE 8*64*1024*1024
+#define SFH_PASS_RATE 0.8
+#define SIMILIAR 80
+
+typedef struct td
+{
+    char * tdstr;
+    unsigned int lost;
+}td;
+
+typedef struct file_sfh_data
+{
+    long id;
+    char * sfh;
+    td * td_value;
+    char * td_ori;
+}file_sfh_data;
+
+int main(int argc,char *argv[])
+{
+    FILE *fpread;//read file handle
+    FILE *fpwrite;//write file handle
+    int array_size = 1024;
+    file_sfh_data **file_data=(file_sfh_data **)malloc(sizeof(file_sfh_data *)*array_size);
+    char* dirstr = "../../data/td_data_set/td_data_20171207/td_sfh_lost";
+    //char* dirstr = *++argv;
+    char* writestr = "../../data/td_data_set/td_data_20171207/td.txt";
+    int total_len = 0;
+    char TD_tmp[256], SFH_tmp[1024*300], TD_ORI[1024*10];
+    char buffer[1024*300+1];
+    int ret = 0;
+    int line = 0;
+    int thread_safe = 0;
+    int i;
+    long id;
+    int similiarity;
+    MESA_htable_handle htable = NULL;
+    fpread=fopen(dirstr,"rb");
+    fpwrite=fopen(writestr,"w");
+    printf("file str is %s\n",dirstr);
+    if(fpread==NULL)
+    {
+        printf("open file error\n");
+        return -1;
+    }
+    buffer[sizeof(buffer)-1]='\0';
+    while(feof(fpread)==0)
+    {
+        fgets(buffer,sizeof(buffer)-1,fpread);
+        ret=sscanf(buffer,"%d;%[^;];%[^;];%s",&total_len,TD_ORI,TD_tmp,SFH_tmp);
+        if(ret!=4)
+        {
+            continue;
+        }
+        file_data[line]=(file_sfh_data*)calloc(1,sizeof(file_sfh_data));
+        file_data[line]->id=line;
+        file_data[line]->sfh=strdup(SFH_tmp);
+        file_data[line]->td_value=(td*)calloc(1,sizeof(td));
+        file_data[line]->td_value->tdstr=strdup(TD_tmp);
+        file_data[line]->td_value->lost=0;
+        file_data[line]->td_ori=strdup(TD_ORI);
+        line++;
+        if(line==array_size)
+        {
+            array_size*=2;
+            file_data=realloc(file_data,sizeof(file_sfh_data *)*array_size);
+        }
+    }
+    printf("read file success!\n");
+    htable = NULL;
+    htable=MESA_htable_born();
+    thread_safe = 0;
+    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
+    unsigned int slot_size=1024*1024*16;
+    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(slot_size));
+    MESA_htable_mature(htable);
+    for(i=0;i<line;i++)
+    {
+        if(MESA_htable_add(htable,(char*)(file_data[i]->td_value->tdstr),32,(void *)file_data[i]->id)<0)
+        {
+            id=(long)MESA_htable_search(htable,(char*)file_data[i]->td_value->tdstr,32);
+            similiarity=GIE_sfh_similiarity(file_data[id]->sfh,(int)strlen(file_data[id]->sfh),file_data[i]->sfh,(int)strlen(file_data[i]->sfh));
+            if(similiarity<SIMILIAR)
+            {
+                file_data[id]->td_value->lost = 1;
+                file_data[i]->td_value->lost = 1;
+            }
+        }
+    }
+    for(i=0;i<line;i++)
+    {
+        fprintf(fpwrite,"%s;%s;%s;%d\n",file_data[i]->td_value->tdstr,file_data[i]->sfh,file_data[i]->td_ori,file_data[i]->td_value->lost);
+    }
+    for(i=0;i<line;i++)
+    {
+        free(file_data[i]->sfh);
+        file_data[i]->sfh=NULL;
+        free(file_data[i]->td_value->tdstr);
+        file_data[i]->td_value->tdstr=NULL;
+        free(file_data[i]->td_value);
+        file_data[i]->td_value=NULL;
+        free(file_data[i]->td_ori);
+        file_data[i]->td_ori=NULL;
+        free(file_data[i]);
+        file_data[i]=NULL;
+    }
+    fclose(fpread);
+    fclose(fpwrite);
+    return 0;
+}
\ No newline at end of file
```
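`get_lost.c` detects digest collisions: records are inserted into a MESA hash table keyed by the 32-character TD digest, and when a key is already present the two SFH fingerprints are compared with `GIE_sfh_similiarity`; a score below 80 flags both records as `lost`. The same pass sketched in Python 3, with the proprietary maatframe similarity call stubbed out (the stub is illustrative only, not the real metric):

```python
SIMILAR = 80  # the SIMILIAR threshold from get_lost.c

def sfh_similarity(a, b):
    """Placeholder for maatframe's GIE_sfh_similiarity (0..100).
    Naive shared-character ratio, purely for illustration."""
    union = set(a) | set(b)
    return 100 * len(set(a) & set(b)) // max(len(union), 1)

def flag_lost(records):
    """records: list of (td_digest, sfh). Returns a lost flag per record,
    mirroring the hash-table collision pass in get_lost.c."""
    lost = [0] * len(records)
    first_seen = {}
    for i, (td, sfh) in enumerate(records):
        j = first_seen.setdefault(td, i)
        if j != i and sfh_similarity(records[j][1], sfh) < SIMILAR:
            lost[i] = lost[j] = 1
    return lost

print(flag_lost([("td1", "abcdef"), ("td1", "uvwxyz"), ("td2", "abcdef")]))
# [1, 1, 0]: same digest, dissimilar content -> both flagged
```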
```diff
diff --git a/src/dataset_build/grain.conf b/src/dataset_build/grain.conf
new file mode 100644
index 0000000..944b337
--- /dev/null
+++ b/src/dataset_build/grain.conf
@@ -0,0 +1,5 @@
+[file]
+ripe_files_address = ../../data/td_data_set/td_data_20171207/get_lost_raw_data
+raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest
+[output]
+breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
\ No newline at end of file
```
```diff
diff --git a/src/dataset_build/td_classification.py b/src/dataset_build/td_classification.py
new file mode 100644
index 0000000..8d4b97c
--- /dev/null
+++ b/src/dataset_build/td_classification.py
@@ -0,0 +1,5 @@
+from sklearn.datasets import load_iris
+from sklearn import tree
+
+with open() as infile:
+
\ No newline at end of file
```
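`td_classification.py` is only a stub so far: it imports `load_iris` and `sklearn.tree`, and the `open()` call is left without arguments. For reference, a minimal working use of exactly those two imports (the real script would presumably read the dataset written by `dataset_build.py` rather than the iris toy data):

```python
from sklearn.datasets import load_iris
from sklearn import tree

# Fit a decision tree on the iris toy set -- the smallest complete
# example of the two imports the stub already has.
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf.fit(iris.data, iris.target)
print(clf.predict(iris.data[:1]))  # the first sample classifies as class 0
```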
```diff
diff --git a/src/dataset_build/vedio_id_build.c b/src/dataset_build/vedio_id_build.c
new file mode 100644
index 0000000..9faaa64
--- /dev/null
+++ b/src/dataset_build/vedio_id_build.c
@@ -0,0 +1,171 @@
+/*
+gcc -g vedio_id_build.c -o vedio_id_build -lmaatframe -I../../inc
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define BUFFER_LEN (10*1024)
+#define SFH_PASS_RATE 0.9
+#define SFH_LEN (10*1024)
+#define URL_LEN (10*1024)
+
+typedef struct video_id
+{
+    long id;
+    char *sfh;
+}video_id;
+
+typedef struct cache
+{
+    GIE_digest_t ** GIE_cache;
+    long cache_size;
+    long len;
+}cache;
+
+long get_hashed_len(const char* sfh)
+{
+    char *data=(char*)malloc(strlen(sfh)+1);
+    memcpy(data,sfh,strlen(sfh));
+    data[strlen(sfh)]='\0';
+    char *token=NULL,*sub_token=NULL,*saveptr;
+    long left_offset=0,right_offset=0,hashed_length=0;
+    int ret=0,first=0;
+    for (token = data; ;token = NULL)
+    {
+        sub_token = strtok_r(token,"[", &saveptr);
+        if (sub_token == NULL)
+        {
+            break;
+        }
+        if(first==0)//jump over the first sub string.
+        {
+            first=1;
+            continue;
+        }
+        ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset);
+        if(ret!=2)
+        {
+            return 0;
+        }
+        assert(ret==2);
+        hashed_length+=right_offset-left_offset+1;
+    }
+    //printf("hashed length=%ld\n",hashed_length);
+    free(data);
+    return hashed_length/2;
+}
+
+int main(int argc,char *argv[])
+{
+    FILE *video_id_sets_file;
+    FILE *new_sfh_file;
+    const char *video_id_sets_file_dir="../../data/td_data_set/td_data_20171207/video_id_raw_data";
+    const char *new_sfh_file_dir="../../data/ripe_data/td_data_20171207/video_id.txt";
+    char *buffer=NULL;
+    int ret = 0,hashed_len = 0,total_len = 0,resultnum = 0,i = 0;
+    int update = 0,video_id = 0,j = 0;
+    int* temp_int = NULL;
+    float temp_sfh_pass = 0;
+    char *sfh_str,*url_str;
+    GIE_digest_t *sfh_video_id = NULL;
+    GIE_result_t *query_result = NULL;
+    cache *GIE_digest_cache = NULL;
+    video_id_sets_file = fopen(video_id_sets_file_dir,"r+");
+    new_sfh_file = fopen(new_sfh_file_dir,"w");
+    if(video_id_sets_file == NULL)
+    {
+        printf("open video_id_sets_file error\n");
+        return -1;
+    }
+    if(new_sfh_file == NULL)
+    {
+        printf("open new_sfh_file error\n");
+        return -1;
+    }
+    buffer = (char*)calloc(BUFFER_LEN,sizeof(char));
+    GIE_create_para_t *query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t));
+    query_result = (GIE_result_t*)calloc(1,sizeof(GIE_result_t));
+    GIE_handle_t *query_handle;
+    query_para->gram_value = 7;
+    query_para->position_accuracy = 5;
+    query_handle=GIE_create((const GIE_create_para_t *)query_para);
+    free(query_para);
+    if(query_handle==NULL)
+    {
+        printf("create GIE handle error\n");
+        return -1;
+    }
+    sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
+    sfh_str = (char*)calloc(SFH_LEN,sizeof(char));
+    url_str = (char*)calloc(URL_LEN,sizeof(char));
+    i=0;
+    GIE_digest_cache = (cache*)calloc(1,sizeof(cache));
+    GIE_digest_cache->cache_size = 1000;
+    GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*));
+    GIE_digest_cache->len = 0;
+    while(feof(video_id_sets_file)==0)
+    {
+        i++;
+        if(i%10000==0)
+        {
+            printf("%d\n",i);
+        }
+        fgets(buffer,BUFFER_LEN-1,video_id_sets_file);
+        ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%d;%*[^;];%*[^;];%*[^;];\
+%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
+%*[^;];%*[^;];%*[^;];%[^;];%[^;]",&total_len,sfh_str,url_str);
+        if(ret!=3)
+        {
+            continue;
+        }
+        hashed_len = get_hashed_len((const char*)sfh_str);
+        temp_sfh_pass = (float)hashed_len/total_len;
+        if(temp_sfh_pass<SFH_PASS_RATE)
+        {
+            continue;
+        }
+        resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,1);
+        if(resultnum == 0)
+        {
+            temp_int=(int*)calloc(1,sizeof(int));
+            *temp_int=i;
+            sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
+            sfh_video_id->id=i;
+            sfh_video_id->sfh_length=strlen(sfh_str);
+            sfh_video_id->operation=GIE_INSERT_OPT;
+            sfh_video_id->cfds_lvl=5;
+            sfh_video_id->sfh=strdup(sfh_str);
+            sfh_video_id->tag=temp_int;
+            GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_video_id;
+            GIE_digest_cache->len++;
+            if(GIE_digest_cache->len==GIE_digest_cache->cache_size)
+            {
+                update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size);
+                GIE_digest_cache->len=0;
+                for(j=0;j<GIE_digest_cache->cache_size;j++)
+                {
+                    free(GIE_digest_cache->GIE_cache[j]->sfh);
+                    GIE_digest_cache->GIE_cache[j]->sfh=NULL;
+                    free(GIE_digest_cache->GIE_cache[j]);
+                    GIE_digest_cache->GIE_cache[j]=NULL;
+                }
+            }
+            fprintf(new_sfh_file,"%d,%s",i,buffer);
+        }
+        else
+        {
+            fprintf(new_sfh_file,"%d,%s",*((int*)query_result->tag),buffer);
+        }
+    }
+    free(buffer);
+    free(query_result);
+    free(sfh_video_id);
+    free(url_str);
+    free(sfh_str);
+    free(GIE_digest_cache);
+    return 0;
+}
\ No newline at end of file
```
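`vedio_id_build.c` clusters records into videos by fingerprint: each SFH that passes the 0.9 coverage gate is queried against a gram-index engine (`gram_value = 7`, `position_accuracy = 5`); on a miss the fingerprint is inserted (batched 1000 digests per `GIE_update`) under a fresh id, and on a hit the id stored in the matched digest's tag is reused. A Python 3 sketch of that control flow, with the GIE engine replaced by a hypothetical exact-match index (the real engine does similarity search, which this stand-in does not):

```python
class NaiveSfhIndex:
    """Hypothetical stand-in for the GIE gram-index engine:
    exact-match lookup, no batching, no similarity search."""
    def __init__(self):
        self._ids = {}
    def query(self, sfh):
        return self._ids.get(sfh)      # assigned video id, or None
    def insert(self, sfh, vid):
        self._ids[sfh] = vid

def assign_video_ids(records):
    """records: iterable of (line_no, sfh). Yields (video_id, line_no),
    mirroring the query-then-insert loop of vedio_id_build.c, where a
    new fingerprint takes its own line number as the video id."""
    index = NaiveSfhIndex()
    for line_no, sfh in records:
        vid = index.query(sfh)
        if vid is None:
            index.insert(sfh, line_no)
            vid = line_no
        yield vid, line_no

print(list(assign_video_ids([(1, "sfhA"), (2, "sfhB"), (3, "sfhA")])))
# [(1, 1), (2, 2), (1, 3)]: line 3 reuses video id 1
```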
