| field | value | detail |
|---|---|---|
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 | /dataset_build |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'dataset_build')
| mode | file | deletions |
|---|---|---|
| -rw-r--r-- | dataset_build/CMakeLists.txt | 11 |
| -rw-r--r-- | dataset_build/based_sfh.conf | 3 |
| -rw-r--r-- | dataset_build/based_sfh.py | 44 |
| -rw-r--r-- | dataset_build/cal_information.conf | 5 |
| -rw-r--r-- | dataset_build/cal_information.py | 133 |
| -rw-r--r-- | dataset_build/dataset_build.conf | 8 |
| -rw-r--r-- | dataset_build/dataset_build.py | 144 |
| -rw-r--r-- | dataset_build/feature_statistics.conf | 8 |
| -rw-r--r-- | dataset_build/feature_statistics.py | 164 |
| -rw-r--r-- | dataset_build/file_digest.py | 96 |
| -rw-r--r-- | dataset_build/get_lost.c | 116 |
| -rw-r--r-- | dataset_build/grain.conf | 5 |
| -rw-r--r-- | dataset_build/td_classification.py | 5 |
| -rw-r--r-- | dataset_build/vedio_id_build.c | 171 |
14 files changed, 0 insertions, 913 deletions
diff --git a/dataset_build/CMakeLists.txt b/dataset_build/CMakeLists.txt
deleted file mode 100644
index 8840a74..0000000
--- a/dataset_build/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PROJECT (CALCULATE)
-SET (SRC_LIST get_lost.c)
-SET(CMAKE_BUILD_TYPE "Debug")
-SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g -ggdb")
-SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
-MESSAGE(STATUS "This is BINARY dir" ${CALCULATE_BINARY_DIR})
-MESSAGE(STATUS "This is SOURCE dir" ${CALCULATE_SOURCE_DIR})
-#INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../include/)
-#LINK_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../lib/)
-ADD_EXECUTABLE(get_lost ${SRC_LIST})
-TARGET_LINK_LIBRARIES(get_lost maatframe libMESA_htable.so pthread m)
diff --git a/dataset_build/based_sfh.conf b/dataset_build/based_sfh.conf
deleted file mode 100644
index cdcf4cf..0000000
--- a/dataset_build/based_sfh.conf
+++ /dev/null
@@ -1,3 +0,0 @@
-[file]
-raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest_nots
-ripe_file_address = ../../data/td_data_set/td_data_20171207/base_sfh_set
\ No newline at end of file
diff --git a/dataset_build/based_sfh.py b/dataset_build/based_sfh.py
deleted file mode 100644
index b3281ce..0000000
--- a/dataset_build/based_sfh.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import re
-import ConfigParser
-import bisect
-import random
-
-term = {'not_null':(lambda x : len(x)!=0)}
-
-config = ConfigParser.RawConfigParser()
-config.read("based_sfh.conf")
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-
-class sfh_fingerprint(object):
-
-    def __init__(self,sfh):
-        self.sfh = sfh
-
-    @staticmethod
-    def get_hashed_len(sfh):
-        p = r"\[+\d+?:+\d+?\]"
-        pattern = re.compile(p)
-        hashed_len_set = pattern.findall(sfh)
-        if (term['not_null'](hashed_len_set)):
-            hashed_len = 0
-            for x in xrange(0,len(hashed_len_set)):
-                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
-                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
-            return hashed_len/len(hashed_len_set)
-        else :
-            return -1
-
-i=0
-sfh_set = list()
-with open(raw_file_address,'r') as infile:
-    with open(ripe_file_address,'w') as outfile:
-        for line in infile:
-            i+=1
-            if(i%100000==0):
-                print i
-            result = re.split(r';',line)
-            if(term['not_null'](result[3]) and term['not_null'](result[19])):
-                hashed_len = sfh_fingerprint.get_hashed_len(result[19])
-                if(hashed_len/int(result[3])>0.8):
-                    outfile.write(result[19]+'\n')
\ No newline at end of file
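Review note on based_sfh.py (above): `get_hashed_len` averages the lengths of the `[start:end]` spans embedded in an SFH string, and the caller then compares the result against `MediaLen` with Python 2 integer division, so `hashed_len/int(result[3])>0.8` can only pass when the quotient is at least 1 — the intended 80% threshold silently became 100%. A minimal Python 3 sketch of what the coverage gate presumably intended; `hashed_coverage` and its total-bytes-covered definition are assumptions, not the repository's API:

```python
import re

RANGE_RE = re.compile(r"\[(\d+):(\d+)\]")

def hashed_coverage(sfh: str, media_len: int) -> float:
    """Fraction of the media covered by the SFH's hashed [start:end] ranges."""
    spans = [(int(a), int(b)) for a, b in RANGE_RE.findall(sfh)]
    if not spans or media_len <= 0:
        return -1.0
    covered = sum(b - a for a, b in spans)
    # True division; the deleted code's integer division rounded this to 0.
    return covered / media_len

# keep a record only when at least 80% of the file was hashed, e.g.:
# if hashed_coverage(fields[19], int(fields[3])) > 0.8: ...
```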
diff --git a/dataset_build/cal_information.conf b/dataset_build/cal_information.conf
deleted file mode 100644
index 1571b8b..0000000
--- a/dataset_build/cal_information.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-[file]
-raw_file_address = ../../data/ripe_data/td_data_20171207/video_id.txt
-ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
-[feature]
-feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/dataset_build/cal_information.py b/dataset_build/cal_information.py
deleted file mode 100644
index 19cd95c..0000000
--- a/dataset_build/cal_information.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import re
-import numpy
-import ConfigParser
-import binascii
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==4),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20),
-    'not_null':(lambda x : len(x)!=0)}
-
-class calculation(object):
-    """docstring for calculation"""
-    def __init__(self, arg):
-        super(calculation, self).__init__()
-        self.arg = arg
-
-    @staticmethod
-    def cal_ent(x):
-        x_value_list = set([x[i] for i in range(x.shape[0])])
-        ent = 0.0
-        num_0 = x[x == 0].shape[0]
-        for x_value in x_value_list:
-            if(x_value==0):
-                continue
-            p = float(x[x == x_value].shape[0])/(x.shape[0]- num_0)
-            logp = numpy.log2(p)
-            ent -=p*logp
-        return ent
-
-class data_value(object):
-    """docstring for data_value"""
-    def __init__(self, arg):
-        super(data_value, self).__init__()
-        self.arg = arg
-
-    @staticmethod
-    def get_data_values(data):
-        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
-        #data_set[0]=null,data_set[1]=url
-        data_value_dic = [long(0)]*6
-        for x in xrange(1,len(feature_list)+1):
-            if(x==1):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[0] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[0] = 0
-            elif(x==2):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[1] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[1] = 0
-            elif(x==3):
-                data_value_dic[2] = long(data_set[x])
-            elif(x==4):
-                data_value_dic[3] = long(data_set[x])
-            elif(x==5):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[4] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[4] = 0
-            elif(x==6):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[5] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[5] = 0
-        return data_value_dic
-
-config = ConfigParser.RawConfigParser()
-config.read("cal_information.conf")
-
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-feature_list =[i for i in config.get("feature","feature_name").split(",")]
-
-i=0
-with open(raw_file_address,'r') as infile:
-    for line in infile:
-        i+=1
-        if(i%10000==0):
-            print i
-        if(i==50000):
-            break
-        line_split = re.split(";",line)
-        data_value_temp = data_value.get_data_values(line_split[5])
-        data_value_temp.extend([binascii.crc32(line_split[j]) for j in range(6,19)])
-        data_value_temp.append(binascii.crc32(line_split[0]))
-        if(i==1):
-            a=numpy.array(data_value_temp)
-        else:
-            a=numpy.row_stack((a,numpy.array(data_value_temp)))
-
-for i in range(20):
-    if(i==0):
-        print "URL:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==1):
-        print "ServerIP:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==2):
-        print "MediaType:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==3):
-        print "MediaLen:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==4):
-        print "Etag:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==5):
-        print "LastModify:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==6):
-        print "td_0k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==7):
-        print "td_data_md5_1k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==8):
-        print "td_1k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==9):
-        print "td_data_md5_2k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==10):
-        print "td_2k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==11):
-        print "td_data_md5_4k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==12):
-        print "td_4k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==13):
-        print "td_data_md5_8k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==14):
-        print "td_8k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==15):
-        print "td_data_md5_16k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==16):
-        print "td_16k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==17):
-        print "td_data_md5_32k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==18):
-        print "td_32k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==19):
-        print "id:"+str(calculation.cal_ent(a[:,i]))
-
diff --git a/dataset_build/dataset_build.conf b/dataset_build/dataset_build.conf
deleted file mode 100644
index 400e160..0000000
--- a/dataset_build/dataset_build.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-[file]
-raw_file_address = ../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level
-ripe_file_address = ../../data/td_data_set/td_data_20171207/td_dataset
-base_sfh_sets = ../../data/td_data_set/td_data_20171207/base_sfh_set
-[output]
-breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
-[feature]
-feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/dataset_build/dataset_build.py b/dataset_build/dataset_build.py
deleted file mode 100644
index a832072..0000000
--- a/dataset_build/dataset_build.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import re
-import ConfigParser
-import bisect
-import random
-import ctypes
-import hashlib
-import zlib
-import binascii
-import json
-import datetime
-import time
-
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==21),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
-    'not_null':(lambda x : len(x)!=0),
-    'ysp_len':(lambda x : int(x)!=0),
-    'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
-
-config = ConfigParser.RawConfigParser()
-config.read("dataset_build.conf")
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-base_sfh_sets = config.get("file","base_sfh_sets")
-breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
-feature_list =[i for i in config.get("feature","feature_name").split(",")]
-ll=ctypes.cdll.LoadLibrary
-lib = ll("libmaatframe.so")
-lost = dict()
-
-
-class data_value(object):
-
-    @staticmethod
-    def get_feature(data):
-        return_data=list()
-        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data[5])
-        for x in xrange(1,21):
-            if(x==1):
-                if(term['not_null'](data_set[6])):
-                    try:
-                        time1=datetime.datetime.strptime(data[1],'%Y-%m-%d %H:%M:%S')+datetime.timedelta(hours=int(8))
-                        data_set[6]=data_set[6][0:25]
-                        time2=datetime.datetime.strptime(data_set[6],'%a, %d %b %Y %H:%M:%S')
-                    except Exception, e:
-                        return_data.append(-1)
-                        return_data.append(-1)
-                        return_data.append(-1)
-                        return_data.append(-1)
-                    else:
-                        return_data.append(str((time1-time2).seconds))
-                        return_data.append(((time1-time2).seconds)/60)
-                        return_data.append(((time1-time2).seconds)/3600)
-                        return_data.append((time1-time2).days)
-                else:
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-            elif(x==2):
-                continue
-            elif(x==3):
-                continue
-            elif(x==4):
-                return_data.append(long(data[4]))
-            elif(x==5):
-                if(term['not_null'](data_set[1])):
-                    return_data.append(len(data_set[1]))
-                else:
-                    return_data.append(-1)
-                if(term['not_null'](data_set[2])):
-                    ip_set=re.split(r'\.',data_set[2])
-                    return_data.append(ip_set[0])
-                    return_data.append(ip_set[1])
-                    return_data.append(ip_set[2])
-                    return_data.append(ip_set[3])
-                else:
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-                if(term['not_null'](data_set[3])):
-                    return_data.append(int(data_set[3]))
-                else:
-                    return_data.append(-1)
-                if(term['not_null'](data_set[5])):
-                    return_data.append(binascii.crc32(data_set[5]))
-                else:
-                    return_data.append(-1)
-                if(term['not_null'](data_set[6])):
-                    return_data.append(binascii.crc32(data_set[6]))
-                else:
-                    return_data.append(-1)
-            elif(x==7):
-                return_data.append(binascii.crc32(data[7]))
-            elif(x==9):
-                return_data.append(binascii.crc32(data[9]))
-            elif(x==11):
-                return_data.append(binascii.crc32(data[11]))
-            elif(x==13):
-                return_data.append(binascii.crc32(data[13]))
-            elif(x==15):
-                return_data.append(binascii.crc32(data[15]))
-            elif(x==17):
-                return_data.append(binascii.crc32(data[17]))
-        return return_data
-    # data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
-    # #data_set[0]=null,data_set[1]=url
-    # data_value_dic = dict()
-    # for x in xrange(1,len(feature_list)+1):
-    #     if(x==1):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     elif(x==2):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     elif(x==3):
-    #         data_value_dic[feature_list[x-1]] = data_set[x]
-    #     elif(x==4):
-    #         data_value_dic[feature_list[x-1]] = data_set[x]
-    #     elif(x==5):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     elif(x==6):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     return data_value_dic
-
-
-i=0
-sfh_set = list()
-with open(raw_file_address,'r') as infile:
-    with open(ripe_file_address,'w') as outfile:
-        for line in infile:
-            i+=1
-            if(i%10000==0):
-                print i
-            line_return = re.split(r';',line)
-            # if(int(line_return[0])==0):
-            #     print 'td is right'
-            outfile.write(str(line_return[0])+',')
-            return_data=data_value.get_feature(line_return)
-            for x in range(19):
-                if(x==18):
-                    outfile.write(str(return_data[18])+'\n')
-                else:
-                    outfile.write(str(return_data[x])+',')
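Review note on dataset_build.py (above): the `x==1` branch of `get_feature` turns the capture timestamp (shifted to UTC+8) and the HTTP `LastModify` date into four staleness features, but it uses `timedelta.seconds`, which is only the sub-day remainder, so the second/minute/hour features wrap every 24 hours. A hedged Python 3 sketch of the presumably intended computation; `age_features` is a hypothetical helper, not the repository's code:

```python
from datetime import datetime, timedelta

def age_features(capture: str, last_modify: str) -> list:
    """Age of the resource at capture time, as [seconds, minutes, hours, days]."""
    t1 = datetime.strptime(capture, "%Y-%m-%d %H:%M:%S") + timedelta(hours=8)
    t2 = datetime.strptime(last_modify[:25], "%a, %d %b %Y %H:%M:%S")
    delta = t1 - t2
    secs = delta.total_seconds()          # not .seconds, which wraps at one day
    return [int(secs), int(secs // 60), int(secs // 3600), delta.days]
```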
diff --git a/dataset_build/feature_statistics.conf b/dataset_build/feature_statistics.conf
deleted file mode 100644
index 12cf089..0000000
--- a/dataset_build/feature_statistics.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-[file]
-raw_file_address = ../../data/td_data_set/td_data_20171207/td.txt
-ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
-[output]
-breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,44194304
-[feature]
-type = data_value_statistics
-feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/dataset_build/feature_statistics.py b/dataset_build/feature_statistics.py
deleted file mode 100644
index 52ae8e0..0000000
--- a/dataset_build/feature_statistics.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import re
-import ConfigParser
-import bisect
-import random
-import ctypes
-import hashlib
-import zlib
-import binascii
-
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==4),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20),
-    'not_null':(lambda x : len(x)!=0)}
-
-class data_line(object):
-    """docstring for ClassName"""
-    def __init__(self):
-        super(ClassName, self).__init__()
-
-    @staticmethod
-    def if_error(data_line_str):
-        data_line_val = re.split(r';',data_line_str)
-        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
-        if(term['data_num'](data_line_val) and term['sfh_len'](data_line_val[19]) and term['td_len'](data_line_val[9])\
-            and term['td_len'](data_line_val[2]) and term['td_len'](data_line_val[13]) and term['td_len'](data_line_val[15])\
-            and term['td_len'](data_line_val[17]) and term['not_null'](data_line_val[18]) and term['not_null'](data_line_val[19])\
-            and hashed_len/float(data_line_val[3])>0.8):
-            return data_line_val
-        else:
-            return -1
-
-
-class feature_statistics(object):
-    """YSP feature_statistics"""
-    def __init__(self):
-        super(feature_statistics, self).__init__()
-        self.meida_len_statistics_set = [0,0,0,0,0,0,0]
-        self.lost_dict = dict()
-
-    def meida_len_statistics(meida_len):
-        j = bisect.bisect(breakpoints,meida_len)
-        self.meida_len_statistics_set[j-1]+=1
-
-    def data_value_statistics(data_value_dic,data_value):
-        data_value_str = str()
-        for x in xrange(0,len(feature_list)):
-            data_value_str = data_value_str+str(data_value_dic[feature_list[x]])+','
-
-        if(self.lost_dict.has_key(data_value_str)==False):
-            self.lost_dict[data_value_str]=[0,1,0.]
-        else:
-            if (int(result[3])==1):
-                self.lost_dict[data_value_str][0] += 1
-                self.lost_dict[data_value_str][1] += 1
-            else:
-                self.lost_dict[data_value_str][1] += 1
-
-
-class sfh_fingerprint(object):
-
-    def __init__(self,sfh):
-        self.sfh = sfh
-
-    @staticmethod
-    def get_hashed_len(sfh):
-        p = r"\[+\d+?:+\d+?\]"
-        pattern = re.compile(p)
-        hashed_len_set = pattern.findall(sfh)
-        if (term['not_null'](hashed_len_set)):
-            hashed_len = 0
-            for x in xrange(0,len(hashed_len_set)):
-                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
-                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
-            return hashed_len/len(hashed_len_set)
-        else :
-            return -1
-
-    @staticmethod
-    def get_base_sfh(data_set):
-        base_sfh = list()
-        for x in xrange(0,10):
-            base_sfh.append(data_set[x])
-        return base_sfh
-
-
-
-
-class data_value(object):
-
-    @staticmethod
-    def get_data_values(data):
-        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
-        #data_set[0]=null,data_set[1]=url
-        data_value_dic = dict()
-        for x in xrange(1,len(feature_list)+1):
-            if(x==1):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-            elif(x==2):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-            elif(x==3):
-                data_value_dic[feature_list[x-1]] = data_set[x]
-            elif(x==4):
-                data_value_dic[feature_list[x-1]] = bisect.bisect(breakpoints,int(data_set[x]))
-            elif(x==5):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-            elif(x==6):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-        return data_value_dic
-
-config = ConfigParser.RawConfigParser()
-config.read("feature_statistics.conf")
-
-feature_statistics_type = ("feature","type")
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-
-if(feature_statistics_type=="meida_len_statistics"):
-    breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
-elif(feature_statistics_type=="data_value_statistics"):
-    feature_list =[i for i in config.get("feature","feature_name").split(",")]
-# ll=ctypes.cdll.LoadLibrary
-# lib = ll("libmaatframe.so")
-
-i=0
-sfh_set = list()
-statistic = feature_statistics()
-with open(raw_file_address,'r') as infile:
-    for line in infile:
-        i+=1
-
-
-
-
-        line_return = data_line.if_error(line)
-        if(line_return != -1):
-            if(feature_statistics_type=="meida_len_statistics"):
-                statistic.meida_len_statistics(line_return[3])
-            elif(feature_statistics_type=="data_value_statistics"):
-                lost_list = list()
-                statistic.meida_len_statistics(line_return)
-                for i in statistic.lost:
-                    (statistic.lost[i])[2] = float((statistic.lost[i])[0])/(statistic.lost[i])[1]
-                    tmp = (i,int((statistic.lost[i])[0]),int((statistic.lost[i])[1]),float((statistic.lost[i])[2]))
-                    lost_list.append(tmp)
-                print sorted(lost_list,cmp=lambda x,y:cmp(x[2],y[2]))
-#     if(x == len(feature_list)-1):
-#         outfile.write(data_value_dic[feature_list[x]]+'\n')
-#     else:
-#         print lost
-#         outfile.write(str(data_value_dic[feature_list[x]])+',')
-# outfile.write(result[3])
-# sfh_dot=list()
-# for x in xrange(0,10):
-#     #transform sfh to dot
-#     sfh_dot.append(lib.GIE_sfh_similiarity(result[19],len(result[19]),sfh_set[x],len(sfh_set[x])))
-# if(len(data_set)==7):
-#     outfile.write(str(data_set[0])+','+str(data_set[1])+','+str(data_set[2])\
-#         +','+str(data_set[3])+','+str(data_set[4])+','+str(data_set[5])+','+result[5]\
-#         +','+result[7]+','+result[9]+','+result[11]+','+result[13]+','+result[15]+result[17]\
-#         +','+result[19]+'\n')
-
-# with open(ripe_file_address,'w') as outfile:
-#     outfile.write(str(lost))
diff --git a/dataset_build/file_digest.py b/dataset_build/file_digest.py
deleted file mode 100644
index 590e059..0000000
--- a/dataset_build/file_digest.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#-*-coding:utf-8-*-
-import re
-import random
-import ConfigParser
-import bisect
-import commands
-import os
-import hashlib
-
-class data_line(object):
-    """docstring for ClassName"""
-    def __init__(self):
-        super(ClassName, self).__init__()
-
-    @staticmethod
-    def if_error(data_line_str):
-        data_line_val = re.split(r';',data_line_str)
-        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
-        if(term['data_num'](data_line_val) and \
-            term['not_null'](data_line_val[0]) and \
-            term['ysp_len'](data_line_val[3]) and \
-            term['not_null'](data_line_val[4]) and \
-            term['td_len'](data_line_val[6]) and \
-            term['td_len'](data_line_val[8]) and \
-            term['td_len'](data_line_val[10]) and \
-            term['td_len'](data_line_val[12]) and \
-            term['td_len'](data_line_val[14]) and \
-            term['td_len'](data_line_val[16]) and \
-            term['not_null'](data_line_val[18]) and \
-            term['sfh_len'](data_line_val[19]) and \
-            term['not_null'](data_line_val[20]) and \
-            hashed_len/float(data_line_val[3])>=0.8):
-            return data_line_val
-        else:
-            return -1
-
-class TD_fingerprint(object):
-    def __init__():
-        self.td = td
-        self.td_string = td_string
-    @staticmethod
-    def td_generate(td_string):
-        td_val = hashlib.md5(td_string,encode('utf-8')).hexdigest()
-
-class sfh_fingerprint(object):
-
-    def __init__(self,sfh):
-        self.sfh = sfh
-
-    @staticmethod
-    def get_hashed_len(sfh):
-        p = r"\[+\d+?:+\d+?\]"
-        pattern = re.compile(p)
-        hashed_len_set = pattern.findall(sfh)
-        if (term['not_null'](hashed_len_set)):
-            hashed_len = 0
-            for x in xrange(0,len(hashed_len_set)):
-                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
-                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
-            return hashed_len/len(hashed_len_set)
-        else :
-            return -1
-
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==21),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
-    'not_null':(lambda x : len(x)!=0),
-    'ysp_len':(lambda x : int(x)!=0),
-    'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
-
-grain="./get_lost"
-ripe_files=[]
-config = ConfigParser.RawConfigParser()
-config.read("grain.conf")
-raw_file_address=config.get("file","raw_file_address")
-ripe_files_address=config.get("file","ripe_files_address")
-print ("%s %s" %(raw_file_address,ripe_files_address))
-num = [0,0,0,0,0,0,0]
-breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
-# i=0
-# for i in xrange(0,ripe_file_num):
-#     outfile=open(ripe_files_address+str(i)+'.txt','w')
-#     ripe_files.append(outfile)
-
-i=0
-with open(raw_file_address,'r') as infile:
-# with open('./ripe_data/mistake_td_sfh1_sfh2_sim_rate_len_url_unequal','r')as infile:
    -with open(ripe_files_address,'w')as outfile:
-        for line in infile:
-            i+=1
-            if(i%10000==0):
-                print i
-            line_return = data_line.if_error(line)
-            if(line_return != -1):
-                outfile.write(str(line))
\ No newline at end of file
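Review note on feature_statistics.py and file_digest.py (above): both scripts gate records through the shared `term` predicate table, but two entries are broken as written — `term['url']` is `x.find['NUll']`, which subscripts a bound method and raises `TypeError` if ever evaluated, and `TD_fingerprint.td_generate` calls `hashlib.md5(td_string,encode('utf-8'))` with a comma where `.encode` was meant. (feature_statistics.py also assigns `feature_statistics_type` the literal tuple `("feature","type")` instead of `config.get("feature","type")`, so neither statistics branch ever runs.) A corrected Python 3 rendering of the predicate table; the intent of the `url` check is my guess:

```python
import hashlib

TERM = {
    'td_len':   lambda x: len(x) == 32,              # well-formed MD5 hex digest
    'data_num': lambda x: len(x) == 21,              # expected field count per record
    'url':      lambda x: 'NULL' not in x,           # assumed intent of x.find['NUll']
    'sfh_len':  lambda x: 20 < len(x) < 10 * 1024 - 100,
    'not_null': lambda x: len(x) != 0,
    'ysp_len':  lambda x: int(x) != 0,
}

def td_generate(td_string: str) -> str:
    """MD5 hex digest of a TD string (the stray comma in the deleted code broke this)."""
    return hashlib.md5(td_string.encode('utf-8')).hexdigest()
```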
diff --git a/dataset_build/get_lost.c b/dataset_build/get_lost.c
deleted file mode 100644
index 0e6c452..0000000
--- a/dataset_build/get_lost.c
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <MESA/MESA_htable.h>
-#include <assert.h>
-#include <ctype.h>
-#define HTABLE_SIZE 8*64*1024*1024
-#define SFH_PASS_RATE 0.8
-#define SIMILIAR 80
-
-typedef struct td
-{
-    char * tdstr;
-    unsigned int lost;
-}td;
-
-typedef struct file_sfh_data
-{
-    long id;
-    char * sfh;
-    td * td_value;
-    char * td_ori;
-}file_sfh_data;
-
-int main(int argc,char *argv[])
-{
-    FILE *fpread;//read file handle
-    FILE *fpwrite;//write file handle
-    int array_size = 1024;
-    file_sfh_data **file_data=(file_sfh_data **)malloc(sizeof(file_sfh_data)*array_size);
-    char* dirstr = "../../data/td_data_set/td_data_20171207/td_sfh_lost";
-    //char* dirstr = *++argv;
-    char* writestr = "../../data/td_data_set/td_data_20171207/td.txt";
-    int total_len = 0;
-    char TD_tmp[256], SFH_tmp[1024*300], TD_ORI[1024*10];
-    char buffer[1024*300+1];
-    int ret = 0;
-    int line = 0;
-    int thread_safe = 0;
-    int i;
-    int id;
-    int similiarity;
-    MESA_htable_handle htable = NULL;
-    fpread=fopen(dirstr,"rb");
-    fpwrite=fopen(writestr,"w");
-    printf("file str is %s\n",dirstr);
-    if(fpread==NULL)
-    {
-        printf("open file error\n");
-        return -1;
-    }
-    buffer[sizeof(buffer)]='\0';
-    while(feof(fpread)==0)
-    {
-        fgets(buffer,sizeof(buffer)-1,fpread);
-        ret=sscanf(buffer,"%d;%[^;];%[^;];%s",&total_len,TD_ORI,TD_tmp,SFH_tmp);
-        if(ret!=4)
-        {
-            continue;
-        }
-        file_data[line]=(file_sfh_data*)calloc(1,sizeof(file_sfh_data));
-        file_data[line]->id=line;
-        file_data[line]->sfh=strdup(SFH_tmp);
-        file_data[line]->td_value=(td*)calloc(1,sizeof(td));
-        file_data[line]->td_value->tdstr=strdup(TD_tmp);
-        file_data[line]->td_value->lost=0;
-        file_data[line]->td_ori=strdup(TD_ORI);
-        line++;
-        if(line==array_size)
-        {
-            array_size*=2;
-            file_data=realloc(file_data,sizeof(file_sfh_data)*array_size);
-        }
-    }
-    printf("read file success!\n");
-    htable = NULL;
-    htable=MESA_htable_born();
-    thread_safe = 0;
-    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
-    unsigned int slot_size=1024*1024*16;
-    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(slot_size));
-    MESA_htable_mature(htable);
-    for(i=0;i<line;i++)
-    {
-        if(MESA_htable_add(htable,(char*)(file_data[i]->td_value->tdstr),32,(void *)file_data[i]->id)<0)
-        {
-            id=(long)MESA_htable_search(htable,(char*)file_data[i]->td_value->tdstr,32);
-            similiarity=GIE_sfh_similiarity(file_data[id]->sfh,(int)strlen(file_data[id]->sfh),file_data[i]->sfh,(int)strlen(file_data[i]->sfh));
-            if(similiarity<SIMILIAR)
-            {
-                file_data[id]->td_value->lost = 1;
-                file_data[i]->td_value->lost = 1;
-            }
-        }
-    }
-    for(i=0;i<line;i++)
-    {
-        fprintf(fpwrite,"%s;%s;%s;%d\n",file_data[i]->td_value->tdstr,file_data[i]->sfh,file_data[i]->td_ori,file_data[i]->td_value->lost);
-    }
-    for(i=0;i<line;i++)
-    {
-        free(file_data[i]->sfh);
-        file_data[i]->sfh=NULL;
-        free(file_data[i]->td_value->tdstr);
-        file_data[i]->td_value->tdstr=NULL;
-        free(file_data[i]->td_value);
-        file_data[i]->td_value=NULL;
-        free(file_data[i]->td_ori);
-        file_data[i]->td_ori=NULL;
-        free(file_data[i]);
-        file_data[i]=NULL;
-    }
-    fclose(fpread);
-    fclose(fpwrite);
-    return 0;
-}
\ No newline at end of file
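Review note on get_lost.c (above): the program loads every record, deduplicates by TD digest through a MESA hash table, and flags both members of any TD collision whose SFHs score below 80 on `GIE_sfh_similiarity`. Two memory issues worth noting for anyone resurrecting it: `buffer[sizeof(buffer)]='\0'` writes one byte past the array (it should index `sizeof(buffer)-1`), and the `file_data` array of pointers is sized with `sizeof(file_sfh_data)` where `sizeof(file_sfh_data*)` was meant (over-allocation, wasteful but not unsafe). The core marking logic, restated as a hedged Python sketch — `sfh_similarity` is a stand-in for the library call:

```python
SIMILAR = 80  # same threshold as the C code

def mark_lost(records, sfh_similarity):
    """records: list of (td_digest, sfh). Returns a parallel list of lost flags."""
    first_by_td = {}                      # td digest -> index of first occurrence
    lost = [0] * len(records)
    for i, (td, sfh) in enumerate(records):
        j = first_by_td.setdefault(td, i)
        if j != i and sfh_similarity(records[j][1], sfh) < SIMILAR:
            lost[i] = lost[j] = 1         # same TD but dissimilar content
    return lost
```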
diff --git a/dataset_build/grain.conf b/dataset_build/grain.conf
deleted file mode 100644
index 944b337..0000000
--- a/dataset_build/grain.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-[file]
-ripe_files_address = ../../data/td_data_set/td_data_20171207/get_lost_raw_data
-raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest
-[output]
-breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
\ No newline at end of file
diff --git a/dataset_build/td_classification.py b/dataset_build/td_classification.py
deleted file mode 100644
index 8d4b97c..0000000
--- a/dataset_build/td_classification.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from sklearn.datasets import load_iris
-from sklearn import tree
-
-with open() as infile:
-
\ No newline at end of file
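Review note on td_classification.py (above): the file was an unfinished stub — `with open() as infile:` has no argument and the `load_iris` import is unused. Judging by the imports, it was headed toward a scikit-learn decision tree over the CSV that dataset_build.py emits (label in column 0, 19 features after it). A hedged sketch of that direction; the `td_dataset` filename comes from dataset_build.conf, everything else is an assumption:

```python
import numpy as np
from sklearn import tree

data = np.loadtxt("td_dataset", delimiter=",")   # label, then 19 features per row
X, y = data[:, 1:], data[:, 0]
clf = tree.DecisionTreeClassifier().fit(X, y)
```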
diff --git a/dataset_build/vedio_id_build.c b/dataset_build/vedio_id_build.c
deleted file mode 100644
index 9faaa64..0000000
--- a/dataset_build/vedio_id_build.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-gcc -g vedio_id_build.c -o vedio_id_build -lmaatframe -I../../inc
-*/
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include "gram_index_engine.h"
-#include <MESA/MESA_htable.h>
-#include <assert.h>
-#include <ctype.h>
-#define BUFFER_LEN (10*1024)
-#define SFH_PASS_RATE 0.9
-#define SFH_LEN (10*1024)
-#define URL_LEN (10*1024)
-
-typedef struct video_id
-{
-    long id;
-    char *sfh;
-}video_id;
-
-typedef struct cache
-{
-    GIE_digest_t ** GIE_cache;
-    long cache_size;
-    long len;
-}cache;
-
-long get_hashed_len(const char* sfh)
-{
-    char *data=(char*)malloc(strlen(sfh)+1);
-    memcpy(data,sfh, strlen(sfh));
-    data[strlen(sfh)]='\0';
-    char *token=NULL,*sub_token=NULL,*saveptr;
-    long left_offset=0,right_offset=0,hashed_length=0;
-    int ret=0,first=0;
-    for (token = data; ;token= NULL)
-    {
-        sub_token= strtok_r(token,"[", &saveptr);
-        if (sub_token == NULL)
-        {
-            break;
-        }
-        if(first==0)//jump over the first sub string.
-        {
-            first=1;
-            continue;
-        }
-        ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset);
-        if(ret!=2)
-        {
-            return 0;
-        }
-        assert(ret==2);
-        hashed_length+=right_offset-left_offset+1;
-    }
-    //printf("hashed length=%ld\n",hashed_length);
-    free(data);
-    return hashed_length/2;
-}
-
-int main(int argc,char *argv[])
-{
-    FILE *video_id_sets_file;
-    FILE *new_sfh_file;
-    const char *video_id_sets_file_dir="../../data/td_data_set/td_data_20171207/video_id_raw_data";
-    const char *new_sfh_file_dir="../../data/ripe_data/td_data_20171207/video_id.txt";
-    char *buffer=NULL;
-    int ret = 0,hashed_len = 0,total_len = 0,resultnum = 0,i = 0;
-    int update = 0,video_id = 0,j = 0;
-    int* temp_int = NULL;
-    float temp_sfh_pass = 0;
-    char *sfh_str,*url_str;
-    GIE_digest_t *sfh_video_id = NULL;
-    GIE_result_t *query_result = NULL;
-    cache *GIE_digest_cache = NULL;
-    video_id_sets_file = fopen(video_id_sets_file_dir,"r+");
-    new_sfh_file = fopen(new_sfh_file_dir,"w");
-    if(video_id_sets_file == NULL)
-    {
-        printf("open video_id_sets_file error\n");
-        return -1;
-    }
-    if(new_sfh_file == NULL)
-    {
-        printf("open new_sfh_file error\n");
-        return -1;
-    }
-    buffer = (char*)calloc(BUFFER_LEN,sizeof(char));
-    GIE_create_para_t *query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t));
-    query_result = (GIE_result_t*)calloc(1,sizeof(GIE_result_t));
-    GIE_handle_t *query_handle;
-    query_para->gram_value = 7;
-    query_para->position_accuracy = 5;
-    query_handle=GIE_create((const GIE_create_para_t *)query_para);
-    free(query_para);
-    if(query_handle==NULL)
-    {
-        printf("create GIE handle error\n");
-        return -1;
-    }
-    sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
-    sfh_str = (char*)calloc(SFH_LEN,sizeof(char));
-    url_str = (char*)calloc(URL_LEN,sizeof(char));
-    i=0;
-    GIE_digest_cache =(cache*)calloc(1,sizeof(cache));
-    GIE_digest_cache->cache_size = 1000;
-    GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*));
-    GIE_digest_cache->len = 0;
-    while(feof(video_id_sets_file)==0)
-    {
-        i++;
-        if(i%10000==0)
-        {
-            printf("%d\n",i);
-        }
-        fgets(buffer,BUFFER_LEN-1,video_id_sets_file);
-        ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
-            %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
-            %*[^;];%*[^;];%*[^;];%[^;];%[^;]",sfh_str,url_str);
-        if(ret!=2)
-        {
-            continue;
-        }
-        hashed_len = get_hashed_len((const char*)sfh_str);
-        temp_sfh_pass = (float)hashed_len/total_len;
-        if(temp_sfh_pass<SFH_PASS_RATE)
-        {
-            continue;
-        }
-        resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,1);
-        if(resultnum == 0)
-        {
-            temp_int=(int*)calloc(1,sizeof(int));
-            *temp_int=i;
-            sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
-            sfh_video_id->id=i;
-            sfh_video_id->sfh_length=strlen(sfh_str);
-            sfh_video_id->operation=GIE_INSERT_OPT;
-            sfh_video_id->cfds_lvl=5;
-            sfh_video_id->sfh=strdup(sfh_str);
-            sfh_video_id->tag=temp_int;
-            GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_video_id;
-            GIE_digest_cache->len++;
-            if(GIE_digest_cache->len==GIE_digest_cache->cache_size)
-            {
-                update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size);
-                GIE_digest_cache->len=0;
-                for(j=0;j<GIE_digest_cache->cache_size;j++)
-                {
-                    free(GIE_digest_cache->GIE_cache[j]->sfh);
-                    GIE_digest_cache->GIE_cache[j]->sfh=NULL;
-                    free(GIE_digest_cache->GIE_cache[j]);
-                    GIE_digest_cache->GIE_cache[j]=NULL;
-                }
-            }
-            fprintf(new_sfh_file,"%d,%s",i,buffer);
-        }
-        else
-        {
-            fprintf(new_sfh_file,"%d,%s",*((int*)query_result->tag),buffer);
-        }
-    }
-    free(buffer);
-    free(query_result);
-    free(sfh_video_id);
-    free(url_str);
-    free(sfh_str);
-    free(GIE_digest_cache);
-    return 0;
-}
\ No newline at end of file
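Review note on vedio_id_build.c (above): the program assigns a video id to each record by querying a GIE gram index for a near-duplicate SFH — a hit reuses the matched record's id, a miss registers the SFH under the current line number, batching inserts 1000 at a time through `GIE_update`. Two apparent bugs: every field before the SFH is `%*`-suppressed in the `sscanf` format, so `total_len` stays 0 and the `SFH_PASS_RATE` gate divides by zero (as floating-point infinity or NaN it never rejects anything); and the first `calloc` of `sfh_video_id` is leaked when the loop reallocates it. The assignment loop as a hedged Python sketch, with `index.query`/`index.insert` standing in for `GIE_query`/`GIE_update`:

```python
def assign_video_ids(records, index):
    """records: iterable of SFH strings; yields one video id per record."""
    for line_no, sfh in enumerate(records, start=1):
        hit = index.query(sfh)               # id of a near-duplicate SFH, or None
        if hit is None:
            index.insert(sfh, tag=line_no)   # new cluster: id = own line number
            yield line_no
        else:
            yield hit                        # reuse the matched cluster's id
```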
