Diffstat (limited to 'src/dataset_build')
-rw-r--r--  src/dataset_build/CMakeLists.txt           |  11
-rw-r--r--  src/dataset_build/based_sfh.conf           |   3
-rw-r--r--  src/dataset_build/based_sfh.py             |  44
-rw-r--r--  src/dataset_build/cal_information.conf     |   5
-rw-r--r--  src/dataset_build/cal_information.py       | 133
-rw-r--r--  src/dataset_build/dataset_build.conf       |   8
-rw-r--r--  src/dataset_build/dataset_build.py         | 144
-rw-r--r--  src/dataset_build/feature_statistics.conf  |   8
-rw-r--r--  src/dataset_build/feature_statistics.py    | 164
-rw-r--r--  src/dataset_build/file_digest.py           |  96
-rw-r--r--  src/dataset_build/get_lost.c               | 116
-rw-r--r--  src/dataset_build/grain.conf               |   5
-rw-r--r--  src/dataset_build/td_classification.py     |   5
-rw-r--r--  src/dataset_build/vedio_id_build.c         | 171
14 files changed, 913 insertions(+), 0 deletions(-)
diff --git a/src/dataset_build/CMakeLists.txt b/src/dataset_build/CMakeLists.txt
new file mode 100644
index 0000000..8840a74
--- /dev/null
+++ b/src/dataset_build/CMakeLists.txt
@@ -0,0 +1,11 @@
+PROJECT (CALCULATE)
+SET (SRC_LIST get_lost.c)
+SET(CMAKE_BUILD_TYPE "Debug")
+SET(CMAKE_C_FLAGS_DEBUG "$ENV{CFLAGS} -O0 -Wall -g -ggdb")
+SET(CMAKE_C_FLAGS_RELEASE "$ENV{CFLAGS} -O3 -Wall")
+MESSAGE(STATUS "This is BINARY dir: " ${CALCULATE_BINARY_DIR})
+MESSAGE(STATUS "This is SOURCE dir: " ${CALCULATE_SOURCE_DIR})
+#INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../include/)
+#LINK_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../lib/)
+ADD_EXECUTABLE(get_lost ${SRC_LIST})
+TARGET_LINK_LIBRARIES(get_lost maatframe libMESA_htable.so pthread m)
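+# Build sketch (assuming libmaatframe and libMESA_htable.so are already on
+# the default library search path):
+#   mkdir build && cd build && cmake .. && make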
diff --git a/src/dataset_build/based_sfh.conf b/src/dataset_build/based_sfh.conf
new file mode 100644
index 0000000..cdcf4cf
--- /dev/null
+++ b/src/dataset_build/based_sfh.conf
@@ -0,0 +1,3 @@
+[file]
+raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest_nots
+ripe_file_address = ../../data/td_data_set/td_data_20171207/base_sfh_set
\ No newline at end of file
diff --git a/src/dataset_build/based_sfh.py b/src/dataset_build/based_sfh.py
new file mode 100644
index 0000000..b3281ce
--- /dev/null
+++ b/src/dataset_build/based_sfh.py
@@ -0,0 +1,44 @@
+import re
+import ConfigParser
+import bisect
+import random
+
+term = {'not_null':(lambda x : len(x)!=0)}
+
+config = ConfigParser.RawConfigParser()
+config.read("based_sfh.conf")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+
+class sfh_fingerprint(object):
+
+ def __init__(self,sfh):
+ self.sfh = sfh
+
+ @staticmethod
+ def get_hashed_len(sfh):
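+        # sfh fingerprints record hashed spans as "[start:end]" segments;
+        # this returns the average span length, or -1 if no segment parses.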
+ p = r"\[+\d+?:+\d+?\]"
+ pattern = re.compile(p)
+ hashed_len_set = pattern.findall(sfh)
+ if (term['not_null'](hashed_len_set)):
+ hashed_len = 0
+ for x in xrange(0,len(hashed_len_set)):
+ hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+ hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+ return hashed_len/len(hashed_len_set)
+ else :
+ return -1
+
+i=0
+sfh_set = list()
+with open(raw_file_address,'r') as infile:
+ with open(ripe_file_address,'w') as outfile:
+ for line in infile:
+ i+=1
+ if(i%100000==0):
+ print i
+ result = re.split(r';',line)
+ if(term['not_null'](result[3]) and term['not_null'](result[19])):
+ hashed_len = sfh_fingerprint.get_hashed_len(result[19])
+                if(hashed_len/float(result[3])>0.8):
+                    outfile.write(result[19]+'\n')
\ No newline at end of file
diff --git a/src/dataset_build/cal_information.conf b/src/dataset_build/cal_information.conf
new file mode 100644
index 0000000..1571b8b
--- /dev/null
+++ b/src/dataset_build/cal_information.conf
@@ -0,0 +1,5 @@
+[file]
+raw_file_address = ../../data/ripe_data/td_data_20171207/video_id.txt
+ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
+[feature]
+feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/src/dataset_build/cal_information.py b/src/dataset_build/cal_information.py
new file mode 100644
index 0000000..19cd95c
--- /dev/null
+++ b/src/dataset_build/cal_information.py
@@ -0,0 +1,133 @@
+import re
+import numpy
+import ConfigParser
+import binascii
+term = {'td_len':(lambda x : len(x)==32),
+ 'data_num':(lambda x : len(x)==4),
+        'url':(lambda x : x.find('NULL')),
+ 'sfh_len':(lambda x : len(x)>20),
+ 'not_null':(lambda x : len(x)!=0)}
+
+class calculation(object):
+ """docstring for calculation"""
+ def __init__(self, arg):
+ super(calculation, self).__init__()
+ self.arg = arg
+
+ @staticmethod
+ def cal_ent(x):
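+        # Shannon entropy of one column; zeros are treated as missing and
+        # excluded from both the value set and the probability denominator.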
+ x_value_list = set([x[i] for i in range(x.shape[0])])
+ ent = 0.0
+ num_0 = x[x == 0].shape[0]
+ for x_value in x_value_list:
+ if(x_value==0):
+ continue
+ p = float(x[x == x_value].shape[0])/(x.shape[0]- num_0)
+ logp = numpy.log2(p)
+ ent -=p*logp
+ return ent
+
+class data_value(object):
+ """docstring for data_value"""
+ def __init__(self, arg):
+ super(data_value, self).__init__()
+ self.arg = arg
+
+ @staticmethod
+ def get_data_values(data):
+ data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+ #data_set[0]=null,data_set[1]=url
+ data_value_dic = [long(0)]*6
+        for x in xrange(1,len(feature_list)+1):
+            if(x==3 or x==4):
+                # MediaType and MediaLen are parsed as integers
+                data_value_dic[x-1] = long(data_set[x])
+            else:
+                # URL, ServerIP, Etag and LastModify are hashed via CRC32
+                data_value_dic[x-1] = binascii.crc32(data_set[x]) if term['not_null'](data_set[x]) else 0
+        return data_value_dic
+
+config = ConfigParser.RawConfigParser()
+config.read("cal_information.conf")
+
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+feature_list =[i for i in config.get("feature","feature_name").split(",")]
+
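+# Sample the first 50,000 records, map every field to an integer column
+# (CRC32 for strings), then report the per-column entropy.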
+i=0
+rows = list()
+with open(raw_file_address,'r') as infile:
+ for line in infile:
+ i+=1
+ if(i%10000==0):
+ print i
+ if(i==50000):
+ break
+ line_split = re.split(";",line)
+ data_value_temp = data_value.get_data_values(line_split[5])
+ data_value_temp.extend([binascii.crc32(line_split[j]) for j in range(6,19)])
+ data_value_temp.append(binascii.crc32(line_split[0]))
+        rows.append(data_value_temp)
+
+a = numpy.array(rows)
+col_names = feature_list+["td_0k","td_data_md5_1k","td_1k","td_data_md5_2k",
+                          "td_2k","td_data_md5_4k","td_4k","td_data_md5_8k",
+                          "td_8k","td_data_md5_16k","td_16k","td_data_md5_32k",
+                          "td_32k","id"]
+for i in range(20):
+    print col_names[i]+":"+str(calculation.cal_ent(a[:,i]))
+
diff --git a/src/dataset_build/dataset_build.conf b/src/dataset_build/dataset_build.conf
new file mode 100644
index 0000000..400e160
--- /dev/null
+++ b/src/dataset_build/dataset_build.conf
@@ -0,0 +1,8 @@
+[file]
+raw_file_address = ../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level
+ripe_file_address = ../../data/td_data_set/td_data_20171207/td_dataset
+base_sfh_sets = ../../data/td_data_set/td_data_20171207/base_sfh_set
+[output]
+breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
+[feature]
+feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/src/dataset_build/dataset_build.py b/src/dataset_build/dataset_build.py
new file mode 100644
index 0000000..a832072
--- /dev/null
+++ b/src/dataset_build/dataset_build.py
@@ -0,0 +1,144 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+import json
+import datetime
+import time
+
+term = {'td_len':(lambda x : len(x)==32),
+ 'data_num':(lambda x : len(x)==21),
+        'url':(lambda x : x.find('NULL')),
+ 'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
+ 'not_null':(lambda x : len(x)!=0),
+ 'ysp_len':(lambda x : int(x)!=0),
+ 'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
+
+config = ConfigParser.RawConfigParser()
+config.read("dataset_build.conf")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+base_sfh_sets = config.get("file","base_sfh_sets")
+breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+feature_list =[i for i in config.get("feature","feature_name").split(",")]
+ll=ctypes.cdll.LoadLibrary
+lib = ll("libmaatframe.so")
+lost = dict()
+
+
+class data_value(object):
+
+ @staticmethod
+ def get_feature(data):
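+        # Builds the 19-column feature row: four time deltas
+        # (seconds/minutes/hours/days) between the capture time in data[1]
+        # and the LastModify header, the numeric field data[4], URL length,
+        # the four ServerIP octets, MediaType, CRC32s of Etag and
+        # LastModify, and CRC32s of the six td digests in data[7..17].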
+ return_data=list()
+ data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data[5])
+ for x in xrange(1,21):
+ if(x==1):
+ if(term['not_null'](data_set[6])):
+ try:
+                    time1=datetime.datetime.strptime(data[1],'%Y-%m-%d %H:%M:%S')+datetime.timedelta(hours=8)
+ data_set[6]=data_set[6][0:25]
+ time2=datetime.datetime.strptime(data_set[6],'%a, %d %b %Y %H:%M:%S')
+ except Exception, e:
+ return_data.append(-1)
+ return_data.append(-1)
+ return_data.append(-1)
+ return_data.append(-1)
+ else:
+                        return_data.append((time1-time2).seconds)
+ return_data.append(((time1-time2).seconds)/60)
+ return_data.append(((time1-time2).seconds)/3600)
+ return_data.append((time1-time2).days)
+ else:
+ return_data.append(-1)
+ return_data.append(-1)
+ return_data.append(-1)
+ return_data.append(-1)
+ elif(x==2):
+ continue
+ elif(x==3):
+ continue
+ elif(x==4):
+ return_data.append(long(data[4]))
+ elif(x==5):
+ if(term['not_null'](data_set[1])):
+ return_data.append(len(data_set[1]))
+ else:
+ return_data.append(-1)
+ if(term['not_null'](data_set[2])):
+ ip_set=re.split(r'\.',data_set[2])
+ return_data.append(ip_set[0])
+ return_data.append(ip_set[1])
+ return_data.append(ip_set[2])
+ return_data.append(ip_set[3])
+ else:
+ return_data.append(-1)
+ return_data.append(-1)
+ return_data.append(-1)
+ return_data.append(-1)
+ if(term['not_null'](data_set[3])):
+ return_data.append(int(data_set[3]))
+ else:
+ return_data.append(-1)
+ if(term['not_null'](data_set[5])):
+ return_data.append(binascii.crc32(data_set[5]))
+ else:
+ return_data.append(-1)
+ if(term['not_null'](data_set[6])):
+ return_data.append(binascii.crc32(data_set[6]))
+ else:
+ return_data.append(-1)
+ elif(x==7):
+ return_data.append(binascii.crc32(data[7]))
+ elif(x==9):
+ return_data.append(binascii.crc32(data[9]))
+ elif(x==11):
+ return_data.append(binascii.crc32(data[11]))
+ elif(x==13):
+ return_data.append(binascii.crc32(data[13]))
+ elif(x==15):
+ return_data.append(binascii.crc32(data[15]))
+ elif(x==17):
+ return_data.append(binascii.crc32(data[17]))
+ return return_data
+ # data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+ # #data_set[0]=null,data_set[1]=url
+ # data_value_dic = dict()
+ # for x in xrange(1,len(feature_list)+1):
+ # if(x==1):
+ # data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+ # elif(x==2):
+ # data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+ # elif(x==3):
+ # data_value_dic[feature_list[x-1]] = data_set[x]
+ # elif(x==4):
+ # data_value_dic[feature_list[x-1]] = data_set[x]
+ # elif(x==5):
+ # data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+ # elif(x==6):
+ # data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
+ # return data_value_dic
+
+
+i=0
+sfh_set = list()
+with open(raw_file_address,'r') as infile:
+ with open(ripe_file_address,'w') as outfile:
+ for line in infile:
+ i+=1
+ if(i%10000==0):
+ print i
+ line_return = re.split(r';',line)
+ # if(int(line_return[0])==0):
+ # print 'td is right'
+ outfile.write(str(line_return[0])+',')
+ return_data=data_value.get_feature(line_return)
+ for x in range(19):
+ if(x==18):
+ outfile.write(str(return_data[18])+'\n')
+ else:
+ outfile.write(str(return_data[x])+',')
diff --git a/src/dataset_build/feature_statistics.conf b/src/dataset_build/feature_statistics.conf
new file mode 100644
index 0000000..12cf089
--- /dev/null
+++ b/src/dataset_build/feature_statistics.conf
@@ -0,0 +1,8 @@
+[file]
+raw_file_address = ../../data/td_data_set/td_data_20171207/td.txt
+ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
+[output]
+breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
+[feature]
+type = data_value_statistics
+feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/src/dataset_build/feature_statistics.py b/src/dataset_build/feature_statistics.py
new file mode 100644
index 0000000..52ae8e0
--- /dev/null
+++ b/src/dataset_build/feature_statistics.py
@@ -0,0 +1,164 @@
+import re
+import ConfigParser
+import bisect
+import random
+import ctypes
+import hashlib
+import zlib
+import binascii
+
+term = {'td_len':(lambda x : len(x)==32),
+ 'data_num':(lambda x : len(x)==4),
+        'url':(lambda x : x.find('NULL')),
+ 'sfh_len':(lambda x : len(x)>20),
+ 'not_null':(lambda x : len(x)!=0)}
+
+class data_line(object):
+ """docstring for ClassName"""
+ def __init__(self):
+ super(ClassName, self).__init__()
+
+ @staticmethod
+ def if_error(data_line_str):
+ data_line_val = re.split(r';',data_line_str)
+ hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+ if(term['data_num'](data_line_val) and term['sfh_len'](data_line_val[19]) and term['td_len'](data_line_val[9])\
+ and term['td_len'](data_line_val[2]) and term['td_len'](data_line_val[13]) and term['td_len'](data_line_val[15])\
+ and term['td_len'](data_line_val[17]) and term['not_null'](data_line_val[18]) and term['not_null'](data_line_val[19])\
+ and hashed_len/float(data_line_val[3])>0.8):
+ return data_line_val
+ else:
+ return -1
+
+
+class feature_statistics(object):
+ """YSP feature_statistics"""
+ def __init__(self):
+ super(feature_statistics, self).__init__()
+ self.meida_len_statistics_set = [0,0,0,0,0,0,0]
+ self.lost_dict = dict()
+
+    def meida_len_statistics(self,meida_len):
+ j = bisect.bisect(breakpoints,meida_len)
+ self.meida_len_statistics_set[j-1]+=1
+
+    def data_value_statistics(self,data_value_dic,data_value):
+        data_value_str = str()
+        for x in xrange(0,len(feature_list)):
+            data_value_str = data_value_str+str(data_value_dic[feature_list[x]])+','
+
+        if(data_value_str not in self.lost_dict):
+            self.lost_dict[data_value_str]=[0,0,0.]
+        self.lost_dict[data_value_str][1] += 1
+        if(int(data_value)==1):
+            self.lost_dict[data_value_str][0] += 1
+
+
+class sfh_fingerprint(object):
+
+ def __init__(self,sfh):
+ self.sfh = sfh
+
+ @staticmethod
+ def get_hashed_len(sfh):
+ p = r"\[+\d+?:+\d+?\]"
+ pattern = re.compile(p)
+ hashed_len_set = pattern.findall(sfh)
+ if (term['not_null'](hashed_len_set)):
+ hashed_len = 0
+ for x in xrange(0,len(hashed_len_set)):
+ hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+ hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+ return hashed_len/len(hashed_len_set)
+ else :
+ return -1
+
+ @staticmethod
+ def get_base_sfh(data_set):
+ base_sfh = list()
+ for x in xrange(0,10):
+ base_sfh.append(data_set[x])
+ return base_sfh
+
+
+
+
+class data_value(object):
+
+ @staticmethod
+ def get_data_values(data):
+ data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
+ #data_set[0]=null,data_set[1]=url
+ data_value_dic = dict()
+ for x in xrange(1,len(feature_list)+1):
+            if(x==3):
+                data_value_dic[feature_list[x-1]] = data_set[x]
+            elif(x==4):
+                data_value_dic[feature_list[x-1]] = bisect.bisect(breakpoints,int(data_set[x]))
+            else:
+                # URL, ServerIP, Etag, LastModify: presence flag only
+                data_value_dic[feature_list[x-1]] = 1 if term['not_null'](data_set[x]) else 0
+        return data_value_dic
+
+config = ConfigParser.RawConfigParser()
+config.read("feature_statistics.conf")
+
+feature_statistics_type = config.get("feature","type")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+
+breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+if(feature_statistics_type=="data_value_statistics"):
+    feature_list =[i for i in config.get("feature","feature_name").split(",")]
+# ll=ctypes.cdll.LoadLibrary
+# lib = ll("libmaatframe.so")
+
+i=0
+sfh_set = list()
+statistic = feature_statistics()
+with open(raw_file_address,'r') as infile:
+ for line in infile:
+        i+=1
+
+ line_return = data_line.if_error(line)
+ if(line_return != -1):
+ if(feature_statistics_type=="meida_len_statistics"):
+            statistic.meida_len_statistics(int(line_return[3]))
+ elif(feature_statistics_type=="data_value_statistics"):
+                statistic.data_value_statistics(data_value.get_data_values(line_return[2]),line_return[3])
+
+# summarize lost rates per feature-value combination once the whole file
+# has been read
+lost_list = list()
+for k in statistic.lost_dict:
+    statistic.lost_dict[k][2] = float(statistic.lost_dict[k][0])/statistic.lost_dict[k][1]
+    lost_list.append((k,int(statistic.lost_dict[k][0]),int(statistic.lost_dict[k][1]),float(statistic.lost_dict[k][2])))
+print sorted(lost_list,cmp=lambda x,y:cmp(x[2],y[2]))
+ # if(x == len(feature_list)-1):
+ # outfile.write(data_value_dic[feature_list[x]]+'\n')
+ # else:
+ # print lost
+ # outfile.write(str(data_value_dic[feature_list[x]])+',')
+ # outfile.write(result[3])
+ # sfh_dot=list()
+ # for x in xrange(0,10):
+ # #transform sfh to dot
+ # sfh_dot.append(lib.GIE_sfh_similiarity(result[19],len(result[19]),sfh_set[x],len(sfh_set[x])))
+ # if(len(data_set)==7):
+ # outfile.write(str(data_set[0])+','+str(data_set[1])+','+str(data_set[2])\
+ # +','+str(data_set[3])+','+str(data_set[4])+','+str(data_set[5])+','+result[5]\
+ # +','+result[7]+','+result[9]+','+result[11]+','+result[13]+','+result[15]+result[17]\
+ # +','+result[19]+'\n')
+
+# with open(ripe_file_address,'w') as outfile:
+# outfile.write(str(lost))
diff --git a/src/dataset_build/file_digest.py b/src/dataset_build/file_digest.py
new file mode 100644
index 0000000..590e059
--- /dev/null
+++ b/src/dataset_build/file_digest.py
@@ -0,0 +1,96 @@
+#-*-coding:utf-8-*-
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
+class data_line(object):
+ """docstring for ClassName"""
+ def __init__(self):
+ super(ClassName, self).__init__()
+
+ @staticmethod
+ def if_error(data_line_str):
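+        # A record passes only if all 21 ';'-separated fields validate and
+        # the average hashed sfh span length is at least 80% of the
+        # reported file length.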
+ data_line_val = re.split(r';',data_line_str)
+ hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+ if(term['data_num'](data_line_val) and \
+ term['not_null'](data_line_val[0]) and \
+ term['ysp_len'](data_line_val[3]) and \
+ term['not_null'](data_line_val[4]) and \
+ term['td_len'](data_line_val[6]) and \
+ term['td_len'](data_line_val[8]) and \
+ term['td_len'](data_line_val[10]) and \
+ term['td_len'](data_line_val[12]) and \
+ term['td_len'](data_line_val[14]) and \
+ term['td_len'](data_line_val[16]) and \
+ term['not_null'](data_line_val[18]) and \
+ term['sfh_len'](data_line_val[19]) and \
+ term['not_null'](data_line_val[20]) and \
+ hashed_len/float(data_line_val[3])>=0.8):
+ return data_line_val
+ else:
+ return -1
+
+class TD_fingerprint(object):
+    def __init__(self,td,td_string):
+        self.td = td
+        self.td_string = td_string
+    @staticmethod
+    def td_generate(td_string):
+        return hashlib.md5(td_string.encode('utf-8')).hexdigest()
+
+class sfh_fingerprint(object):
+
+ def __init__(self,sfh):
+ self.sfh = sfh
+
+ @staticmethod
+ def get_hashed_len(sfh):
+ p = r"\[+\d+?:+\d+?\]"
+ pattern = re.compile(p)
+ hashed_len_set = pattern.findall(sfh)
+ if (term['not_null'](hashed_len_set)):
+ hashed_len = 0
+ for x in xrange(0,len(hashed_len_set)):
+ hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+ hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+ return hashed_len/len(hashed_len_set)
+ else :
+ return -1
+
+term = {'td_len':(lambda x : len(x)==32),
+ 'data_num':(lambda x : len(x)==21),
+        'url':(lambda x : x.find('NULL')),
+ 'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
+ 'not_null':(lambda x : len(x)!=0),
+ 'ysp_len':(lambda x : int(x)!=0),
+ 'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
+
+grain="./get_lost"
+ripe_files=[]
+config = ConfigParser.RawConfigParser()
+config.read("grain.conf")
+raw_file_address=config.get("file","raw_file_address")
+ripe_files_address=config.get("file","ripe_files_address")
+print ("%s %s" %(raw_file_address,ripe_files_address))
+num = [0,0,0,0,0,0,0]
+breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+# i=0
+# for i in xrange(0,ripe_file_num):
+# outfile=open(ripe_files_address+str(i)+'.txt','w')
+# ripe_files.append(outfile)
+
+i=0
+with open(raw_file_address,'r') as infile:
+# with open('./ripe_data/mistake_td_sfh1_sfh2_sim_rate_len_url_unequal','r')as infile:
+    with open(ripe_files_address,'w') as outfile:
+ for line in infile:
+ i+=1
+ if(i%10000==0):
+ print i
+ line_return = data_line.if_error(line)
+ if(line_return != -1):
+                outfile.write(line)
\ No newline at end of file
diff --git a/src/dataset_build/get_lost.c b/src/dataset_build/get_lost.c
new file mode 100644
index 0000000..0e6c452
--- /dev/null
+++ b/src/dataset_build/get_lost.c
@@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <MESA/MESA_htable.h>
+#include "gram_index_engine.h"
+#include <assert.h>
+#include <ctype.h>
+#define HTABLE_SIZE 8*64*1024*1024
+#define SFH_PASS_RATE 0.8
+#define SIMILIAR 80
+
+typedef struct td
+{
+ char * tdstr;
+ unsigned int lost;
+}td;
+
+typedef struct file_sfh_data
+{
+ long id;
+ char * sfh;
+ td * td_value;
+ char * td_ori;
+}file_sfh_data;
+
+int main(int argc,char *argv[])
+{
+    FILE *fpread;//read file handle
+ FILE *fpwrite;//write file handle
+ int array_size = 1024;
+    file_sfh_data **file_data=(file_sfh_data **)malloc(sizeof(file_sfh_data*)*array_size);
+ char* dirstr = "../../data/td_data_set/td_data_20171207/td_sfh_lost";
+ //char* dirstr = *++argv;
+ char* writestr = "../../data/td_data_set/td_data_20171207/td.txt";
+ int total_len = 0;
+ char TD_tmp[256], SFH_tmp[1024*300], TD_ORI[1024*10];
+ char buffer[1024*300+1];
+ int ret = 0;
+ int line = 0;
+ int thread_safe = 0;
+ int i;
+    long id;
+ int similiarity;
+ MESA_htable_handle htable = NULL;
+ fpread=fopen(dirstr,"rb");
+ fpwrite=fopen(writestr,"w");
+ printf("file str is %s\n",dirstr);
+ if(fpread==NULL)
+ {
+ printf("open file error\n");
+ return -1;
+ }
+    while(fgets(buffer,sizeof(buffer),fpread)!=NULL)
+    {
+        ret=sscanf(buffer,"%d;%[^;];%[^;];%s",&total_len,TD_ORI,TD_tmp,SFH_tmp);
+ if(ret!=4)
+ {
+ continue;
+ }
+ file_data[line]=(file_sfh_data*)calloc(1,sizeof(file_sfh_data));
+ file_data[line]->id=line;
+ file_data[line]->sfh=strdup(SFH_tmp);
+ file_data[line]->td_value=(td*)calloc(1,sizeof(td));
+ file_data[line]->td_value->tdstr=strdup(TD_tmp);
+ file_data[line]->td_value->lost=0;
+ file_data[line]->td_ori=strdup(TD_ORI);
+ line++;
+ if(line==array_size)
+ {
+ array_size*=2;
+            file_data=realloc(file_data,sizeof(file_sfh_data*)*array_size);
+ }
+ }
+ printf("read file success!\n");
+ htable = NULL;
+ htable=MESA_htable_born();
+ thread_safe = 0;
+ MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
+ unsigned int slot_size=1024*1024*16;
+ MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(slot_size));
+ MESA_htable_mature(htable);
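+    /* The first record seen for each td digest owns the table slot; on a
+     * duplicate td, compare the two sfh fingerprints and flag both records
+     * as lost when their similarity is below SIMILIAR. */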
+ for(i=0;i<line;i++)
+ {
+ if(MESA_htable_add(htable,(char*)(file_data[i]->td_value->tdstr),32,(void *)file_data[i]->id)<0)
+ {
+ id=(long)MESA_htable_search(htable,(char*)file_data[i]->td_value->tdstr,32);
+ similiarity=GIE_sfh_similiarity(file_data[id]->sfh,(int)strlen(file_data[id]->sfh),file_data[i]->sfh,(int)strlen(file_data[i]->sfh));
+ if(similiarity<SIMILIAR)
+ {
+ file_data[id]->td_value->lost = 1;
+ file_data[i]->td_value->lost = 1;
+ }
+ }
+ }
+ for(i=0;i<line;i++)
+ {
+ fprintf(fpwrite,"%s;%s;%s;%d\n",file_data[i]->td_value->tdstr,file_data[i]->sfh,file_data[i]->td_ori,file_data[i]->td_value->lost);
+ }
+ for(i=0;i<line;i++)
+ {
+ free(file_data[i]->sfh);
+ file_data[i]->sfh=NULL;
+ free(file_data[i]->td_value->tdstr);
+ file_data[i]->td_value->tdstr=NULL;
+ free(file_data[i]->td_value);
+ file_data[i]->td_value=NULL;
+ free(file_data[i]->td_ori);
+ file_data[i]->td_ori=NULL;
+ free(file_data[i]);
+ file_data[i]=NULL;
+ }
+ fclose(fpread);
+ fclose(fpwrite);
+ return 0;
+}
\ No newline at end of file
diff --git a/src/dataset_build/grain.conf b/src/dataset_build/grain.conf
new file mode 100644
index 0000000..944b337
--- /dev/null
+++ b/src/dataset_build/grain.conf
@@ -0,0 +1,5 @@
+[file]
+ripe_files_address = ../../data/td_data_set/td_data_20171207/get_lost_raw_data
+raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest
+[output]
+breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
\ No newline at end of file
diff --git a/src/dataset_build/td_classification.py b/src/dataset_build/td_classification.py
new file mode 100644
index 0000000..8d4b97c
--- /dev/null
+++ b/src/dataset_build/td_classification.py
@@ -0,0 +1,5 @@
+from sklearn.datasets import load_iris
+from sklearn import tree
+
+with open() as infile:  # TODO: dataset path not specified yet
+    pass
\ No newline at end of file
diff --git a/src/dataset_build/vedio_id_build.c b/src/dataset_build/vedio_id_build.c
new file mode 100644
index 0000000..9faaa64
--- /dev/null
+++ b/src/dataset_build/vedio_id_build.c
@@ -0,0 +1,171 @@
+/*
+gcc -g vedio_id_build.c -o vedio_id_build -lmaatframe -I../../inc
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define BUFFER_LEN (10*1024)
+#define SFH_PASS_RATE 0.9
+#define SFH_LEN (10*1024)
+#define URL_LEN (10*1024)
+
+typedef struct video_id
+{
+ long id;
+ char *sfh;
+}video_id;
+
+typedef struct cache
+{
+ GIE_digest_t ** GIE_cache;
+ long cache_size;
+ long len;
+}cache;
+
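+/* Sums the (right-left+1) span lengths parsed from the "[left:right]"
+   tokens of an sfh string and returns half of the total. */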
+long get_hashed_len(const char* sfh)
+{
+    char *data=strdup(sfh);
+ char *token=NULL,*sub_token=NULL,*saveptr;
+ long left_offset=0,right_offset=0,hashed_length=0;
+ int ret=0,first=0;
+ for (token = data; ;token= NULL)
+ {
+ sub_token= strtok_r(token,"[", &saveptr);
+ if (sub_token == NULL)
+ {
+ break;
+ }
+ if(first==0)//jump over the first sub string.
+ {
+ first=1;
+ continue;
+ }
+        ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset);
+        if(ret!=2)
+        {
+            free(data);
+            return 0;
+        }
+        hashed_length+=right_offset-left_offset+1;
+    }
+ //printf("hashed length=%ld\n",hashed_length);
+ free(data);
+ return hashed_length/2;
+}
+
+int main(int argc,char *argv[])
+{
+ FILE *video_id_sets_file;
+ FILE *new_sfh_file;
+ const char *video_id_sets_file_dir="../../data/td_data_set/td_data_20171207/video_id_raw_data";
+ const char *new_sfh_file_dir="../../data/ripe_data/td_data_20171207/video_id.txt";
+ char *buffer=NULL;
+ int ret = 0,hashed_len = 0,total_len = 0,resultnum = 0,i = 0;
+ int update = 0,video_id = 0,j = 0;
+ int* temp_int = NULL;
+ float temp_sfh_pass = 0;
+ char *sfh_str,*url_str;
+ GIE_digest_t *sfh_video_id = NULL;
+ GIE_result_t *query_result = NULL;
+ cache *GIE_digest_cache = NULL;
+ video_id_sets_file = fopen(video_id_sets_file_dir,"r+");
+ new_sfh_file = fopen(new_sfh_file_dir,"w");
+ if(video_id_sets_file == NULL)
+ {
+ printf("open video_id_sets_file error\n");
+ return -1;
+ }
+ if(new_sfh_file == NULL)
+ {
+ printf("open new_sfh_file error\n");
+ return -1;
+ }
+ buffer = (char*)calloc(BUFFER_LEN,sizeof(char));
+ GIE_create_para_t *query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t));
+ query_result = (GIE_result_t*)calloc(1,sizeof(GIE_result_t));
+ GIE_handle_t *query_handle;
+ query_para->gram_value = 7;
+ query_para->position_accuracy = 5;
+ query_handle=GIE_create((const GIE_create_para_t *)query_para);
+ free(query_para);
+ if(query_handle==NULL)
+ {
+ printf("create GIE handle error\n");
+ return -1;
+ }
+ sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
+ sfh_str = (char*)calloc(SFH_LEN,sizeof(char));
+ url_str = (char*)calloc(URL_LEN,sizeof(char));
+ i=0;
+ GIE_digest_cache =(cache*)calloc(1,sizeof(cache));
+ GIE_digest_cache->cache_size = 1000;
+ GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*));
+ GIE_digest_cache->len = 0;
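+    /* Query each sfh against the gram index: a miss means a new video, so
+     * the sfh is inserted (batched through GIE_digest_cache) and the
+     * current line number becomes its video id; a hit reuses the id stored
+     * in the matching digest's tag. */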
+    while(fgets(buffer,BUFFER_LEN,video_id_sets_file)!=NULL)
+    {
+        i++;
+        if(i%10000==0)
+        {
+            printf("%d\n",i);
+        }
+        ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%d;%*[^;];%*[^;];%*[^;];\
+            %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
+            %*[^;];%*[^;];%*[^;];%[^;];%[^;]",&total_len,sfh_str,url_str);
+        if(ret!=3||total_len==0)
+ {
+ continue;
+ }
+ hashed_len = get_hashed_len((const char*)sfh_str);
+ temp_sfh_pass = (float)hashed_len/total_len;
+ if(temp_sfh_pass<SFH_PASS_RATE)
+ {
+ continue;
+ }
+ resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,1);
+ if(resultnum == 0)
+ {
+ temp_int=(int*)calloc(1,sizeof(int));
+ *temp_int=i;
+ sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
+ sfh_video_id->id=i;
+ sfh_video_id->sfh_length=strlen(sfh_str);
+ sfh_video_id->operation=GIE_INSERT_OPT;
+ sfh_video_id->cfds_lvl=5;
+ sfh_video_id->sfh=strdup(sfh_str);
+ sfh_video_id->tag=temp_int;
+ GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_video_id;
+ GIE_digest_cache->len++;
+ if(GIE_digest_cache->len==GIE_digest_cache->cache_size)
+ {
+ update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size);
+ GIE_digest_cache->len=0;
+ for(j=0;j<GIE_digest_cache->cache_size;j++)
+ {
+ free(GIE_digest_cache->GIE_cache[j]->sfh);
+ GIE_digest_cache->GIE_cache[j]->sfh=NULL;
+ free(GIE_digest_cache->GIE_cache[j]);
+ GIE_digest_cache->GIE_cache[j]=NULL;
+ }
+ }
+ fprintf(new_sfh_file,"%d,%s",i,buffer);
+ }
+ else
+ {
+ fprintf(new_sfh_file,"%d,%s",*((int*)query_result->tag),buffer);
+ }
+ }
+ free(buffer);
+ free(query_result);
+ free(sfh_video_id);
+ free(url_str);
+ free(sfh_str);
+ free(GIE_digest_cache);
+ return 0;
+}
\ No newline at end of file