| field | value | detail |
|---|---|---|
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 | /dataset_build |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'dataset_build')
| mode | file | deletions |
|---|---|---|
| -rw-r--r-- | dataset_build/CMakeLists.txt | 11 |
| -rw-r--r-- | dataset_build/based_sfh.conf | 3 |
| -rw-r--r-- | dataset_build/based_sfh.py | 44 |
| -rw-r--r-- | dataset_build/cal_information.conf | 5 |
| -rw-r--r-- | dataset_build/cal_information.py | 133 |
| -rw-r--r-- | dataset_build/dataset_build.conf | 8 |
| -rw-r--r-- | dataset_build/dataset_build.py | 144 |
| -rw-r--r-- | dataset_build/feature_statistics.conf | 8 |
| -rw-r--r-- | dataset_build/feature_statistics.py | 164 |
| -rw-r--r-- | dataset_build/file_digest.py | 96 |
| -rw-r--r-- | dataset_build/get_lost.c | 116 |
| -rw-r--r-- | dataset_build/grain.conf | 5 |
| -rw-r--r-- | dataset_build/td_classification.py | 5 |
| -rw-r--r-- | dataset_build/vedio_id_build.c | 171 |
14 files changed, 0 insertions, 913 deletions
diff --git a/dataset_build/CMakeLists.txt b/dataset_build/CMakeLists.txt
deleted file mode 100644
index 8840a74..0000000
--- a/dataset_build/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PROJECT (CALCULATE)
-SET (SRC_LIST get_lost.c)
-SET(CMAKE_BUILD_TYPE "Debug")
-SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g -ggdb")
-SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
-MESSAGE(STATUS "This is BINARY dir" ${CALCULATE_BINARY_DIR})
-MESSAGE(STATUS "This is SOURCE dir" ${CALCULATE_SOURCE_DIR})
-#INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../include/)
-#LINK_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../lib/)
-ADD_EXECUTABLE(get_lost ${SRC_LIST})
-TARGET_LINK_LIBRARIES(get_lost maatframe libMESA_htable.so pthread m)
diff --git a/dataset_build/based_sfh.conf b/dataset_build/based_sfh.conf
deleted file mode 100644
index cdcf4cf..0000000
--- a/dataset_build/based_sfh.conf
+++ /dev/null
@@ -1,3 +0,0 @@
-[file]
-raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest_nots
-ripe_file_address = ../../data/td_data_set/td_data_20171207/base_sfh_set
\ No newline at end of file
diff --git a/dataset_build/based_sfh.py b/dataset_build/based_sfh.py
deleted file mode 100644
index b3281ce..0000000
--- a/dataset_build/based_sfh.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import re
-import ConfigParser
-import bisect
-import random
-
-term = {'not_null':(lambda x : len(x)!=0)}
-
-config = ConfigParser.RawConfigParser()
-config.read("based_sfh.conf")
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-
-class sfh_fingerprint(object):
-
-    def __init__(self,sfh):
-        self.sfh = sfh
-
-    @staticmethod
-    def get_hashed_len(sfh):
-        p = r"\[+\d+?:+\d+?\]"
-        pattern = re.compile(p)
-        hashed_len_set = pattern.findall(sfh)
-        if (term['not_null'](hashed_len_set)):
-            hashed_len = 0
-            for x in xrange(0,len(hashed_len_set)):
-                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
-                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
-            return hashed_len/len(hashed_len_set)
-        else :
-            return -1
-
-i=0
-sfh_set = list()
-with open(raw_file_address,'r') as infile:
-    with open(ripe_file_address,'w') as outfile:
-        for line in infile:
-            i+=1
-            if(i%100000==0):
-                print i
-            result = re.split(r';',line)
-            if(term['not_null'](result[3]) and term['not_null'](result[19])):
-                hashed_len = sfh_fingerprint.get_hashed_len(result[19])
-                if(hashed_len/int(result[3])>0.8):
-                    outfile.write(result[19]+'\n')
\ No newline at end of file
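Review note on based_sfh.py (above): `get_hashed_len` averages the lengths of the `[start:end]` spans embedded in an SFH string, and the caller then compares the result against `MediaLen` with Python 2 integer division, so `hashed_len/int(result[3])>0.8` can only pass when the quotient is at least 1 — the intended 80% threshold silently became 100%. A minimal Python 3 sketch of what the coverage gate presumably intended; `hashed_coverage` and its total-bytes-covered definition are assumptions, not the repository's API:

```python
import re

RANGE_RE = re.compile(r"\[(\d+):(\d+)\]")

def hashed_coverage(sfh: str, media_len: int) -> float:
    """Fraction of the media covered by the SFH's hashed [start:end] ranges."""
    spans = [(int(a), int(b)) for a, b in RANGE_RE.findall(sfh)]
    if not spans or media_len <= 0:
        return -1.0
    covered = sum(b - a for a, b in spans)
    # True division; the deleted code's integer division rounded this to 0.
    return covered / media_len

# keep a record only when at least 80% of the file was hashed, e.g.:
# if hashed_coverage(fields[19], int(fields[3])) > 0.8: ...
```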
diff --git a/dataset_build/cal_information.conf b/dataset_build/cal_information.conf
deleted file mode 100644
index 1571b8b..0000000
--- a/dataset_build/cal_information.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-[file]
-raw_file_address = ../../data/ripe_data/td_data_20171207/video_id.txt
-ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
-[feature]
-feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/dataset_build/cal_information.py b/dataset_build/cal_information.py
deleted file mode 100644
index 19cd95c..0000000
--- a/dataset_build/cal_information.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import re
-import numpy
-import ConfigParser
-import binascii
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==4),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20),
-    'not_null':(lambda x : len(x)!=0)}
-
-class calculation(object):
-    """docstring for calculation"""
-    def __init__(self, arg):
-        super(calculation, self).__init__()
-        self.arg = arg
-
-    @staticmethod
-    def cal_ent(x):
-        x_value_list = set([x[i] for i in range(x.shape[0])])
-        ent = 0.0
-        num_0 = x[x == 0].shape[0]
-        for x_value in x_value_list:
-            if(x_value==0):
-                continue
-            p = float(x[x == x_value].shape[0])/(x.shape[0]- num_0)
-            logp = numpy.log2(p)
-            ent -=p*logp
-        return ent
-
-class data_value(object):
-    """docstring for data_value"""
-    def __init__(self, arg):
-        super(data_value, self).__init__()
-        self.arg = arg
-
-    @staticmethod
-    def get_data_values(data):
-        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
-        #data_set[0]=null,data_set[1]=url
-        data_value_dic = [long(0)]*6
-        for x in xrange(1,len(feature_list)+1):
-            if(x==1):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[0] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[0] = 0
-            elif(x==2):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[1] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[1] = 0
-            elif(x==3):
-                data_value_dic[2] = long(data_set[x])
-            elif(x==4):
-                data_value_dic[3] = long(data_set[x])
-            elif(x==5):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[4] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[4] = 0
-            elif(x==6):
-                if(term['not_null'](data_set[x])==True):
-                    data_value_dic[5] = binascii.crc32(data_set[x])
-                else:
-                    data_value_dic[5] = 0
-        return data_value_dic
-
-config = ConfigParser.RawConfigParser()
-config.read("cal_information.conf")
-
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-feature_list =[i for i in config.get("feature","feature_name").split(",")]
-
-i=0
-with open(raw_file_address,'r') as infile:
-    for line in infile:
-        i+=1
-        if(i%10000==0):
-            print i
-        if(i==50000):
-            break
-        line_split = re.split(";",line)
-        data_value_temp = data_value.get_data_values(line_split[5])
-        data_value_temp.extend([binascii.crc32(line_split[j]) for j in range(6,19)])
-        data_value_temp.append(binascii.crc32(line_split[0]))
-        if(i==1):
-            a=numpy.array(data_value_temp)
-        else:
-            a=numpy.row_stack((a,numpy.array(data_value_temp)))
-
-for i in range(20):
-    if(i==0):
-        print "URL:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==1):
-        print "ServerIP:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==2):
-        print "MediaType:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==3):
-        print "MediaLen:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==4):
-        print "Etag:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==5):
-        print "LastModify:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==6):
-        print "td_0k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==7):
-        print "td_data_md5_1k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==8):
-        print "td_1k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==9):
-        print "td_data_md5_2k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==10):
-        print "td_2k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==11):
-        print "td_data_md5_4k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==12):
-        print "td_4k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==13):
-        print "td_data_md5_8k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==14):
-        print "td_8k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==15):
-        print "td_data_md5_16k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==16):
-        print "td_16k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==17):
-        print "td_data_md5_32k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==18):
-        print "td_32k:"+str(calculation.cal_ent(a[:,i]))
-    elif(i==19):
-        print "id:"+str(calculation.cal_ent(a[:,i]))
-
diff --git a/dataset_build/dataset_build.conf b/dataset_build/dataset_build.conf
deleted file mode 100644
index 400e160..0000000
--- a/dataset_build/dataset_build.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-[file]
-raw_file_address = ../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level
-ripe_file_address = ../../data/td_data_set/td_data_20171207/td_dataset
-base_sfh_sets = ../../data/td_data_set/td_data_20171207/base_sfh_set
-[output]
-breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
-[feature]
-feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/dataset_build/dataset_build.py b/dataset_build/dataset_build.py
deleted file mode 100644
index a832072..0000000
--- a/dataset_build/dataset_build.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import re
-import ConfigParser
-import bisect
-import random
-import ctypes
-import hashlib
-import zlib
-import binascii
-import json
-import datetime
-import time
-
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==21),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
-    'not_null':(lambda x : len(x)!=0),
-    'ysp_len':(lambda x : int(x)!=0),
-    'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
-
-config = ConfigParser.RawConfigParser()
-config.read("dataset_build.conf")
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-base_sfh_sets = config.get("file","base_sfh_sets")
-breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
-feature_list =[i for i in config.get("feature","feature_name").split(",")]
-ll=ctypes.cdll.LoadLibrary
-lib = ll("libmaatframe.so")
-lost = dict()
-
-
-class data_value(object):
-
-    @staticmethod
-    def get_feature(data):
-        return_data=list()
-        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data[5])
-        for x in xrange(1,21):
-            if(x==1):
-                if(term['not_null'](data_set[6])):
-                    try:
-                        time1=datetime.datetime.strptime(data[1],'%Y-%m-%d %H:%M:%S')+datetime.timedelta(hours=int(8))
-                        data_set[6]=data_set[6][0:25]
-                        time2=datetime.datetime.strptime(data_set[6],'%a, %d %b %Y %H:%M:%S')
-                    except Exception, e:
-                        return_data.append(-1)
-                        return_data.append(-1)
-                        return_data.append(-1)
-                        return_data.append(-1)
-                    else:
-                        return_data.append(str((time1-time2).seconds))
-                        return_data.append(((time1-time2).seconds)/60)
-                        return_data.append(((time1-time2).seconds)/3600)
-                        return_data.append((time1-time2).days)
-                else:
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-            elif(x==2):
-                continue
-            elif(x==3):
-                continue
-            elif(x==4):
-                return_data.append(long(data[4]))
-            elif(x==5):
-                if(term['not_null'](data_set[1])):
-                    return_data.append(len(data_set[1]))
-                else:
-                    return_data.append(-1)
-                if(term['not_null'](data_set[2])):
-                    ip_set=re.split(r'\.',data_set[2])
-                    return_data.append(ip_set[0])
-                    return_data.append(ip_set[1])
-                    return_data.append(ip_set[2])
-                    return_data.append(ip_set[3])
-                else:
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-                    return_data.append(-1)
-                if(term['not_null'](data_set[3])):
-                    return_data.append(int(data_set[3]))
-                else:
-                    return_data.append(-1)
-                if(term['not_null'](data_set[5])):
-                    return_data.append(binascii.crc32(data_set[5]))
-                else:
-                    return_data.append(-1)
-                if(term['not_null'](data_set[6])):
-                    return_data.append(binascii.crc32(data_set[6]))
-                else:
-                    return_data.append(-1)
-            elif(x==7):
-                return_data.append(binascii.crc32(data[7]))
-            elif(x==9):
-                return_data.append(binascii.crc32(data[9]))
-            elif(x==11):
-                return_data.append(binascii.crc32(data[11]))
-            elif(x==13):
-                return_data.append(binascii.crc32(data[13]))
-            elif(x==15):
-                return_data.append(binascii.crc32(data[15]))
-            elif(x==17):
-                return_data.append(binascii.crc32(data[17]))
-        return return_data
-    # data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
-    # #data_set[0]=null,data_set[1]=url
-    # data_value_dic = dict()
-    # for x in xrange(1,len(feature_list)+1):
-    #     if(x==1):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     elif(x==2):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     elif(x==3):
-    #         data_value_dic[feature_list[x-1]] = data_set[x]
-    #     elif(x==4):
-    #         data_value_dic[feature_list[x-1]] = data_set[x]
-    #     elif(x==5):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     elif(x==6):
-    #         data_value_dic[feature_list[x-1]] = binascii.crc32(data_set[x])
-    #     return data_value_dic
-
-
-i=0
-sfh_set = list()
-with open(raw_file_address,'r') as infile:
-    with open(ripe_file_address,'w') as outfile:
-        for line in infile:
-            i+=1
-            if(i%10000==0):
-                print i
-            line_return = re.split(r';',line)
-            # if(int(line_return[0])==0):
-            #     print 'td is right'
-            outfile.write(str(line_return[0])+',')
-            return_data=data_value.get_feature(line_return)
-            for x in range(19):
-                if(x==18):
-                    outfile.write(str(return_data[18])+'\n')
-                else:
-                    outfile.write(str(return_data[x])+',')
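Review note on dataset_build.py (above): the `x==1` branch of `get_feature` turns the capture timestamp (shifted to UTC+8) and the HTTP `LastModify` date into four staleness features, but it uses `timedelta.seconds`, which is only the sub-day remainder, so the second/minute/hour features wrap every 24 hours. A hedged Python 3 sketch of the presumably intended computation; `age_features` is a hypothetical helper, not the repository's code:

```python
from datetime import datetime, timedelta

def age_features(capture: str, last_modify: str) -> list:
    """Age of the resource at capture time, as [seconds, minutes, hours, days]."""
    t1 = datetime.strptime(capture, "%Y-%m-%d %H:%M:%S") + timedelta(hours=8)
    t2 = datetime.strptime(last_modify[:25], "%a, %d %b %Y %H:%M:%S")
    delta = t1 - t2
    secs = delta.total_seconds()          # not .seconds, which wraps at one day
    return [int(secs), int(secs // 60), int(secs // 3600), delta.days]
```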
diff --git a/dataset_build/feature_statistics.conf b/dataset_build/feature_statistics.conf
deleted file mode 100644
index 12cf089..0000000
--- a/dataset_build/feature_statistics.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-[file]
-raw_file_address = ../../data/td_data_set/td_data_20171207/td.txt
-ripe_file_address = ../../data/td_data_set/td_data_20171207/td_data_set_statistic
-[output]
-breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,44194304
-[feature]
-type = data_value_statistics
-feature_name = URL,ServerIP,MediaType,MediaLen,Etag,LastModify
\ No newline at end of file
diff --git a/dataset_build/feature_statistics.py b/dataset_build/feature_statistics.py
deleted file mode 100644
index 52ae8e0..0000000
--- a/dataset_build/feature_statistics.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import re
-import ConfigParser
-import bisect
-import random
-import ctypes
-import hashlib
-import zlib
-import binascii
-
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==4),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20),
-    'not_null':(lambda x : len(x)!=0)}
-
-class data_line(object):
-    """docstring for ClassName"""
-    def __init__(self):
-        super(ClassName, self).__init__()
-
-    @staticmethod
-    def if_error(data_line_str):
-        data_line_val = re.split(r';',data_line_str)
-        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
-        if(term['data_num'](data_line_val) and term['sfh_len'](data_line_val[19]) and term['td_len'](data_line_val[9])\
-            and term['td_len'](data_line_val[2]) and term['td_len'](data_line_val[13]) and term['td_len'](data_line_val[15])\
-            and term['td_len'](data_line_val[17]) and term['not_null'](data_line_val[18]) and term['not_null'](data_line_val[19])\
-            and hashed_len/float(data_line_val[3])>0.8):
-            return data_line_val
-        else:
-            return -1
-
-
-class feature_statistics(object):
-    """YSP feature_statistics"""
-    def __init__(self):
-        super(feature_statistics, self).__init__()
-        self.meida_len_statistics_set = [0,0,0,0,0,0,0]
-        self.lost_dict = dict()
-
-    def meida_len_statistics(meida_len):
-        j = bisect.bisect(breakpoints,meida_len)
-        self.meida_len_statistics_set[j-1]+=1
-
-    def data_value_statistics(data_value_dic,data_value):
-        data_value_str = str()
-        for x in xrange(0,len(feature_list)):
-            data_value_str = data_value_str+str(data_value_dic[feature_list[x]])+','
-
-        if(self.lost_dict.has_key(data_value_str)==False):
-            self.lost_dict[data_value_str]=[0,1,0.]
-        else:
-            if (int(result[3])==1):
-                self.lost_dict[data_value_str][0] += 1
-                self.lost_dict[data_value_str][1] += 1
-            else:
-                self.lost_dict[data_value_str][1] += 1
-
-
-class sfh_fingerprint(object):
-
-    def __init__(self,sfh):
-        self.sfh = sfh
-
-    @staticmethod
-    def get_hashed_len(sfh):
-        p = r"\[+\d+?:+\d+?\]"
-        pattern = re.compile(p)
-        hashed_len_set = pattern.findall(sfh)
-        if (term['not_null'](hashed_len_set)):
-            hashed_len = 0
-            for x in xrange(0,len(hashed_len_set)):
-                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
-                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
-            return hashed_len/len(hashed_len_set)
-        else :
-            return -1
-
-    @staticmethod
-    def get_base_sfh(data_set):
-        base_sfh = list()
-        for x in xrange(0,10):
-            base_sfh.append(data_set[x])
-        return base_sfh
-
-
-
-
-class data_value(object):
-
-    @staticmethod
-    def get_data_values(data):
-        data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data)
-        #data_set[0]=null,data_set[1]=url
-        data_value_dic = dict()
-        for x in xrange(1,len(feature_list)+1):
-            if(x==1):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-            elif(x==2):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-            elif(x==3):
-                data_value_dic[feature_list[x-1]] = data_set[x]
-            elif(x==4):
-                data_value_dic[feature_list[x-1]] = bisect.bisect(breakpoints,int(data_set[x]))
-            elif(x==5):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-            elif(x==6):
-                data_value_dic[feature_list[x-1]] = 0 if(term['not_null']==False) else 1
-        return data_value_dic
-
-config = ConfigParser.RawConfigParser()
-config.read("feature_statistics.conf")
-
-feature_statistics_type = ("feature","type")
-raw_file_address = config.get("file","raw_file_address")
-ripe_file_address = config.get("file","ripe_file_address")
-
-if(feature_statistics_type=="meida_len_statistics"):
-    breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
-elif(feature_statistics_type=="data_value_statistics"):
-    feature_list =[i for i in config.get("feature","feature_name").split(",")]
-# ll=ctypes.cdll.LoadLibrary
-# lib = ll("libmaatframe.so")
-
-i=0
-sfh_set = list()
-statistic = feature_statistics()
-with open(raw_file_address,'r') as infile:
-    for line in infile:
-        i+=1
-
-
-
-
-        line_return = data_line.if_error(line)
-        if(line_return != -1):
-            if(feature_statistics_type=="meida_len_statistics"):
-                statistic.meida_len_statistics(line_return[3])
-            elif(feature_statistics_type=="data_value_statistics"):
-                lost_list = list()
-                statistic.meida_len_statistics(line_return)
-                for i in statistic.lost:
-                    (statistic.lost[i])[2] = float((statistic.lost[i])[0])/(statistic.lost[i])[1]
-                    tmp = (i,int((statistic.lost[i])[0]),int((statistic.lost[i])[1]),float((statistic.lost[i])[2]))
-                    lost_list.append(tmp)
-                print sorted(lost_list,cmp=lambda x,y:cmp(x[2],y[2]))
-#     if(x == len(feature_list)-1):
-#         outfile.write(data_value_dic[feature_list[x]]+'\n')
-#     else:
-#         print lost
-#         outfile.write(str(data_value_dic[feature_list[x]])+',')
-# outfile.write(result[3])
-# sfh_dot=list()
-# for x in xrange(0,10):
-#     #transform sfh to dot
-#     sfh_dot.append(lib.GIE_sfh_similiarity(result[19],len(result[19]),sfh_set[x],len(sfh_set[x])))
-# if(len(data_set)==7):
-#     outfile.write(str(data_set[0])+','+str(data_set[1])+','+str(data_set[2])\
-#         +','+str(data_set[3])+','+str(data_set[4])+','+str(data_set[5])+','+result[5]\
-#         +','+result[7]+','+result[9]+','+result[11]+','+result[13]+','+result[15]+result[17]\
-#         +','+result[19]+'\n')
-
-# with open(ripe_file_address,'w') as outfile:
-#     outfile.write(str(lost))
diff --git a/dataset_build/file_digest.py b/dataset_build/file_digest.py
deleted file mode 100644
index 590e059..0000000
--- a/dataset_build/file_digest.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#-*-coding:utf-8-*-
-import re
-import random
-import ConfigParser
-import bisect
-import commands
-import os
-import hashlib
-
-class data_line(object):
-    """docstring for ClassName"""
-    def __init__(self):
-        super(ClassName, self).__init__()
-
-    @staticmethod
-    def if_error(data_line_str):
-        data_line_val = re.split(r';',data_line_str)
-        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
-        if(term['data_num'](data_line_val) and \
-            term['not_null'](data_line_val[0]) and \
-            term['ysp_len'](data_line_val[3]) and \
-            term['not_null'](data_line_val[4]) and \
-            term['td_len'](data_line_val[6]) and \
-            term['td_len'](data_line_val[8]) and \
-            term['td_len'](data_line_val[10]) and \
-            term['td_len'](data_line_val[12]) and \
-            term['td_len'](data_line_val[14]) and \
-            term['td_len'](data_line_val[16]) and \
-            term['not_null'](data_line_val[18]) and \
-            term['sfh_len'](data_line_val[19]) and \
-            term['not_null'](data_line_val[20]) and \
-            hashed_len/float(data_line_val[3])>=0.8):
-            return data_line_val
-        else:
-            return -1
-
-class TD_fingerprint(object):
-    def __init__():
-        self.td = td
-        self.td_string = td_string
-    @staticmethod
-    def td_generate(td_string):
-        td_val = hashlib.md5(td_string,encode('utf-8')).hexdigest()
-
-class sfh_fingerprint(object):
-
-    def __init__(self,sfh):
-        self.sfh = sfh
-
-    @staticmethod
-    def get_hashed_len(sfh):
-        p = r"\[+\d+?:+\d+?\]"
-        pattern = re.compile(p)
-        hashed_len_set = pattern.findall(sfh)
-        if (term['not_null'](hashed_len_set)):
-            hashed_len = 0
-            for x in xrange(0,len(hashed_len_set)):
-                hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
-                hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
-            return hashed_len/len(hashed_len_set)
-        else :
-            return -1
-
-term = {'td_len':(lambda x : len(x)==32),
-    'data_num':(lambda x : len(x)==21),
-    'url':(lambda x : x.find['NUll']),
-    'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
-    'not_null':(lambda x : len(x)!=0),
-    'ysp_len':(lambda x : int(x)!=0),
-    'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
-
-grain="./get_lost"
-ripe_files=[]
-config = ConfigParser.RawConfigParser()
-config.read("grain.conf")
-raw_file_address=config.get("file","raw_file_address")
-ripe_files_address=config.get("file","ripe_files_address")
-print ("%s %s" %(raw_file_address,ripe_files_address))
-num = [0,0,0,0,0,0,0]
-breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
-# i=0
-# for i in xrange(0,ripe_file_num):
-#     outfile=open(ripe_files_address+str(i)+'.txt','w')
-#     ripe_files.append(outfile)
-
-i=0
-with open(raw_file_address,'r') as infile:
-# with open('./ripe_data/mistake_td_sfh1_sfh2_sim_rate_len_url_unequal','r')as infile:
    -with open(ripe_files_address,'w')as outfile:
-        for line in infile:
-            i+=1
-            if(i%10000==0):
-                print i
-            line_return = data_line.if_error(line)
-            if(line_return != -1):
-                outfile.write(str(line))
\ No newline at end of file
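Review note on feature_statistics.py and file_digest.py (above): both scripts gate records through the shared `term` predicate table, but two entries are broken as written — `term['url']` is `x.find['NUll']`, which subscripts a bound method and raises `TypeError` if ever evaluated, and `TD_fingerprint.td_generate` calls `hashlib.md5(td_string,encode('utf-8'))` with a comma where `.encode` was meant. (feature_statistics.py also assigns `feature_statistics_type` the literal tuple `("feature","type")` instead of `config.get("feature","type")`, so neither statistics branch ever runs.) A corrected Python 3 rendering of the predicate table; the intent of the `url` check is my guess:

```python
import hashlib

TERM = {
    'td_len':   lambda x: len(x) == 32,              # well-formed MD5 hex digest
    'data_num': lambda x: len(x) == 21,              # expected field count per record
    'url':      lambda x: 'NULL' not in x,           # assumed intent of x.find['NUll']
    'sfh_len':  lambda x: 20 < len(x) < 10 * 1024 - 100,
    'not_null': lambda x: len(x) != 0,
    'ysp_len':  lambda x: int(x) != 0,
}

def td_generate(td_string: str) -> str:
    """MD5 hex digest of a TD string (the stray comma in the deleted code broke this)."""
    return hashlib.md5(td_string.encode('utf-8')).hexdigest()
```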
diff --git a/dataset_build/get_lost.c b/dataset_build/get_lost.c
deleted file mode 100644
index 0e6c452..0000000
--- a/dataset_build/get_lost.c
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <MESA/MESA_htable.h>
-#include <assert.h>
-#include <ctype.h>
-#define HTABLE_SIZE 8*64*1024*1024
-#define SFH_PASS_RATE 0.8
-#define SIMILIAR 80
-
-typedef struct td
-{
-    char * tdstr;
-    unsigned int lost;
-}td;
-
-typedef struct file_sfh_data
-{
-    long id;
-    char * sfh;
-    td * td_value;
-    char * td_ori;
-}file_sfh_data;
-
-int main(int argc,char *argv[])
-{
-    FILE *fpread;//read file handle
-    FILE *fpwrite;//write file handle
-    int array_size = 1024;
-    file_sfh_data **file_data=(file_sfh_data **)malloc(sizeof(file_sfh_data)*array_size);
-    char* dirstr = "../../data/td_data_set/td_data_20171207/td_sfh_lost";
-    //char* dirstr = *++argv;
-    char* writestr = "../../data/td_data_set/td_data_20171207/td.txt";
-    int total_len = 0;
-    char TD_tmp[256], SFH_tmp[1024*300], TD_ORI[1024*10];
-    char buffer[1024*300+1];
-    int ret = 0;
-    int line = 0;
-    int thread_safe = 0;
-    int i;
-    int id;
-    int similiarity;
-    MESA_htable_handle htable = NULL;
-    fpread=fopen(dirstr,"rb");
-    fpwrite=fopen(writestr,"w");
-    printf("file str is %s\n",dirstr);
-    if(fpread==NULL)
-    {
-        printf("open file error\n");
-        return -1;
-    }
-    buffer[sizeof(buffer)]='\0';
-    while(feof(fpread)==0)
-    {
-        fgets(buffer,sizeof(buffer)-1,fpread);
-        ret=sscanf(buffer,"%d;%[^;];%[^;];%s",&total_len,TD_ORI,TD_tmp,SFH_tmp);
-        if(ret!=4)
-        {
-            continue;
-        }
-        file_data[line]=(file_sfh_data*)calloc(1,sizeof(file_sfh_data));
-        file_data[line]->id=line;
-        file_data[line]->sfh=strdup(SFH_tmp);
-        file_data[line]->td_value=(td*)calloc(1,sizeof(td));
-        file_data[line]->td_value->tdstr=strdup(TD_tmp);
-        file_data[line]->td_value->lost=0;
-        file_data[line]->td_ori=strdup(TD_ORI);
-        line++;
-        if(line==array_size)
-        {
-            array_size*=2;
-            file_data=realloc(file_data,sizeof(file_sfh_data)*array_size);
-        }
-    }
-    printf("read file success!\n");
-    htable = NULL;
-    htable=MESA_htable_born();
-    thread_safe = 0;
-    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
-    unsigned int slot_size=1024*1024*16;
-    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(slot_size));
-    MESA_htable_mature(htable);
-    for(i=0;i<line;i++)
-    {
-        if(MESA_htable_add(htable,(char*)(file_data[i]->td_value->tdstr),32,(void *)file_data[i]->id)<0)
-        {
-            id=(long)MESA_htable_search(htable,(char*)file_data[i]->td_value->tdstr,32);
-            similiarity=GIE_sfh_similiarity(file_data[id]->sfh,(int)strlen(file_data[id]->sfh),file_data[i]->sfh,(int)strlen(file_data[i]->sfh));
-            if(similiarity<SIMILIAR)
-            {
-                file_data[id]->td_value->lost = 1;
-                file_data[i]->td_value->lost = 1;
-            }
-        }
-    }
-    for(i=0;i<line;i++)
-    {
-        fprintf(fpwrite,"%s;%s;%s;%d\n",file_data[i]->td_value->tdstr,file_data[i]->sfh,file_data[i]->td_ori,file_data[i]->td_value->lost);
-    }
-    for(i=0;i<line;i++)
-    {
-        free(file_data[i]->sfh);
-        file_data[i]->sfh=NULL;
-        free(file_data[i]->td_value->tdstr);
-        file_data[i]->td_value->tdstr=NULL;
-        free(file_data[i]->td_value);
-        file_data[i]->td_value=NULL;
-        free(file_data[i]->td_ori);
-        file_data[i]->td_ori=NULL;
-        free(file_data[i]);
-        file_data[i]=NULL;
-    }
-    fclose(fpread);
-    fclose(fpwrite);
-    return 0;
-}
\ No newline at end of file
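Review note on get_lost.c (above): the program loads every record, deduplicates by TD digest through a MESA hash table, and flags both members of any TD collision whose SFHs score below 80 on `GIE_sfh_similiarity`. Two memory issues worth noting for anyone resurrecting it: `buffer[sizeof(buffer)]='\0'` writes one byte past the array (it should index `sizeof(buffer)-1`), and the `file_data` array of pointers is sized with `sizeof(file_sfh_data)` where `sizeof(file_sfh_data*)` was meant (over-allocation, wasteful but not unsafe). The core marking logic, restated as a hedged Python sketch — `sfh_similarity` is a stand-in for the library call:

```python
SIMILAR = 80  # same threshold as the C code

def mark_lost(records, sfh_similarity):
    """records: list of (td_digest, sfh). Returns a parallel list of lost flags."""
    first_by_td = {}                      # td digest -> index of first occurrence
    lost = [0] * len(records)
    for i, (td, sfh) in enumerate(records):
        j = first_by_td.setdefault(td, i)
        if j != i and sfh_similarity(records[j][1], sfh) < SIMILAR:
            lost[i] = lost[j] = 1         # same TD but dissimilar content
    return lost
```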
diff --git a/dataset_build/grain.conf b/dataset_build/grain.conf
deleted file mode 100644
index 944b337..0000000
--- a/dataset_build/grain.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-[file]
-ripe_files_address = ../../data/td_data_set/td_data_20171207/get_lost_raw_data
-raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest
-[output]
-breakpoints = 1048576,1310720,1572864,1835008,2097152,3145728,4194304
\ No newline at end of file
diff --git a/dataset_build/td_classification.py b/dataset_build/td_classification.py
deleted file mode 100644
index 8d4b97c..0000000
--- a/dataset_build/td_classification.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from sklearn.datasets import load_iris
-from sklearn import tree
-
-with open() as infile:
-
\ No newline at end of file
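Review note on td_classification.py (above): the file was an unfinished stub — `with open() as infile:` has no argument and the `load_iris` import is unused. Judging by the imports, it was headed toward a scikit-learn decision tree over the CSV that dataset_build.py emits (label in column 0, 19 features after it). A hedged sketch of that direction; the `td_dataset` filename comes from dataset_build.conf, everything else is an assumption:

```python
import numpy as np
from sklearn import tree

data = np.loadtxt("td_dataset", delimiter=",")   # label, then 19 features per row
X, y = data[:, 1:], data[:, 0]
clf = tree.DecisionTreeClassifier().fit(X, y)
```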
diff --git a/dataset_build/vedio_id_build.c b/dataset_build/vedio_id_build.c
deleted file mode 100644
index 9faaa64..0000000
--- a/dataset_build/vedio_id_build.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-gcc -g vedio_id_build.c -o vedio_id_build -lmaatframe -I../../inc
-*/
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include "gram_index_engine.h"
-#include <MESA/MESA_htable.h>
-#include <assert.h>
-#include <ctype.h>
-#define BUFFER_LEN (10*1024)
-#define SFH_PASS_RATE 0.9
-#define SFH_LEN (10*1024)
-#define URL_LEN (10*1024)
-
-typedef struct video_id
-{
-    long id;
-    char *sfh;
-}video_id;
-
-typedef struct cache
-{
-    GIE_digest_t ** GIE_cache;
-    long cache_size;
-    long len;
-}cache;
-
-long get_hashed_len(const char* sfh)
-{
-    char *data=(char*)malloc(strlen(sfh)+1);
-    memcpy(data,sfh, strlen(sfh));
-    data[strlen(sfh)]='\0';
-    char *token=NULL,*sub_token=NULL,*saveptr;
-    long left_offset=0,right_offset=0,hashed_length=0;
-    int ret=0,first=0;
-    for (token = data; ;token= NULL)
-    {
-        sub_token= strtok_r(token,"[", &saveptr);
-        if (sub_token == NULL)
-        {
-            break;
-        }
-        if(first==0)//jump over the first sub string.
-        {
-            first=1;
-            continue;
-        }
-        ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset);
-        if(ret!=2)
-        {
-            return 0;
-        }
-        assert(ret==2);
-        hashed_length+=right_offset-left_offset+1;
-    }
-    //printf("hashed length=%ld\n",hashed_length);
-    free(data);
-    return hashed_length/2;
-}
-
-int main(int argc,char *argv[])
-{
-    FILE *video_id_sets_file;
-    FILE *new_sfh_file;
-    const char *video_id_sets_file_dir="../../data/td_data_set/td_data_20171207/video_id_raw_data";
-    const char *new_sfh_file_dir="../../data/ripe_data/td_data_20171207/video_id.txt";
-    char *buffer=NULL;
-    int ret = 0,hashed_len = 0,total_len = 0,resultnum = 0,i = 0;
-    int update = 0,video_id = 0,j = 0;
-    int* temp_int = NULL;
-    float temp_sfh_pass = 0;
-    char *sfh_str,*url_str;
-    GIE_digest_t *sfh_video_id = NULL;
-    GIE_result_t *query_result = NULL;
-    cache *GIE_digest_cache = NULL;
-    video_id_sets_file = fopen(video_id_sets_file_dir,"r+");
-    new_sfh_file = fopen(new_sfh_file_dir,"w");
-    if(video_id_sets_file == NULL)
-    {
-        printf("open video_id_sets_file error\n");
-        return -1;
-    }
-    if(new_sfh_file == NULL)
-    {
-        printf("open new_sfh_file error\n");
-        return -1;
-    }
-    buffer = (char*)calloc(BUFFER_LEN,sizeof(char));
-    GIE_create_para_t *query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t));
-    query_result = (GIE_result_t*)calloc(1,sizeof(GIE_result_t));
-    GIE_handle_t *query_handle;
-    query_para->gram_value = 7;
-    query_para->position_accuracy = 5;
-    query_handle=GIE_create((const GIE_create_para_t *)query_para);
-    free(query_para);
-    if(query_handle==NULL)
-    {
-        printf("create GIE handle error\n");
-        return -1;
-    }
-    sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
-    sfh_str = (char*)calloc(SFH_LEN,sizeof(char));
-    url_str = (char*)calloc(URL_LEN,sizeof(char));
-    i=0;
-    GIE_digest_cache =(cache*)calloc(1,sizeof(cache));
-    GIE_digest_cache->cache_size = 1000;
-    GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*));
-    GIE_digest_cache->len = 0;
-    while(feof(video_id_sets_file)==0)
-    {
-        i++;
-        if(i%10000==0)
-        {
-            printf("%d\n",i);
-        }
-        fgets(buffer,BUFFER_LEN-1,video_id_sets_file);
-        ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
-            %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
-            %*[^;];%*[^;];%*[^;];%[^;];%[^;]",sfh_str,url_str);
-        if(ret!=2)
-        {
-            continue;
-        }
-        hashed_len = get_hashed_len((const char*)sfh_str);
-        temp_sfh_pass = (float)hashed_len/total_len;
-        if(temp_sfh_pass<SFH_PASS_RATE)
-        {
-            continue;
-        }
-        resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,1);
-        if(resultnum == 0)
-        {
-            temp_int=(int*)calloc(1,sizeof(int));
-            *temp_int=i;
-            sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
-            sfh_video_id->id=i;
-            sfh_video_id->sfh_length=strlen(sfh_str);
-            sfh_video_id->operation=GIE_INSERT_OPT;
-            sfh_video_id->cfds_lvl=5;
-            sfh_video_id->sfh=strdup(sfh_str);
-            sfh_video_id->tag=temp_int;
-            GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_video_id;
-            GIE_digest_cache->len++;
-            if(GIE_digest_cache->len==GIE_digest_cache->cache_size)
-            {
-                update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size);
-                GIE_digest_cache->len=0;
-                for(j=0;j<GIE_digest_cache->cache_size;j++)
-                {
-                    free(GIE_digest_cache->GIE_cache[j]->sfh);
-                    GIE_digest_cache->GIE_cache[j]->sfh=NULL;
-                    free(GIE_digest_cache->GIE_cache[j]);
-                    GIE_digest_cache->GIE_cache[j]=NULL;
-                }
-            }
-            fprintf(new_sfh_file,"%d,%s",i,buffer);
-        }
-        else
-        {
-            fprintf(new_sfh_file,"%d,%s",*((int*)query_result->tag),buffer);
-        }
-    }
-    free(buffer);
-    free(query_result);
-    free(sfh_video_id);
-    free(url_str);
-    free(sfh_str);
-    free(GIE_digest_cache);
-    return 0;
-}
\ No newline at end of file
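Review note on vedio_id_build.c (above): the program assigns a video id to each record by querying a GIE gram index for a near-duplicate SFH — a hit reuses the matched record's id, a miss registers the SFH under the current line number, batching inserts 1000 at a time through `GIE_update`. Two apparent bugs: every field before the SFH is `%*`-suppressed in the `sscanf` format, so `total_len` stays 0 and the `SFH_PASS_RATE` gate divides by zero (as floating-point infinity or NaN it never rejects anything); and the first `calloc` of `sfh_video_id` is leaked when the loop reallocates it. The assignment loop as a hedged Python sketch, with `index.query`/`index.insert` standing in for `GIE_query`/`GIE_update`:

```python
def assign_video_ids(records, index):
    """records: iterable of SFH strings; yields one video id per record."""
    for line_no, sfh in enumerate(records, start=1):
        hit = index.query(sfh)               # id of a near-duplicate SFH, or None
        if hit is None:
            index.insert(sfh, tag=line_no)   # new cluster: id = own line number
            yield line_no
        else:
            yield hit                        # reuse the matched cluster's id
```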
