path: root/src/dataset_build/file_digest.py
author	陈冠林 <[email protected]>	2019-06-18 10:44:20 +0800
committer	陈冠林 <[email protected]>	2019-06-18 10:44:20 +0800
commit	b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch)
tree	b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/file_digest.py
parent	b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff)
Add inc and src (HEAD, master)
Diffstat (limited to 'src/dataset_build/file_digest.py')
-rw-r--r--	src/dataset_build/file_digest.py	96
1 file changed, 96 insertions, 0 deletions
diff --git a/src/dataset_build/file_digest.py b/src/dataset_build/file_digest.py
new file mode 100644
index 0000000..590e059
--- /dev/null
+++ b/src/dataset_build/file_digest.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
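+# Filter a raw semicolon-separated dataset: keep only the lines whose
+# fields pass the validity checks below and copy them to the output file.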
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
+class data_line(object):
+    """Validation helpers for one semicolon-separated data line."""
+    def __init__(self):
+        super(data_line, self).__init__()
+
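+    # Split one record on ';' into 21 fields and validate them with the
+    # predicates in `term`; the average hashed range length of field 19
+    # must also reach 80% of the length given in field 3. Returns the
+    # field list on success, -1 otherwise.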
+    @staticmethod
+    def if_error(data_line_str):
+        data_line_val = re.split(r';', data_line_str)
+        if not term['data_num'](data_line_val):
+            return -1
+        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+        if (term['not_null'](data_line_val[0]) and
+                term['ysp_len'](data_line_val[3]) and
+                term['not_null'](data_line_val[4]) and
+                term['td_len'](data_line_val[6]) and
+                term['td_len'](data_line_val[8]) and
+                term['td_len'](data_line_val[10]) and
+                term['td_len'](data_line_val[12]) and
+                term['td_len'](data_line_val[14]) and
+                term['td_len'](data_line_val[16]) and
+                term['not_null'](data_line_val[18]) and
+                term['sfh_len'](data_line_val[19]) and
+                term['not_null'](data_line_val[20]) and
+                hashed_len / float(data_line_val[3]) >= 0.8):
+            return data_line_val
+        else:
+            return -1
+
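+# td fingerprint: wraps a td value and derives an MD5 hex digest from
+# td_string.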
+class TD_fingerprint(object):
+    def __init__(self, td, td_string):
+        self.td = td
+        self.td_string = td_string
+
+    @staticmethod
+    def td_generate(td_string):
+        return hashlib.md5(td_string.encode('utf-8')).hexdigest()
+
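+# sfh values embed hashed ranges written as "[start:end]"; get_hashed_len
+# returns the average range length, or -1 when no range is present.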
+class sfh_fingerprint(object):
+
+    def __init__(self, sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        p = r"\[+\d+?:+\d+?\]"
+        pattern = re.compile(p)
+        hashed_len_set = pattern.findall(sfh)
+        if term['not_null'](hashed_len_set):
+            hashed_len = 0
+            for chunk in hashed_len_set:
+                hashed_len_num = re.split(r"\[|\]|:", chunk)
+                hashed_len += int(hashed_len_num[2]) - int(hashed_len_num[1])
+            return hashed_len / len(hashed_len_set)
+        else:
+            return -1
+
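+# Field validation predicates, keyed by field kind ('url' and 'line_len'
+# are not used below).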
+term = {'td_len': (lambda x: len(x) == 32),
+        'data_num': (lambda x: len(x) == 21),
+        'url': (lambda x: x.find('NUll')),
+        'sfh_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100)),
+        'not_null': (lambda x: len(x) != 0),
+        'ysp_len': (lambda x: int(x) != 0),
+        'line_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100))}
+
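+# Expected grain.conf layout (a sketch: the keys come from the code
+# below, the values are illustrative):
+#   [file]
+#   raw_file_address = ./raw_data/lines.txt
+#   ripe_files_address = ./ripe_data/lines.txt
+#   [output]
+#   breakpoints = 1,2,3,4,5,6,7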
+grain="./get_lost"
+ripe_files=[]
+config = ConfigParser.RawConfigParser()
+config.read("grain.conf")
+raw_file_address=config.get("file","raw_file_address")
+ripe_files_address=config.get("file","ripe_files_address")
+print ("%s %s" %(raw_file_address,ripe_files_address))
+num = [0, 0, 0, 0, 0, 0, 0]
+breakpoints = [int(i) for i in config.get("output", "breakpoints").split(",")]
+# i=0
+# for i in xrange(0,ripe_file_num):
+# outfile=open(ripe_files_address+str(i)+'.txt','w')
+# ripe_files.append(outfile)
+
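+# Stream the raw file, print progress every 10000 lines, and write every
+# line that passes if_error to the output file.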
+i = 0
+with open(raw_file_address, 'r') as infile:
+    # with open('./ripe_data/mistake_td_sfh1_sfh2_sim_rate_len_url_unequal', 'r') as infile:
+    with open(ripe_files_address, 'w') as outfile:
+        for line in infile:
+            i += 1
+            if i % 10000 == 0:
+                print i
+            line_return = data_line.if_error(line)
+            if line_return != -1:
+                outfile.write(line)
\ No newline at end of file