path: root/src/dataset_build/file_digest.py
author	陈冠林 <[email protected]>	2019-06-18 10:44:20 +0800
committer	陈冠林 <[email protected]>	2019-06-18 10:44:20 +0800
commit	b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch)
tree	b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/file_digest.py
parent	b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff)
Add inc and src (HEAD, master)
Diffstat (limited to 'src/dataset_build/file_digest.py')
-rw-r--r--	src/dataset_build/file_digest.py	96
1 file changed, 96 insertions, 0 deletions
diff --git a/src/dataset_build/file_digest.py b/src/dataset_build/file_digest.py
new file mode 100644
index 0000000..590e059
--- /dev/null
+++ b/src/dataset_build/file_digest.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
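+# Filter a raw semicolon-separated dataset: keep only the lines whose
+# fields pass the validity checks below and copy them to the output file.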
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
+class data_line(object):
+    """Validation helpers for one semicolon-separated data line."""
+    def __init__(self):
+        super(data_line, self).__init__()
+
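+    # Split one record on ';' into 21 fields and validate them with the
+    # predicates in `term`; the average hashed range length of field 19
+    # must also reach 80% of the length given in field 3. Returns the
+    # field list on success, -1 otherwise.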
+    @staticmethod
+    def if_error(data_line_str):
+        data_line_val = re.split(r';', data_line_str)
+        if not term['data_num'](data_line_val):
+            return -1
+        hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+        if (term['not_null'](data_line_val[0]) and
+                term['ysp_len'](data_line_val[3]) and
+                term['not_null'](data_line_val[4]) and
+                term['td_len'](data_line_val[6]) and
+                term['td_len'](data_line_val[8]) and
+                term['td_len'](data_line_val[10]) and
+                term['td_len'](data_line_val[12]) and
+                term['td_len'](data_line_val[14]) and
+                term['td_len'](data_line_val[16]) and
+                term['not_null'](data_line_val[18]) and
+                term['sfh_len'](data_line_val[19]) and
+                term['not_null'](data_line_val[20]) and
+                hashed_len / float(data_line_val[3]) >= 0.8):
+            return data_line_val
+        else:
+            return -1
+
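+# td fingerprint: wraps a td value and derives an MD5 hex digest from
+# td_string.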
+class TD_fingerprint(object):
+    def __init__(self, td, td_string):
+        self.td = td
+        self.td_string = td_string
+
+    @staticmethod
+    def td_generate(td_string):
+        return hashlib.md5(td_string.encode('utf-8')).hexdigest()
+
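+# sfh values embed hashed ranges written as "[start:end]"; get_hashed_len
+# returns the average range length, or -1 when no range is present.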
+class sfh_fingerprint(object):
+
+    def __init__(self, sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        p = r"\[+\d+?:+\d+?\]"
+        pattern = re.compile(p)
+        hashed_len_set = pattern.findall(sfh)
+        if term['not_null'](hashed_len_set):
+            hashed_len = 0
+            for chunk in hashed_len_set:
+                hashed_len_num = re.split(r"\[|\]|:", chunk)
+                hashed_len += int(hashed_len_num[2]) - int(hashed_len_num[1])
+            return hashed_len / len(hashed_len_set)
+        else:
+            return -1
+
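+# Field validation predicates, keyed by field kind ('url' and 'line_len'
+# are not used below).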
+term = {'td_len': (lambda x: len(x) == 32),
+        'data_num': (lambda x: len(x) == 21),
+        'url': (lambda x: x.find('NUll')),
+        'sfh_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100)),
+        'not_null': (lambda x: len(x) != 0),
+        'ysp_len': (lambda x: int(x) != 0),
+        'line_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100))}
+
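+# Expected grain.conf layout (a sketch: the keys come from the code
+# below, the values are illustrative):
+#   [file]
+#   raw_file_address = ./raw_data/lines.txt
+#   ripe_files_address = ./ripe_data/lines.txt
+#   [output]
+#   breakpoints = 1,2,3,4,5,6,7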
+grain="./get_lost"
+ripe_files=[]
+config = ConfigParser.RawConfigParser()
+config.read("grain.conf")
+raw_file_address=config.get("file","raw_file_address")
+ripe_files_address=config.get("file","ripe_files_address")
+print ("%s %s" %(raw_file_address,ripe_files_address))
+num = [0, 0, 0, 0, 0, 0, 0]
+breakpoints = [int(i) for i in config.get("output", "breakpoints").split(",")]
+# i=0
+# for i in xrange(0,ripe_file_num):
+# outfile=open(ripe_files_address+str(i)+'.txt','w')
+# ripe_files.append(outfile)
+
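+# Stream the raw file, print progress every 10000 lines, and write every
+# line that passes if_error to the output file.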
+i = 0
+with open(raw_file_address, 'r') as infile:
+    # with open('./ripe_data/mistake_td_sfh1_sfh2_sim_rate_len_url_unequal', 'r') as infile:
+    with open(ripe_files_address, 'w') as outfile:
+        for line in infile:
+            i += 1
+            if i % 10000 == 0:
+                print i
+            line_return = data_line.if_error(line)
+            if line_return != -1:
+                outfile.write(line)
\ No newline at end of file