diff options
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
|---|---|---|
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/file_digest.py | |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'src/file_digest.py')
| -rw-r--r-- | src/file_digest.py | 104 |
1 files changed, 104 insertions, 0 deletions
# -*- coding: utf-8 -*-
# src/file_digest.py (new file, mode 100644, blob 3703794)
"""Filter semicolon-separated digest records.

Run as a script, this module reads the input and output paths from the
``[file]`` section of ``file_digest.conf`` and copies every input line that
passes :meth:`data_line.if_error` to the output file unchanged.
"""
import re
import random
import bisect
import os
import hashlib

try:  # Python 2 module name
    import ConfigParser
except ImportError:  # Python 3 renamed the module
    import configparser as ConfigParser

try:  # Python 2 only; not used below but kept for existing users of the file
    import commands
except ImportError:
    commands = None


# Record layout used by data_line.if_error (indices into the ';'-split line).
_FIELD_COUNT_INDEXES_NOT_NULL = (0, 1, 2, 4, 5, 18, 20)  # must be non-empty
_TD_FIELDS = slice(6, 18)   # twelve fields that must be 32 characters long
_YSP_LEN_FIELD = 3          # integer field, must be non-zero
_SFH_FIELD = 19             # string carrying the "[start:end]" ranges

# One "[start:end]" range inside an sfh string; both bounds are decimal digits.
_HASHED_RANGE = re.compile(r"\[(\d+):(\d+)\]")


class data_line(object):
    """Validation of one ';'-separated record line."""

    def __init__(self):
        super(data_line, self).__init__()

    @staticmethod
    def if_error(data_line_str):
        """Validate one raw record line.

        Args:
            data_line_str: the line as read from the input file.  The line
                terminator is not stripped, so it is part of the last field.
                NOTE(review): this means an otherwise empty last field still
                passes the non-empty check -- confirm that is intended.

        Returns:
            The list of 21 fields when every check passes, otherwise ``-1``.
            Malformed lines (too few fields, non-numeric field 3) yield
            ``-1`` instead of raising.
        """
        fields = data_line_str.split(';')

        # The field count must be verified first: everything below indexes
        # into ``fields``.
        if not term['data_num'](fields):
            return -1
        if not all(term['not_null'](fields[i])
                   for i in _FIELD_COUNT_INDEXES_NOT_NULL):
            return -1
        if not all(term['td_len'](value) for value in fields[_TD_FIELDS]):
            return -1
        if not term['sfh_len'](fields[_SFH_FIELD]):
            return -1

        try:
            if not term['ysp_len'](fields[_YSP_LEN_FIELD]):
                return -1
            ysp_len = float(fields[_YSP_LEN_FIELD])
        except ValueError:
            # Field 3 is not a number.
            return -1

        hashed_len = sfh_fingerprint.get_hashed_len(fields[_SFH_FIELD])
        # A negative value is the "no ranges" sentinel (or an inverted
        # range); without this guard a negative field 3 would turn it into
        # a positive ratio and the line would be accepted.
        if hashed_len < 0:
            return -1

        if hashed_len / ysp_len > 0.999:
            return fields
        return -1


class TD_fingerprint(object):
    """Holder for a td value and the string it was derived from."""

    def __init__(self, td=None, td_string=None):
        self.td = td
        self.td_string = td_string

    @staticmethod
    def td_generate(td_string):
        """Return the hexadecimal MD5 digest of ``td_string``.

        Text is encoded as UTF-8 before hashing; byte strings are hashed
        as they are.
        """
        if not isinstance(td_string, bytes):
            td_string = td_string.encode('utf-8')
        return hashlib.md5(td_string).hexdigest()


class sfh_fingerprint(object):
    """Wrapper around an sfh string."""

    def __init__(self, sfh):
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the average length of the ``[start:end]`` ranges in ``sfh``.

        The length of a range is ``end - start``; the average is floored to
        an integer.  Returns ``-1`` when ``sfh`` contains no range.
        """
        spans = _HASHED_RANGE.findall(sfh)
        if not spans:
            return -1
        total = sum(int(end) - int(start) for start, end in spans)
        # Floor division keeps the integer result the Python 2 "/" produced.
        return total // len(spans)


# Named predicates used by the validators above.
term = {
    'td_len': (lambda x: len(x) == 32),
    'data_num': (lambda x: len(x) == 21),
    # NOTE(review): returns the index from str.find (-1 when absent), not a
    # boolean; not used anywhere in this file -- confirm intended meaning.
    'url': (lambda x: x.find('NUll')),
    'sfh_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
    'not_null': (lambda x: len(x) != 0),
    'ysp_len': (lambda x: int(x) != 0),
    'line_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
}

c_func = "./"
ripe_files = []


def main():
    """Copy the valid lines of the configured raw file to the output file."""
    config = ConfigParser.RawConfigParser()
    config.read("file_digest.conf")
    raw_file_address = config.get("file", "raw_file_address")
    ripe_files_address = config.get("file", "ripe_files_address")
    print("%s %s" % (raw_file_address, ripe_files_address))

    # Disabled code that opened one numbered output file per index and
    # collected the handles in ripe_files:
    # num = [0,0,0,0,0,0,0]
    # breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
    # i=0
    # for i in xrange(0,ripe_file_num):
    #     outfile=open(ripe_files_address+str(i)+'.txt','w')
    #     ripe_files.append(outfile)

    with open(raw_file_address, 'r') as infile, \
            open(ripe_files_address, 'w') as outfile:
        for line_no, line in enumerate(infile, 1):
            if line_no % 10000 == 0:
                print(line_no)  # progress indicator
            if data_line.if_error(line) != -1:
                outfile.write(line)


if __name__ == "__main__":
    main()
\ No newline at end of file |
