# -*- coding: utf-8 -*-
"""Filter a semicolon-delimited digest file, keeping only well-formed lines.

When run as a script, the input and output paths are read from the
``[file]`` section of ``file_digest.conf`` (options ``raw_file_address``
and ``ripe_files_address``).  Every input line accepted by
``data_line.if_error`` is copied unchanged to the output file.
"""
import re
import random
import bisect
import os
import hashlib

try:  # Python 2 module names, as originally imported by this file
    import ConfigParser
    import commands
except ImportError:  # Python 3: same names bound to the successor modules
    import configparser as ConfigParser
    # subprocess provides getoutput()/getstatusoutput(), which lived in
    # the Python 2 ``commands`` module.
    import subprocess as commands


class data_line(object):
    """Validation of one semicolon-delimited record line."""

    # (field index, name of the check in ``term``), applied in this order.
    _FIELD_CHECKS = (
        ((0, 'not_null'), (1, 'not_null'), (2, 'not_null'),
         (3, 'ysp_len'), (4, 'not_null'), (5, 'not_null'))
        + tuple((index, 'td_len') for index in range(6, 18))
        + ((18, 'not_null'), (19, 'sfh_len'), (20, 'not_null'))
    )

    def __init__(self):
        super(data_line, self).__init__()

    @staticmethod
    def if_error(data_line_str):
        """Validate a raw record line.

        The line is split on ``;`` and must yield exactly 21 fields that
        all pass their per-field check from ``term``.  In addition, the
        value returned by ``sfh_fingerprint.get_hashed_len`` for field 19,
        divided by the numeric value of field 3, must exceed 0.999.

        Args:
            data_line_str: one line of input text.

        Returns:
            The list of split fields when the line is valid, otherwise -1.
        """
        # NOTE(review): the script passes lines with their trailing newline,
        # so field 20 is non-empty even when it holds only "\n" -- confirm
        # that this is intended.
        fields = re.split(r';', data_line_str)

        # Check the field count first so the indexed accesses below cannot
        # raise IndexError on a short line.
        if not term['data_num'](fields):
            return -1

        try:
            fields_ok = all(term[name](fields[index])
                            for index, name in data_line._FIELD_CHECKS)
        except ValueError:
            # 'ysp_len' calls int(); a non-numeric field 3 means the line is
            # malformed, so reject it instead of aborting the whole run.
            return -1
        if not fields_ok:
            return -1

        hashed_len = sfh_fingerprint.get_hashed_len(fields[19])
        # Field 3 is known to be a non-zero integer here ('ysp_len' passed).
        if hashed_len / float(fields[3]) > 0.999:
            return fields
        return -1


class TD_fingerprint(object):
    """Holder for a TD value and the string it relates to."""

    def __init__(self, td=None, td_string=None):
        self.td = td
        self.td_string = td_string

    @staticmethod
    def td_generate(td_string):
        """Return the MD5 hex digest (32 characters) of ``td_string``.

        The string is encoded as UTF-8 before hashing.
        """
        return hashlib.md5(td_string.encode('utf-8')).hexdigest()


class sfh_fingerprint(object):
    """Wrapper around an SFH string containing ``[start:end]`` markers."""

    # Compiled once: a "[start:end]" marker and the digit runs inside it.
    _SEGMENT_RE = re.compile(r"\[+\d+?:+\d+?\]")
    _NUMBER_RE = re.compile(r"\d+")

    def __init__(self, sfh):
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the mean span of the ``[start:end]`` markers in ``sfh``.

        Each marker contributes ``end - start``; the sum is divided by the
        number of markers using integer (floor) division.

        Args:
            sfh: the SFH string to scan.

        Returns:
            The integer mean span, or -1 when ``sfh`` contains no marker.
        """
        segments = sfh_fingerprint._SEGMENT_RE.findall(sfh)
        if not segments:
            return -1

        total = 0
        for segment in segments:
            # A matched marker always holds exactly two digit runs, even
            # when brackets or colons are repeated (e.g. "[[1::2]").
            start, end = sfh_fingerprint._NUMBER_RE.findall(segment)
            total += int(end) - int(start)
        # Floor division keeps the integer result Python 2's "/" produced.
        return total // len(segments)


# Named predicates used to validate a split line and its individual fields.
term = {
    'td_len': (lambda x: len(x) == 32),
    'data_num': (lambda x: len(x) == 21),
    # Returns str.find's result (index of 'NUll', or -1 when absent), not a
    # clean boolean.  NOTE(review): unused in this file -- confirm intent.
    'url': (lambda x: x.find('NUll')),
    'sfh_len': (lambda x: len(x) > 20 and len(x) < (10*1024-100)),
    'not_null': (lambda x: len(x) != 0),
    'ysp_len': (lambda x: int(x) != 0),
    'line_len': (lambda x: len(x) > 20 and len(x) < (10*1024-100)),
}

c_func = "./"
ripe_files = []


def main():
    """Copy every valid line of the configured input file to the output."""
    config = ConfigParser.RawConfigParser()
    config.read("file_digest.conf")
    raw_file_address = config.get("file", "raw_file_address")
    ripe_files_address = config.get("file", "ripe_files_address")
    print("%s %s" % (raw_file_address, ripe_files_address))

    # Disabled code kept from the original file (it would open one numbered
    # output file per index and collect the handles in ripe_files):
    # num = [0,0,0,0,0,0,0]
    # breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
    # i=0
    # for i in xrange(0,ripe_file_num):
    #     outfile=open(ripe_files_address+str(i)+'.txt','w')
    #     ripe_files.append(outfile)

    with open(raw_file_address, 'r') as infile:
        with open(ripe_files_address, 'w') as outfile:
            for i, line in enumerate(infile, 1):
                if i % 10000 == 0:
                    print(i)  # progress indicator every 10000 lines
                line_return = data_line.if_error(line)
                if line_return != -1:
                    outfile.write(str(line))


if __name__ == "__main__":
    main()