1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#-*-coding:utf-8-*-
import re
import random
import ConfigParser
import bisect
import commands
import os
import hashlib
class data_line(object):
    """Validator for one semicolon-separated record line.

    A record is accepted only when it splits into the number of fields
    required by ``term['data_num']`` and every field passes the predicate
    from the module-level ``term`` table that is assigned to it below.
    """

    # Field indexes grouped by the ``term`` predicate applied to them.
    _NOT_NULL_FIELDS = (0, 1, 2, 4, 5, 18, 20)
    _TD_FIELDS = tuple(range(6, 18))
    _YSP_FIELD = 3
    _SFH_FIELD = 19
    # hashed length / numeric value of field 3 must exceed this ratio.
    _MIN_HASHED_RATIO = 0.999

    def __init__(self):
        # The original passed an undefined name (``ClassName``) to super(),
        # which raised NameError on every instantiation.
        super(data_line, self).__init__()

    @staticmethod
    def if_error(data_line_str):
        """Validate one raw record line.

        Args:
            data_line_str: the raw line, fields separated by ``;``.

        Returns:
            The list of split fields when every check passes, otherwise
            ``-1``.  Malformed lines (wrong field count, non-numeric
            field 3, no ``[start:end]`` range in field 19) yield ``-1``
            instead of raising.
        """
        fields = re.split(r';', data_line_str)

        # Check the field count first so the indexed accesses below cannot
        # raise IndexError on a truncated line.
        if not term['data_num'](fields):
            return -1

        # NOTE(review): when the line still carries its trailing newline,
        # field 20 contains at least "\n" and so always passes 'not_null';
        # confirm whether the caller is expected to strip the line first.
        if not all(term['not_null'](fields[idx])
                   for idx in data_line._NOT_NULL_FIELDS):
            return -1
        if not all(term['td_len'](fields[idx])
                   for idx in data_line._TD_FIELDS):
            return -1
        if not term['sfh_len'](fields[data_line._SFH_FIELD]):
            return -1

        try:
            # 'ysp_len' converts the field with int(); a non-numeric value
            # marks the record as invalid rather than aborting the run.
            if not term['ysp_len'](fields[data_line._YSP_FIELD]):
                return -1
            denominator = float(fields[data_line._YSP_FIELD])
        except ValueError:
            return -1

        hashed_len = sfh_fingerprint.get_hashed_len(
            fields[data_line._SFH_FIELD])
        # get_hashed_len() reports "no ranges found" as -1; never let that
        # sentinel (or any negative length) take part in the ratio test.
        if hashed_len < 0:
            return -1

        if hashed_len / denominator > data_line._MIN_HASHED_RATIO:
            return fields
        return -1
class TD_fingerprint(object):
    """Pairs a ``td`` value with the ``td_string`` stored alongside it."""

    def __init__(self, td=None, td_string=None):
        """Store both values as attributes.

        The original signature took no parameters (not even ``self``), so
        it could never be called; both arguments are optional here.

        Args:
            td: value stored as ``self.td``.
            td_string: value stored as ``self.td_string``.
        """
        self.td = td
        self.td_string = td_string

    @staticmethod
    def td_generate(td_string):
        """Return the hexadecimal MD5 digest of ``td_string``.

        Args:
            td_string: text (encoded as UTF-8 before hashing) or a byte
                string (hashed as-is).

        Returns:
            The 32-character lowercase hex digest.
        """
        # Only encode real text: calling .encode() on a byte string would
        # trigger an implicit ASCII decode on Python 2 and fail on Python 3.
        if not isinstance(td_string, bytes):
            td_string = td_string.encode('utf-8')
        return hashlib.md5(td_string).hexdigest()
class sfh_fingerprint(object):
    """Holds an SFH string and parses the ``[start:end]`` ranges inside it."""

    # Same pattern as before, with the two numbers captured so they can be
    # read directly instead of re-splitting the match by position (which
    # broke on the repeated '[' or ':' that this pattern also accepts).
    _RANGE_RE = re.compile(r"\[+(\d+?):+(\d+?)\]")

    def __init__(self, sfh):
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the mean span of the ``[start:end]`` ranges in ``sfh``.

        Args:
            sfh: string that may contain ``[start:end]`` ranges.

        Returns:
            The sum of ``end - start`` over all ranges, floor-divided by
            the number of ranges; ``-1`` when ``sfh`` contains no range.
        """
        ranges = sfh_fingerprint._RANGE_RE.findall(sfh)
        if not ranges:
            return -1
        span_total = sum(int(end) - int(start) for start, end in ranges)
        # Explicit floor division keeps the integer result the original
        # produced with '/' on Python 2 ints.
        return span_total // len(ranges)
# Validation predicates looked up by name.  Each takes a single value and
# returns a truthy result when that value is acceptable.
term = {
    # Exactly 32 characters long.
    'td_len': (lambda x: len(x) == 32),
    # A split record must have exactly 21 fields.
    'data_num': (lambda x: len(x) == 21),
    # The original subscripted the method (x.find['NUll']) and raised
    # TypeError whenever it was called.  This keeps the truthiness that
    # x.find('NUll') would have had: falsy only when the value starts with
    # the marker.
    # NOTE(review): the marker spelling 'NUll' is kept verbatim; confirm
    # both the spelling and the intended polarity.
    'url': (lambda x: not x.startswith('NUll')),
    # Length strictly between 20 and 10*1024-100.
    'sfh_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
    'not_null': (lambda x: len(x) != 0),
    # Parses as a non-zero integer (raises ValueError when not numeric).
    'ysp_len': (lambda x: int(x) != 0),
    'line_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
}
# Script body: read the input/output paths from file_digest.conf, then copy
# every line of the raw file that data_line.if_error() accepts into the
# output file, printing the running line count every 10000 lines.
c_func = "./"
ripe_files = []

config = ConfigParser.RawConfigParser()
config.read("file_digest.conf")
raw_file_address = config.get("file", "raw_file_address")
ripe_files_address = config.get("file", "ripe_files_address")
print("%s %s" % (raw_file_address, ripe_files_address))

# Disabled code, kept as found:
# num = [0,0,0,0,0,0,0]
# breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
# i=0
# for i in xrange(0,ripe_file_num):
# outfile=open(ripe_files_address+str(i)+'.txt','w')
# ripe_files.append(outfile)

i = 0  # stays 0 when the input file is empty
with open(raw_file_address, 'r') as infile, \
        open(ripe_files_address, 'w') as outfile:
    for i, line in enumerate(infile, 1):
        if i % 10000 == 0:
            print(i)
        line_return = data_line.if_error(line)
        if line_return == -1:
            # Rejected record: do not copy it to the output.
            continue
        outfile.write(str(line))
|