src/file_digest.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

#-*-coding:utf-8-*-
import re
import random
import ConfigParser
import bisect
import commands
import os
import hashlib

class data_line(object):
	"""Validation helpers for one ';'-separated record of the input file."""

	def __init__(self):
		super(data_line, self).__init__()

	@staticmethod
	def if_error(data_line_str):
		"""Split a raw record on ';' and validate every field.

		A record is accepted when all of the following hold:
		  * it has exactly 21 fields (``term['data_num']``);
		  * fields 0, 1, 2, 4, 5, 18 and 20 are non-empty;
		  * field 3 parses as a non-zero integer (``term['ysp_len']``);
		  * fields 6 through 17 are each 32 characters long;
		  * field 19 satisfies the ``term['sfh_len']`` length bounds;
		  * ``sfh_fingerprint.get_hashed_len(field 19)`` divided by
		    field 3 is greater than 0.999.

		Args:
			data_line_str: one record as a string.

		Returns:
			The list of split fields when the record is valid, otherwise -1.
		"""
		data_line_val = re.split(r';', data_line_str)

		# Check the field count before touching any index; indexing field 19
		# first raises IndexError on records with fewer than 20 fields.
		if not term['data_num'](data_line_val):
			return -1

		# NOTE(review): when the caller passes a line that still ends with a
		# newline, field 20 can never be empty, so its not_null check cannot
		# fail -- confirm whether the line should be stripped first.
		non_empty_fields = (0, 1, 2, 4, 5, 18, 20)
		try:
			fields_ok = (
				all(term['not_null'](data_line_val[k]) for k in non_empty_fields)
				and term['ysp_len'](data_line_val[3])
				and all(term['td_len'](field) for field in data_line_val[6:18])
				and term['sfh_len'](data_line_val[19])
			)
			if not fields_ok:
				return -1
			hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
			# ysp_len above guarantees field 3 is a non-zero integer, so the
			# division cannot fail; get_hashed_len's -1 yields a negative ratio.
			if hashed_len / float(data_line_val[3]) > 0.999:
				return data_line_val
		except ValueError:
			# A non-numeric field 3 is a malformed record, not a fatal error.
			return -1
		return -1

class TD_fingerprint(object):
	"""Container for a ``td`` value and the string it belongs to.

	``td_generate`` computes the MD5 hex digest of a string.
	"""

	def __init__(self, td=None, td_string=None):
		# Both arguments are optional so that any zero-argument construction
		# keeps working; the attributes simply store what was passed in.
		self.td = td
		self.td_string = td_string

	@staticmethod
	def td_generate(td_string):
		"""Return the 32-character hexadecimal MD5 digest of ``td_string``.

		Args:
			td_string: text or byte string to hash. Text is encoded as UTF-8
				first; byte strings are hashed unchanged.

		Returns:
			The lowercase hex digest as a string.
		"""
		if not isinstance(td_string, bytes):
			td_string = td_string.encode('utf-8')
		return hashlib.md5(td_string).hexdigest()

class sfh_fingerprint(object):
	"""Holds an SFH string and parses the ``[start:end]`` markers inside it."""

	# One ``[start:end]`` marker. The pattern tolerates repeated '[' or ':'
	# characters; the capture groups pick out the two numbers directly so
	# such repeats cannot shift the parsed positions.
	_SEGMENT_RE = re.compile(r"\[+(\d+?):+(\d+?)\]")

	def __init__(self, sfh):
		self.sfh = sfh

	@staticmethod
	def get_hashed_len(sfh):
		"""Return the average ``end - start`` over all markers in ``sfh``.

		Args:
			sfh: string that may contain ``[start:end]`` markers.

		Returns:
			The sum of ``end - start`` over every marker, floor-divided by
			the number of markers, or -1 when ``sfh`` contains no marker.
		"""
		segments = sfh_fingerprint._SEGMENT_RE.findall(sfh)
		if not segments:
			return -1
		total = sum(int(end) - int(start) for start, end in segments)
		# Floor division keeps the Python 2 integer-division result explicit.
		return total // len(segments)

# Named predicates used to validate individual fields of an input record.
term = {
	# Exactly 32 characters (the length of an MD5 hex digest).
	'td_len': (lambda x: len(x) == 32),
	# A split record must have exactly 21 fields.
	'data_num': (lambda x: len(x) == 21),
	# Returns the index of 'NUll' in x, or -1 when absent.
	# NOTE(review): this was a subscript on the bound method, which raised
	# TypeError on every call. It is now a plain call; the literal spelling
	# is kept, and the intended truth value (absent? present?) is not
	# evident from the code -- confirm before using it as a predicate.
	'url': (lambda x: x.find('NUll')),
	# Length strictly between 20 and 10*1024-100.
	'sfh_len': (lambda x: 20 < len(x) < (10*1024-100)),
	# Non-empty string or sequence.
	'not_null': (lambda x: len(x) != 0),
	# Parses as an integer other than zero (raises ValueError otherwise).
	'ysp_len': (lambda x: int(x) != 0),
	# Same bounds as 'sfh_len'.
	'line_len': (lambda x: 20 < len(x) < (10*1024-100)),
}

# Neither of these two names is referenced by the code visible in this file.
c_func = "./"
ripe_files = []

# Input and output paths are read from the [file] section of file_digest.conf.
config = ConfigParser.RawConfigParser()
config.read("file_digest.conf")
raw_file_address = config.get("file", "raw_file_address")
ripe_files_address = config.get("file", "ripe_files_address")
print("%s %s" % (raw_file_address, ripe_files_address))

# NOTE: a disabled variant used to read "breakpoints" from the [output]
# section and open several numbered '.txt' output files into ripe_files.

# Copy every record that passes data_line.if_error from the raw file to the
# output file, printing the running line count every 10000 lines.
i = 0
with open(raw_file_address, 'r') as infile:
	with open(ripe_files_address, 'w') as outfile:
		for i, line in enumerate(infile, 1):
			if i % 10000 == 0:
				print(i)
			line_return = data_line.if_error(line)
			if line_return == -1:
				# Rejected record: leave it out of the output.
				continue
			outfile.write(line)