# path: root/src/dataset_build/based_sfh.py
# blob: b3281ce789d30ee0e1e1091c8579c60728037042 (plain)
import re
import ConfigParser
import bisect
import random

# Registry of named predicates applied to parsed values.
# 'not_null' is true for any sized value (string, list, ...) with at least one element.
term = dict(not_null=lambda x: len(x) > 0)
	
# Read the input and output file locations from the [file] section of
# based_sfh.conf. The relative path means the file is looked up in the
# current working directory; a missing file surfaces as an error from
# config.get() below, because read() itself ignores unreadable files.
config = ConfigParser.RawConfigParser()
config.read("based_sfh.conf")
raw_file_address, ripe_file_address = tuple(
	config.get("file", option)
	for option in ("raw_file_address", "ripe_file_address")
)

class sfh_fingerprint(object):
	"""Holds one SFH string and provides helpers for parsing it."""

	# One bracketed offset span such as "[0:4096]". Repeated '[' or ':' are
	# tolerated, and the two numbers are captured so they can be read
	# directly instead of re-splitting the matched text.
	_SPAN_PATTERN = re.compile(r"\[+(\d+):+(\d+)\]")

	def __init__(self,sfh):
		"""Store the raw SFH string on the instance."""
		self.sfh = sfh

	@staticmethod
	def get_hashed_len(sfh):
		"""Return the mean length of the "[start:end]" spans found in *sfh*.

		Every span contributes ``end - start``; the sum is divided by the
		number of spans using integer (floor) division, so the result is
		always an int.

		Returns -1 when *sfh* contains no span at all.
		"""
		spans = sfh_fingerprint._SPAN_PATTERN.findall(sfh)
		if not spans:
			return -1
		covered = sum(int(end) - int(start) for start, end in spans)
		# '//' keeps the integer result on both Python 2 and Python 3.
		return covered // len(spans)

# Filter pass: copy field 19 of every qualifying record from the raw file
# to the ripe file, one value per output line.

# A record qualifies when the mean hashed span length of field 19 exceeds
# this fraction of the integer stored in field 3.
# NOTE(review): field 3 is presumably the total content length and field 19
# the SFH string -- confirm against the raw file schema.
MIN_HASHED_RATIO = 0.8

i = 0  # stays 0 when the input file is empty
sfh_set = list()  # never filled in this script; kept so the module-level name still exists
with open(raw_file_address,'r') as infile:
	with open(ripe_file_address,'w') as outfile:
		for i, line in enumerate(infile, 1):
			if i % 100000 == 0:
				# Progress indicator for large inputs.
				print(i)
			result = line.split(';')
			if len(result) < 20:
				# Blank or truncated line: fields 3 and 19 are not both present.
				continue
			if not (result[3] and result[19]):
				continue
			# A non-numeric field 3 still raises ValueError, as before.
			total_len = int(result[3])
			if total_len <= 0:
				# A zero length would divide by zero; no ratio can be computed.
				continue
			hashed_len = sfh_fingerprint.get_hashed_len(result[19])
			# float() forces true division. With Python 2 integer division the
			# ratio was floored to a whole number, so the fractional threshold
			# could never be applied. A -1 "no spans" result gives a negative
			# ratio and is rejected.
			if float(hashed_len) / total_len > MIN_HASHED_RATIO:
				# NOTE(review): if field 19 is the last field of the line it
				# still carries the line's own newline -- confirm field count.
				outfile.write(result[19]+'\n')