diff options
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
|---|---|---|
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/based_sfh.py | |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'src/dataset_build/based_sfh.py')
| -rw-r--r-- | src/dataset_build/based_sfh.py | 44 |
1 files changed, 44 insertions, 0 deletions
# Reconstructed from: diff --git a/src/dataset_build/based_sfh.py
#                                b/src/dataset_build/based_sfh.py
# (new file mode 100644, index 0000000..b3281ce, @@ -0,0 +1,44 @@)
"""Filter a ';'-separated raw dump down to the SFH strings worth keeping.

The paths of the input ("raw") and output ("ripe") files are read from the
``[file]`` section of ``based_sfh.conf``.  For every input record, the SFH
string in field 19 is written to the output file when its average hashed
segment length, divided by the integer in field 3, exceeds 0.8.

NOTE(review): field 3 is presumably the total length of the hashed object;
confirm against the schema of the raw file.

The module runs unchanged on Python 2.7 and Python 3.
"""
import re
try:
    import ConfigParser  # Python 2 name
except ImportError:
    import configparser as ConfigParser  # Python 3 renamed the module
import bisect
import random

# Named predicates kept from the original script.
term = {'not_null': (lambda x: len(x) != 0)}

LENGTH_FIELD = 3          # index of the integer the hashed length is compared to
SFH_FIELD = 19            # index of the SFH string
MIN_HASHED_RATIO = 0.8    # a record is kept only when its ratio exceeds this
PROGRESS_EVERY = 100000   # print the line counter every this many lines

# Same match set as the original r"\[+\d+?:+\d+?\]", but the two numbers are
# captured directly so that repeated '[' or ':' cannot shift them out of place.
_SEGMENT_RE = re.compile(r"\[+(\d+):+(\d+)\]")


class sfh_fingerprint(object):
    """Thin wrapper around an SFH string."""

    def __init__(self, sfh):
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the mean ``end - start`` over all ``[start:end]`` segments.

        Args:
            sfh: SFH string that may contain ``[start:end]`` markers.

        Returns:
            The mean segment length as a float, or ``-1`` when ``sfh``
            contains no ``[start:end]`` marker at all.
        """
        segments = _SEGMENT_RE.findall(sfh)
        if not segments:
            return -1
        covered = sum(int(end) - int(start) for start, end in segments)
        # Explicit float: Python 2 '/' on two ints would silently truncate.
        return float(covered) / len(segments)


def select_sfh(line, min_ratio=MIN_HASHED_RATIO):
    """Return the SFH of one raw record if it is sufficiently hashed.

    Args:
        line: One ';'-separated record, with or without its line ending.
        min_ratio: The hashed-length / length ratio must be strictly greater
            than this value for the record to be selected.

    Returns:
        The SFH string (field 19), or ``None`` when the record has fewer
        than 20 fields, an empty length or SFH field, a non-positive length,
        or a ratio that does not exceed ``min_ratio``.

    Raises:
        ValueError: If the length field is non-empty but not an integer.
    """
    # Drop the line ending first so it cannot leak into the last field.
    fields = line.rstrip('\r\n').split(';')
    if len(fields) <= SFH_FIELD:
        return None

    length_text = fields[LENGTH_FIELD]
    sfh = fields[SFH_FIELD]
    if not (term['not_null'](length_text) and term['not_null'](sfh)):
        return None

    total_len = int(length_text)
    if total_len <= 0:
        # Avoids ZeroDivisionError; such a record can never pass the ratio.
        return None

    ratio = sfh_fingerprint.get_hashed_len(sfh) / float(total_len)
    return sfh if ratio > min_ratio else None


def main(conf_path="based_sfh.conf"):
    """Read the file locations from ``conf_path`` and run the filter."""
    config = ConfigParser.RawConfigParser()
    config.read(conf_path)
    raw_file_address = config.get("file", "raw_file_address")
    ripe_file_address = config.get("file", "ripe_file_address")

    with open(raw_file_address, 'r') as infile, \
            open(ripe_file_address, 'w') as outfile:
        for line_no, line in enumerate(infile, 1):
            if line_no % PROGRESS_EVERY == 0:
                print(line_no)
            sfh = select_sfh(line)
            if sfh is not None:
                outfile.write(sfh + '\n')


if __name__ == "__main__":
    main()
\ No newline at end of file |
