summary refs log tree commit diff
path: root/src/dataset_build/based_sfh.py
diff options
context:
space:
mode:
author 陈冠林 <[email protected]> 2019-06-18 10:44:20 +0800
committer 陈冠林 <[email protected]> 2019-06-18 10:44:20 +0800
commit b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch)
tree b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/based_sfh.py
parent b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff)
添加inc和src HEAD master
Diffstat (limited to 'src/dataset_build/based_sfh.py')
-rw-r--r-- src/dataset_build/based_sfh.py 44
1 files changed, 44 insertions, 0 deletions
diff --git a/src/dataset_build/based_sfh.py b/src/dataset_build/based_sfh.py
new file mode 100644
index 0000000..b3281ce
--- /dev/null
+++ b/src/dataset_build/based_sfh.py
@@ -0,0 +1,44 @@
+import re
+import ConfigParser
+import bisect
+import random
+
+term = {'not_null':(lambda x : len(x)!=0)}  # predicate table: 'not_null' is true when len(x) is non-zero
+# Both file paths are read from the [file] section of based_sfh.conf (a relative path).
+config = ConfigParser.RawConfigParser()
+config.read("based_sfh.conf")
+raw_file_address = config.get("file","raw_file_address")
+ripe_file_address = config.get("file","ripe_file_address")
+
+class sfh_fingerprint(object):
+    """Holds one SFH string; get_hashed_len() is static and needs no instance."""
+    def __init__(self,sfh):
+        self.sfh = sfh
+
+    @staticmethod
+    def get_hashed_len(sfh):
+        """Return the mean (end - start) over all "[start:end]" spans in sfh,
+        or -1 when there is none ("/" floors the mean under Python 2)."""
+        # Capture groups hand back the two numbers directly; splitting the match
+        # raised ValueError on a doubled "[" or ":" because the fields shifted.
+        spans = re.findall(r"\[+(\d+):+(\d+)\]", sfh)
+        if not spans:
+            return -1
+        total = 0
+        for start, end in spans:
+            total += int(end) - int(start)
+        return total/len(spans)
+
+# Keep the SFH (field 19) of each ';'-separated record whose mean hashed
+# span is more than 0.8 times the integer in field 3.
+with open(raw_file_address,'r') as infile, open(ripe_file_address,'w') as outfile:
+    for i, line in enumerate(infile, 1):
+        if i%100000==0:
+            print i  # progress marker every 100000 input lines
+        result = line.split(';')
+        if len(result)<=19 or not result[3] or not result[19]:
+            continue  # short or incomplete record: nothing to compare
+        hashed_len = sfh_fingerprint.get_hashed_len(result[19])
+        # float(): Python 2 "/" floors int/int, so the ratio could never land between 0.8 and 1.
+        if int(result[3])>0 and float(hashed_len)/int(result[3])>0.8:
+            outfile.write(result[19]+'\n')
\ No newline at end of file