summary refs log tree commit diff
path: root/src/file_digest.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/file_digest.py')
-rw-r--r-- src/file_digest.py 104
1 files changed, 104 insertions, 0 deletions
diff --git a/src/file_digest.py b/src/file_digest.py
new file mode 100644
index 0000000..3703794
--- /dev/null
+++ b/src/file_digest.py
@@ -0,0 +1,104 @@
+#-*-coding:utf-8-*-
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
class data_line(object):
    """Validator for one ';'-separated line of the raw input file."""

    # (predicate name in ``term``, field indexes it applies to), listed in
    # the order in which the checks are evaluated.
    _FIELD_CHECKS = (
        ('not_null', (0, 1, 2)),
        ('ysp_len', (3,)),
        ('not_null', (4, 5)),
        ('td_len', tuple(range(6, 18))),
        ('not_null', (18,)),
        ('sfh_len', (19,)),
        ('not_null', (20,)),
    )

    def __init__(self):
        super(data_line, self).__init__()

    @staticmethod
    def if_error(data_line_str):
        """Split a raw line on ';' and validate its fields.

        The line is accepted when all of the following hold:

        * it has the number of fields required by ``term['data_num']``;
        * every field passes the predicate assigned to it in
          ``_FIELD_CHECKS``;
        * the value returned by ``sfh_fingerprint.get_hashed_len`` for
          field 19, divided by the numeric value of field 3, is greater
          than 0.999.

        The line is not stripped before splitting, so a trailing newline
        is part of the last field.

        Args:
            data_line_str: one line of text from the raw file.

        Returns:
            The list of fields when the line is valid, otherwise ``-1``.
        """
        data_line_val = re.split(r';', data_line_str)

        # The field count must be verified before any field is indexed;
        # otherwise a short line raises IndexError instead of being rejected.
        if not term['data_num'](data_line_val):
            return -1

        try:
            for check_name, indexes in data_line._FIELD_CHECKS:
                check = term[check_name]
                for index in indexes:
                    if not check(data_line_val[index]):
                        return -1

            hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
            # Field 3 is known to parse as a non-zero integer at this point,
            # so the division cannot fail on a zero denominator.
            if hashed_len / float(data_line_val[3]) > 0.999:
                return data_line_val
        except ValueError:
            # A non-numeric field makes the line invalid; it should not
            # abort processing of the remaining lines.
            return -1
        return -1
+
class TD_fingerprint(object):
    """Holds a TD value together with the string it is associated with."""

    def __init__(self, td=None, td_string=None):
        """Store the given values.

        Args:
            td: value kept as ``self.td``.
            td_string: value kept as ``self.td_string``.
        """
        self.td = td
        self.td_string = td_string

    @staticmethod
    def td_generate(td_string):
        """Return the hexadecimal MD5 digest of ``td_string``.

        Args:
            td_string: text (encoded as UTF-8 before hashing) or a byte
                string (hashed as-is).

        Returns:
            The 32-character lowercase hexadecimal digest.
        """
        # hashlib works on bytes; only text needs encoding. Byte strings are
        # passed through untouched so non-ASCII input is not re-decoded.
        if not isinstance(td_string, bytes):
            td_string = td_string.encode('utf-8')
        return hashlib.md5(td_string).hexdigest()
+
class sfh_fingerprint(object):
    """Wraps an SFH string and offers a helper that reads its ranges."""

    # Matches "[<digits>:<digits>]" (repeated '[' or ':' tolerated) and
    # captures the two numbers directly, so no positional splitting is needed.
    _RANGE_PATTERN = re.compile(r"\[+(\d+):+(\d+)\]")

    def __init__(self, sfh):
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the mean span of the "[start:end]" ranges found in ``sfh``.

        Each range contributes ``end - start``; the sum is divided by the
        number of ranges using integer (floor) division.

        Args:
            sfh: string to scan for "[start:end]" ranges.

        Returns:
            The integer mean span, or ``-1`` when ``sfh`` contains no range.
        """
        ranges = sfh_fingerprint._RANGE_PATTERN.findall(sfh)
        if not ranges:
            return -1
        total = sum(int(end) - int(start) for start, end in ranges)
        # Floor division keeps the result an integer on Python 2 and 3 alike.
        return total // len(ranges)
+
# Named single-argument checks, looked up by key by the validation code.
# Every entry except 'url' returns a boolean.
term = {
    # Value is exactly 32 characters long.
    'td_len': (lambda x: len(x) == 32),
    # Sequence has exactly 21 elements.
    'data_num': (lambda x: len(x) == 21),
    # Index of 'NUll' in the value, -1 when absent (str.find semantics).
    # NOTE(review): the spelling of the literal and whether callers want a
    # boolean here could not be confirmed from this file.
    'url': (lambda x: x.find('NUll')),
    # Length strictly between 20 and 10*1024-100.
    'sfh_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
    # Value is not empty.
    'not_null': (lambda x: len(x) != 0),
    # Value parses as an integer other than zero (int() raises ValueError
    # for non-numeric input).
    'ysp_len': (lambda x: int(x) != 0),
    # Length strictly between 20 and 10*1024-100.
    'line_len': (lambda x: 20 < len(x) < (10 * 1024 - 100)),
}
+
c_func = "./"
ripe_files = []

# Input and output paths are read from the [file] section of file_digest.conf.
config = ConfigParser.RawConfigParser()
config.read("file_digest.conf")
raw_file_address = config.get("file", "raw_file_address")
ripe_files_address = config.get("file", "ripe_files_address")
print("%s %s" % (raw_file_address, ripe_files_address))

# A disabled variant read "breakpoints" from the [output] section and opened
# one "<ripe_files_address><n>.txt" file per bucket, collecting the handles in
# ripe_files. Only the single-output path below is active.

# Copy every line that passes data_line.if_error from the raw file to the
# output file, reporting progress every 10000 lines read.
i = 0
with open(raw_file_address, 'r') as infile, \
        open(ripe_files_address, 'w') as outfile:
    for i, line in enumerate(infile, 1):
        if i % 10000 == 0:
            print(i)
        line_return = data_line.if_error(line)
        if line_return == -1:
            # Invalid line: leave it out of the output.
            continue
        outfile.write(str(line))