"""Build the "ripe" feature dataset from the raw record file.

Python 2 script.  It reads its file locations from ``dataset_build.conf``,
turns every ';'-separated raw record into a fixed row of 19 numeric-ish
features (see ``data_value.get_feature``) and writes one comma-separated row
per record to the configured output file.
"""
import re
import ConfigParser
import bisect
import random
import ctypes
import hashlib
import zlib
import binascii
import json
import datetime
import time

# Named field validators; each takes the raw string value of a field.
term = {
    'td_len': (lambda x: len(x) == 32),
    'data_num': (lambda x: len(x) == 21),
    # The original subscripted the bound method (x.find['NUll']), which raises
    # TypeError on every call; it is now called instead.  The result is
    # str.find's index (-1 when 'NUll' is absent), so it is truthy unless the
    # value starts with 'NUll'.
    # NOTE(review): confirm that this is the intended test.
    'url': (lambda x: x.find('NUll')),
    'sfh_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100)),
    'not_null': (lambda x: len(x) != 0),
    'ysp_len': (lambda x: int(x) != 0),
    'line_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100)),
}

config = ConfigParser.RawConfigParser()
config.read("dataset_build.conf")
raw_file_address = config.get("file", "raw_file_address")
ripe_file_address = config.get("file", "ripe_file_address")
base_sfh_sets = config.get("file", "base_sfh_sets")
breakpoints = [int(i) for i in config.get("output", "breakpoints").split(",")]
feature_list = [i for i in config.get("feature", "feature_name").split(",")]
ll = ctypes.cdll.LoadLibrary
lib = ll("libmaatframe.so")
lost = dict()

# Compiled once instead of on every record.
_META_TAG_RE = re.compile(
    r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:")
_RECORD_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
_LAST_MODIFY_FORMAT = '%a, %d %b %Y %H:%M:%S'
# Length of a date rendered with _LAST_MODIFY_FORMAT, e.g.
# 'Thu, 01 Jan 2015 00:00:00'; anything after it is cut off before parsing.
_LAST_MODIFY_LENGTH = 25
# Indices of the record fields that are only hashed with crc32.
_CRC_FIELD_INDICES = (7, 9, 11, 13, 15, 17)


class data_value(object):
    """Namespace for the record-to-feature conversion."""

    @staticmethod
    def get_feature(data):
        """Convert one split raw record into its list of 19 features.

        Args:
            data: sequence of the ';'-separated string fields of one record.
                Fields 1, 4, 5, 7, 9, 11, 13, 15 and 17 are read.  Field 5 is
                split on the tags ``URL:``, ``ServerIP:``, ``MediaType:``,
                ``MediaLen:``, ``Etag:`` and ``LastModify:``; the code assumes
                the tags occur in that order (so piece 1 is the URL and
                piece 6 the last-modified date) -- TODO confirm against the
                producer of the raw file.

        Returns:
            A list of 19 values, in this order:
              0-3   difference between the record time (field 1, shifted by
                    +8 hours) and the last-modified date: the timedelta's
                    ``seconds`` component as a string, that component in
                    whole minutes, in whole hours, and the ``days``
                    component.  All four are -1 when the date is empty or
                    either timestamp cannot be parsed.
              4     field 4 as an integer.
              5     length of the URL piece, or -1 when empty.
              6-9   the first four '.'-separated parts of the server IP (as
                    strings), or four -1 when empty.
              10    metadata piece 3 as an integer, or -1 when empty.
              11    crc32 of metadata piece 5, or -1 when empty.
              12    crc32 of metadata piece 6, or -1 when empty.
              13-18 crc32 of fields 7, 9, 11, 13, 15 and 17.

        Raises:
            IndexError: if the record has fewer than 18 fields, field 5
                yields fewer than 7 pieces, or the server IP has fewer than
                four '.'-separated parts.
            ValueError: if field 4 or a non-empty metadata piece 3 is not an
                integer literal.
        """
        not_null = term['not_null']
        meta = _META_TAG_RE.split(data[5])
        features = list()

        # Features 0-3: age of the resource relative to the record time.
        if not_null(meta[6]):
            try:
                # NOTE(review): the reason for the 8-hour shift is not visible
                # in this file -- presumably a time-zone correction; confirm.
                record_time = (
                    datetime.datetime.strptime(data[1], _RECORD_TIME_FORMAT)
                    + datetime.timedelta(hours=8))
                # The truncated value is stored back on purpose: the crc32 of
                # this piece (feature 12) is computed from it further down.
                meta[6] = meta[6][0:_LAST_MODIFY_LENGTH]
                modified_time = datetime.datetime.strptime(
                    meta[6], _LAST_MODIFY_FORMAT)
            except Exception:
                # Best effort: unparseable timestamps become "missing".
                features.extend([-1] * 4)
            else:
                age = record_time - modified_time
                # timedelta.seconds is only the sub-day remainder; whole days
                # are reported separately in age.days.  The first value stays
                # a string, as callers have always received it.
                features.append(str(age.seconds))
                features.append(age.seconds // 60)
                features.append(age.seconds // 3600)
                features.append(age.days)
        else:
            features.extend([-1] * 4)

        # Feature 4.
        features.append(long(data[4]))

        # Features 5-12: derived from the tagged metadata pieces.
        features.append(len(meta[1]) if not_null(meta[1]) else -1)
        if not_null(meta[2]):
            octets = meta[2].split('.')
            # Explicit indexing keeps the IndexError on a malformed address
            # instead of silently producing a short, misaligned row.
            features.extend([octets[0], octets[1], octets[2], octets[3]])
        else:
            features.extend([-1] * 4)
        features.append(int(meta[3]) if not_null(meta[3]) else -1)
        for piece in (meta[5], meta[6]):
            features.append(binascii.crc32(piece) if not_null(piece) else -1)

        # Features 13-18: plain checksums of the remaining record fields.
        for index in _CRC_FIELD_INDICES:
            features.append(binascii.crc32(data[index]))

        return features


sfh_set = list()
with open(raw_file_address, 'r') as infile:
    with open(ripe_file_address, 'w') as outfile:
        for line_number, line in enumerate(infile, 1):
            # Progress indicator every 10000 input lines.
            if line_number % 10000 == 0:
                print(line_number)
            line_return = line.split(';')
            return_data = data_value.get_feature(line_return)
            # Build the whole row before writing so that a record which fails
            # feature extraction never leaves a truncated row in the output.
            row = [str(line_return[0])]
            row.extend(str(value) for value in return_data)
            outfile.write(','.join(row) + '\n')