"""Compare the MIDs and TDs found in the "12" and "13" survey files.

Pipeline (every path comes from the ``[file]`` section of the config file):

1. Collect the MIDs listed in each raw survey file and write them out.
2. Write the MIDs that appear in only one of the two surveys.
3. Scan the de-duplicated log and attach each TD to the MID on its line.
4. Write the distinct non-empty TDs per survey, and the TDs that appear
   in only one of the two surveys.
5. Print the eight summary counts on one line.

Run as a script to use ``find_lost_td.conf`` from the current directory,
or call ``main(config_path)`` with an explicit configuration file.
"""
import re

try:
    import ConfigParser  # Python 2 module name
except ImportError:  # Python 3 renamed the module
    import configparser as ConfigParser

DEFAULT_CONFIG_PATH = "find_lost_td.conf"

# A progress counter is printed every PROGRESS_INTERVAL lines of the log.
PROGRESS_INTERVAL = 100000

# Options read from the [file] section, in the order they are resolved.
_PATH_OPTIONS = (
    "raw_survey_file_13",
    "raw_deup_file",
    "raw_survey_file_12",
    "mid_12_file",
    "mid_13_file",
    "list_12_file",
    "list_13_file",
    "different_mid_file_13",
    "different_mid_file_12",
    "different_list_file_13",
    "different_list_file_12",
)

# Field separators of a de-duplicated log line.
_DEUP_SPLIT = re.compile(r",|MID:|TD:")

# Named predicates on parsed values.  Only 'data_num' and 'not_null' are
# used by the pipeline below; the others are kept for compatibility.
term = {
    'td_len': (lambda x: len(x) == 32),
    'data_num': (lambda x: len(x) > 7),
    # Fixed: the original subscripted the method (x.find['NUll']), which
    # raises TypeError.  NOTE: str.find returns an index (-1 when absent),
    # not a boolean -- compare the result explicitly.
    'url': (lambda x: x.find('NUll')),
    'sfh_len': (lambda x: len(x) > 20),
    'not_null': (lambda x: len(x) != 0),
}


def _load_mids(survey_path):
    """Return ``{mid: []}`` for every MID found in a raw survey file.

    Only lines with exactly eight comma-separated fields are considered;
    the MID is the text following ``MID:`` in the third field.  Lines whose
    third field has no ``MID:`` marker are skipped instead of raising
    IndexError.
    """
    mids = {}
    with open(survey_path, 'r') as infile:
        for line in infile:
            fields = line.split(',')
            if len(fields) != 8:
                continue
            pieces = fields[2].split("MID:")
            if len(pieces) < 2:
                continue
            mids[pieces[1]] = []
    return mids


def _write_lines(path, items):
    """Write every item of *items* to *path*, one per line."""
    with open(path, 'w') as outfile:
        outfile.writelines(item + '\n' for item in items)


def _only_in(first, second):
    """Return the items of *first* absent from *second*, in *first*'s order."""
    excluded = set(second)  # set membership instead of repeated list scans
    return [item for item in first if item not in excluded]


def _attach_tds(deup_path, mid_tables):
    """Append each TD of the de-duplicated log to its MID's list.

    A line is used when it splits into more than seven fields; field 4 is
    the MID and field 6 the TD.  The TD is appended in every table of
    *mid_tables* that already contains the MID.
    """
    with open(deup_path, 'r') as infile:
        for line_no, line in enumerate(infile, 1):
            if line_no % PROGRESS_INTERVAL == 0:
                print(line_no)
            fields = _DEUP_SPLIT.split(line)
            if not term['data_num'](fields):
                continue
            mid, td = fields[4], fields[6]
            for table in mid_tables:
                if mid in table:
                    table[mid].append(td)


def _unique_tds(mid_table):
    """Return the distinct non-empty TDs of *mid_table* in first-seen order."""
    seen = set()
    ordered = []
    for mid in mid_table:
        for td in mid_table[mid]:
            if term['not_null'](td) and td not in seen:
                seen.add(td)
                ordered.append(td)
    return ordered


def main(config_path=DEFAULT_CONFIG_PATH):
    """Run the whole comparison and print the summary counts.

    Args:
        config_path: configuration file holding the ``[file]`` section.

    Returns:
        The eight printed counts as a tuple: TDs only in 12, TDs only in
        13, TDs in 12, TDs in 13, MIDs in 12, MIDs in 13, MIDs only in 13,
        MIDs only in 12.

    Raises:
        ConfigParser.NoSectionError, ConfigParser.NoOptionError: when the
            configuration lacks the ``[file]`` section or one of its
            options (an unreadable config file surfaces the same way).
        IOError: when an input or output file cannot be opened.
    """
    config = ConfigParser.RawConfigParser()
    config.read(config_path)
    # Resolve every path up front so a bad configuration fails before any
    # output file is touched.
    paths = dict((opt, config.get("file", opt)) for opt in _PATH_OPTIONS)

    mid_13 = _load_mids(paths["raw_survey_file_13"])
    _write_lines(paths["mid_13_file"], mid_13)
    mid_12 = _load_mids(paths["raw_survey_file_12"])
    _write_lines(paths["mid_12_file"], mid_12)

    different_mid_13 = _only_in(mid_13, mid_12)
    _write_lines(paths["different_mid_file_13"], different_mid_13)
    different_mid_12 = _only_in(mid_12, mid_13)
    _write_lines(paths["different_mid_file_12"], different_mid_12)

    _attach_tds(paths["raw_deup_file"], (mid_13, mid_12))

    td_list_13 = _unique_tds(mid_13)
    _write_lines(paths["list_13_file"], td_list_13)
    td_list_12 = _unique_tds(mid_12)
    _write_lines(paths["list_12_file"], td_list_12)

    different_list_12 = _only_in(td_list_12, td_list_13)
    _write_lines(paths["different_list_file_12"], different_list_12)
    different_list_13 = _only_in(td_list_13, td_list_12)
    _write_lines(paths["different_list_file_13"], different_list_13)

    counts = (
        len(different_list_12), len(different_list_13),
        len(td_list_12), len(td_list_13),
        len(mid_12), len(mid_13),
        len(different_mid_13), len(different_mid_12),
    )
    # Space-separated on one line, matching the original print statement.
    print(" ".join(str(count) for count in counts))

    # ------------------------------------------------------------------
    # Disabled stage, kept commented out as in the original script.  It
    # needs two more options from the [file] section:
    #
    # run_time_file = config.get("file", "run_time_file")
    # lost_td_line = config.get("file", "lost_td_line")
    #
    # td_dict = dict((td, list()) for td in different_list_12)
    # i = 0
    # with open(run_time_file, 'r') as infile:
    #     for line in infile:
    #         i += 1
    #         if i % 100000 == 0:
    #             print(i)
    #         if line.find("NCHK_QUREY__KNOW") != -1:
    #             data_line_val = re.split(r',|TD:', line)
    #             if data_line_val[6] in td_dict:
    #                 td_dict[data_line_val[6]].insert(0, "NCHK_QUREY__KNOW" + '\n')
    #                 td_dict[data_line_val[6]].append(line)
    #         elif line.find("NCHK_QUREY__UNKNOW") != -1:
    #             data_line_val = re.split(r',|TD:', line)
    #             if data_line_val[6] in td_dict:
    #                 td_dict[data_line_val[6]].append(line)
    #         elif line.find("NCHK_REPORT__SUCC") != -1:
    #             data_line_val = re.split(r',|TD:', line)
    #             if data_line_val[6] in td_dict:
    #                 td_dict[data_line_val[6]].append(line)
    #         else:
    #             continue
    #
    # with open(lost_td_line, 'w') as outfile:
    #     for key in td_dict.keys():
    #         if len(td_dict[key]) > 2 and td_dict[key][0] == "NCHK_QUREY__KNOW":
    #             outfile.write(key + ':\n')
    #             for i in td_dict[key]:
    #                 outfile.write(i)
    # ------------------------------------------------------------------
    return counts


if __name__ == "__main__":
    main()