diff options
Diffstat (limited to 'src/rssb_statistics/find_lost_td.py')
| -rw-r--r-- | src/rssb_statistics/find_lost_td.py | 147 |
1 files changed, 147 insertions, 0 deletions
"""Compare the MIDs and TDs found in two survey logs (runs "12" and "13").

All paths come from the ``[file]`` section of ``find_lost_td.conf``.  The
script:

1. collects the MIDs of each raw survey file and writes them out;
2. writes the MIDs that occur in only one of the two runs;
3. scans ``raw_deup_file`` and attaches every TD to the MID it was logged
   with, for MIDs known from either run;
4. writes the unique, non-empty TDs of each run and the TDs that occur in
   only one of the two runs;
5. prints the sizes of all these collections on one line.

The code runs unchanged on Python 2.7 and Python 3.
"""
import re

try:
    import configparser  # Python 3 module name
except ImportError:  # Python 2
    import ConfigParser as configparser

CONFIG_FILE = "find_lost_td.conf"

# A usable survey line has exactly this many comma-separated fields.
SURVEY_FIELD_COUNT = 8

# The scan of raw_deup_file reports progress every this many lines.
PROGRESS_INTERVAL = 100000

# Field separator of raw_deup_file lines; compiled once, used per line.
DEDUP_SPLIT = re.compile(r",|MID:|TD:")

# Named validity checks.  Only 'data_num' and 'not_null' are used below.
term = {'td_len': (lambda x: len(x) == 32),
        'data_num': (lambda x: len(x) > 7),
        # The original subscripted the method (x.find['NUll']), which raises
        # TypeError when called.  This returns the index of 'NUll' in x, or
        # -1 when absent.  NOTE(review): the intended truth value (and the
        # spelling of the literal) cannot be determined from this file.
        'url': (lambda x: x.find('NUll')),
        'sfh_len': (lambda x: len(x) > 20),
        'not_null': (lambda x: len(x) != 0)}


def parse_survey_mid(line):
    """Return the MID of a raw survey line, or None if it has none.

    A line qualifies when it has exactly SURVEY_FIELD_COUNT comma-separated
    fields and its third field contains the marker ``MID:``; the MID is the
    text following the first marker (up to the next marker, if any).
    """
    fields = line.split(",")
    if len(fields) != SURVEY_FIELD_COUNT:
        return None
    parts = fields[2].split("MID:")
    if len(parts) < 2:
        # No marker in the field: skip the line instead of raising
        # IndexError as the unguarded ``[1]`` lookup would.
        return None
    return parts[1]


def collect_mids(lines):
    """Map every MID found in *lines* to a fresh, empty list of TDs."""
    mids = {}
    for line in lines:
        mid = parse_survey_mid(line)
        if mid is not None:
            mids[mid] = []
    return mids


def attach_tds(lines, mid_maps, progress_every=PROGRESS_INTERVAL):
    """Append the TD of each line to its MID's list in every map that has it.

    Each line is split on ``,``, ``MID:`` and ``TD:``; lines yielding more
    than seven pieces are used, with piece 4 taken as the MID and piece 6 as
    the TD.  Empty TDs are appended as well; they are filtered out later by
    unique_tds().  A line counter is printed every *progress_every* lines.
    """
    has_enough_fields = term['data_num']
    for count, line in enumerate(lines, 1):
        if count % progress_every == 0:
            print(count)
        fields = DEDUP_SPLIT.split(line)
        if not has_enough_fields(fields):
            continue
        mid, td = fields[4], fields[6]
        for mid_map in mid_maps:
            if mid in mid_map:
                mid_map[mid].append(td)


def unique_tds(mid_map):
    """Return the distinct non-empty TDs of *mid_map*, first occurrence first.

    A set tracks what was already emitted, so the pass is linear instead of
    rescanning the result list for every TD.
    """
    is_present = term['not_null']
    seen = set()
    ordered = []
    for tds in mid_map.values():
        for td in tds:
            if is_present(td) and td not in seen:
                seen.add(td)
                ordered.append(td)
    return ordered


def missing_from(items, others):
    """Return the elements of *items* not contained in *others*, in order."""
    if isinstance(others, (set, frozenset, dict)):
        lookup = others
    else:
        lookup = set(others)
    return [item for item in items if item not in lookup]


def write_lines(path, items):
    """Write every item of *items* to *path*, one per line."""
    with open(path, 'w') as outfile:
        outfile.writelines(item + '\n' for item in items)


def read_mids(path):
    """Collect the MIDs of the raw survey file at *path*."""
    with open(path, 'r') as infile:
        return collect_mids(infile)


def main():
    """Run the whole comparison as described in the module docstring."""
    config = configparser.RawConfigParser()
    config.read(CONFIG_FILE)

    # Resolve every path up front so a missing option fails before any
    # output file is created or truncated.
    raw_survey_file_13 = config.get("file", "raw_survey_file_13")
    raw_deup_file = config.get("file", "raw_deup_file")
    raw_survey_file_12 = config.get("file", "raw_survey_file_12")
    mid_12_file = config.get("file", "mid_12_file")
    mid_13_file = config.get("file", "mid_13_file")
    list_12_file = config.get("file", "list_12_file")
    list_13_file = config.get("file", "list_13_file")
    different_mid_file_13 = config.get("file", "different_mid_file_13")
    different_mid_file_12 = config.get("file", "different_mid_file_12")
    different_list_file_13 = config.get("file", "different_list_file_13")
    different_list_file_12 = config.get("file", "different_list_file_12")

    mid_13 = read_mids(raw_survey_file_13)
    write_lines(mid_13_file, mid_13)

    mid_12 = read_mids(raw_survey_file_12)
    write_lines(mid_12_file, mid_12)

    different_mid_13 = missing_from(mid_13, mid_12)
    write_lines(different_mid_file_13, different_mid_13)

    different_mid_12 = missing_from(mid_12, mid_13)
    write_lines(different_mid_file_12, different_mid_12)

    with open(raw_deup_file, 'r') as infile:
        attach_tds(infile, (mid_13, mid_12))

    td_list_13 = unique_tds(mid_13)
    write_lines(list_13_file, td_list_13)

    td_list_12 = unique_tds(mid_12)
    write_lines(list_12_file, td_list_12)

    different_list_12 = missing_from(td_list_12, td_list_13)
    write_lines(different_list_file_12, different_list_12)

    different_list_13 = missing_from(td_list_13, td_list_12)
    write_lines(different_list_file_13, different_list_13)

    # NOTE: a disabled follow-up step used to live here.  It scanned the
    # file named by the "run_time_file" option for NCHK_QUREY__KNOW,
    # NCHK_QUREY__UNKNOW and NCHK_REPORT__SUCC lines belonging to the TDs in
    # different_list_12 and wrote them to the "lost_td_line" file.  It was
    # commented out in the original and is not executed.

    counts = (len(different_list_12), len(different_list_13),
              len(td_list_12), len(td_list_13),
              len(mid_12), len(mid_13),
              len(different_mid_13), len(different_mid_12))
    # One space-separated line, identical under Python 2 and Python 3.
    print(" ".join(str(count) for count in counts))


if __name__ == "__main__":
    main()
