path: root/src/rssb_statistics/find_lost_td.py
Diffstat (limited to 'src/rssb_statistics/find_lost_td.py')
-rw-r--r--  src/rssb_statistics/find_lost_td.py  147
1 files changed, 147 insertions, 0 deletions
diff --git a/src/rssb_statistics/find_lost_td.py b/src/rssb_statistics/find_lost_td.py
new file mode 100644
index 0000000..50f3fab
--- /dev/null
+++ b/src/rssb_statistics/find_lost_td.py
@@ -0,0 +1,147 @@
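+# Compare the MIDs and TDs recorded in two raw survey inputs (the "12" and
+# "13" files), write out the MID/TD sets seen on each side, and report the
+# entries that appear in only one of the two files ("lost" TDs).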
+import ConfigParser
+import re
+
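+# All input/output paths come from the [file] section of find_lost_td.conf,
+# e.g. (illustrative values only):
+#   [file]
+#   raw_survey_file_13 = raw_survey_13.txt
+#   raw_survey_file_12 = raw_survey_12.txt
+#   raw_deup_file      = raw_dedup.txt
+#   mid_13_file        = mid_13.txt
+#   ...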
+config = ConfigParser.RawConfigParser()
+config.read("find_lost_td.conf")
+raw_survey_file_13 = config.get("file","raw_survey_file_13")
+raw_deup_file = config.get("file","raw_deup_file")
+# run_time_file = config.get("file","run_time_file")
+raw_survey_file_12 = config.get("file","raw_survey_file_12")
+# lost_td_line = config.get("file","lost_td_line")
+mid_12_file = config.get("file","mid_12_file")
+mid_13_file = config.get("file","mid_13_file")
+list_12_file = config.get("file","list_12_file")
+list_13_file = config.get("file","list_13_file")
+different_mid_file_13 = config.get("file","different_mid_file_13")
+different_mid_file_12 = config.get("file","different_mid_file_12")
+different_list_file_13 = config.get("file","different_list_file_13")
+different_list_file_12 = config.get("file","different_list_file_12")
+
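+# Field validation predicates, keyed by check name; only 'data_num' and
+# 'not_null' are used below.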
+term = {'td_len':   (lambda x: len(x) == 32),
+        'data_num': (lambda x: len(x) > 7),
+        'url':      (lambda x: x.find('NUll') != -1),
+        'sfh_len':  (lambda x: len(x) > 20),
+        'not_null': (lambda x: len(x) != 0)}
+
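+# Collect every MID that appears in the "13" raw survey file.  Valid records
+# split into exactly 8 comma-separated fields, with the MID embedded in the
+# third field as "MID:<value>".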
+mid_13 = dict()
+with open(raw_survey_file_13, 'r') as infile:
+    for line in infile:
+        data_line_val = re.split(r',', line)
+        if len(data_line_val) == 8:
+            mid_string = re.split(r"MID:", data_line_val[2])[1]
+            mid_13[mid_string] = list()
+
+
+with open(mid_13_file, 'w') as outfile:
+    for key in mid_13:
+        outfile.write(key + '\n')
+
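+# Same pass over the "12" raw survey file.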
+mid_12 = dict()
+with open(raw_survey_file_12, 'r') as infile:
+    for line in infile:
+        data_line_val = re.split(r',', line)
+        if len(data_line_val) == 8:
+            mid_string = re.split(r"MID:", data_line_val[2])[1]
+            mid_12[mid_string] = list()
+
+with open(mid_12_file, 'w') as outfile:
+    for key in mid_12:
+        outfile.write(key + '\n')
+
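+# MIDs that appear in one survey file but not the other.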
+different_mid_13 = list()
+with open(different_mid_file_13, 'w') as outfile:
+    for key in mid_13:
+        if key not in mid_12:
+            different_mid_13.append(key)
+            outfile.write(key + '\n')
+
+different_mid_12 = list()
+with open(different_mid_file_12, 'w') as outfile:
+    for key in mid_12:
+        if key not in mid_13:
+            different_mid_12.append(key)
+            outfile.write(key + '\n')
+
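+# Walk the de-duplicated raw file once.  After splitting on ',', 'MID:' and
+# 'TD:', index 4 of the resulting list is the MID and index 6 is the TD;
+# attach each TD to its MID in both maps.  Progress is printed every 100000 lines.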
+i = 0
+with open(raw_deup_file, 'r') as infile:
+    for line in infile:
+        i += 1
+        if i % 100000 == 0:
+            print i
+        data_line_val = re.split(r",|MID:|TD:", line)
+        if term['data_num'](data_line_val) and data_line_val[4] in mid_13:
+            mid_13[data_line_val[4]].append(data_line_val[6])
+        if term['data_num'](data_line_val) and data_line_val[4] in mid_12:
+            mid_12[data_line_val[4]].append(data_line_val[6])
+
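+# Build de-duplicated TD lists for each side and write them out.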
+td_list_13 = list()
+with open(list_13_file, 'w') as outfile:
+    for key in mid_13:
+        for td in mid_13[key]:
+            if term['not_null'](td) and td not in td_list_13:
+                td_list_13.append(td)
+                outfile.write(td + '\n')
+
+td_list_12 = list()
+with open(list_12_file, 'w') as outfile:
+    for key in mid_12:
+        for td in mid_12[key]:
+            if term['not_null'](td) and td not in td_list_12:
+                td_list_12.append(td)
+                outfile.write(td + '\n')
+
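+# TDs that appear on only one side.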
+different_list_12 = list()
+with open(different_list_file_12, 'w') as outfile:
+    for x in td_list_12:
+        if x not in td_list_13:
+            different_list_12.append(x)
+            outfile.write(x + '\n')
+
+different_list_13 = list()
+with open(different_list_file_13, 'w') as outfile:
+    for x in td_list_13:
+        if x not in td_list_12:
+            different_list_13.append(x)
+            outfile.write(x + '\n')
+
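+# Index the "12-only" TDs so the (currently disabled) run-time log scan below
+# can attach matching log lines to each lost TD.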
+td_dict = dict()
+for i in different_list_12:
+    td_dict[i] = list()
+
+# i = 0
+# with open(run_time_file, 'r') as infile:
+#     for line in infile:
+#         i += 1
+#         if i % 100000 == 0:
+#             print i
+#         if line.find("NCHK_QUREY__KNOW") != -1:
+#             data_line_val = re.split(r',|TD:', line)
+#             if data_line_val[6] in td_dict:
+#                 td_dict[data_line_val[6]].insert(0, "NCHK_QUREY__KNOW" + '\n')
+#                 td_dict[data_line_val[6]].append(line)
+#         elif line.find("NCHK_QUREY__UNKNOW") != -1:
+#             data_line_val = re.split(r',|TD:', line)
+#             if data_line_val[6] in td_dict:
+#                 td_dict[data_line_val[6]].append(line)
+#         elif line.find("NCHK_REPORT__SUCC") != -1:
+#             data_line_val = re.split(r',|TD:', line)
+#             if data_line_val[6] in td_dict:
+#                 td_dict[data_line_val[6]].append(line)
+#         else:
+#             continue
+
+
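+# Summary counts: 12-only TDs, 13-only TDs, total TDs per side, total MIDs
+# per side, and the MIDs unique to each side.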
+print len(different_list_12), len(different_list_13), \
+      len(td_list_12), len(td_list_13), \
+      len(mid_12), len(mid_13), len(different_mid_13), len(different_mid_12)
+
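+# Disabled: write out, per lost TD, the run-time log lines collected above
+# (only for TDs whose first entry is the NCHK_QUREY__KNOW marker).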
+# with open(lost_td_line, 'w') as outfile:
+#     for key in td_dict.keys():
+#         if len(td_dict[key]) > 2 and td_dict[key][0] == "NCHK_QUREY__KNOW":
+#             outfile.write(key + ':\n')
+#             for i in td_dict[key]:
+#                 outfile.write(i)
+