diff options
Diffstat (limited to 'src/get_td_mistake_lost/new_TD.py')
| -rw-r--r-- | src/get_td_mistake_lost/new_TD.py | 34 |
1 files changed, 34 insertions, 0 deletions
#-*-coding:utf-8-*-
# File: src/get_td_mistake_lost/new_TD.py
"""Rebuild TD strings from a raw ';'-separated log and write them with their MD5.

The input and output paths are read from section ``new_td`` of
``file_digest.conf`` (keys ``raw_file_address`` and ``ripe_files_address``).
For every input line one output line is written in the form::

    <td_string>;<md5 hex digest of td_string>;<field 19 of the input line>
"""
import bisect
import hashlib
import os
import random
import re
import sys

try:
    import ConfigParser  # Python 2 module name
except ImportError:
    import configparser as ConfigParser  # Python 3 module name

try:
    import commands  # Python 2 only; not used below, kept from the original
except ImportError:
    commands = None

# Keys that delimit the sub-fields inside field 4 of a raw line. Compiled once
# instead of being looked up again for every input line.
_KEY_SPLITTER = re.compile(
    r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:")

# Progress is reported every this many input lines.
_PROGRESS_EVERY = 100000


def get_md5_value(td_string):
    """Return the hexadecimal MD5 digest of *td_string*.

    Text input is encoded as UTF-8 before hashing so the function also works
    where ``str`` is not a byte string (Python 3); byte strings are hashed
    unchanged.
    """
    if not isinstance(td_string, bytes):
        td_string = td_string.encode("utf-8")
    my_md5 = hashlib.md5()
    my_md5.update(td_string)
    return str(my_md5.hexdigest())


def build_td_record(line):
    """Build the output record for one raw input *line*.

    The line is split on ';'. Field 4 is split on the keys in
    ``_KEY_SPLITTER``; pieces 1, 3, 4, 5 and 6 of that split plus field 16 of
    the line form the TD string.

    Returns:
        ``td_string + ";" + md5(td_string) + ";" + field 19 + "\\n"``.

    Raises:
        IndexError: the line has fewer than 20 ';'-separated fields, or
            field 4 yields fewer than 7 pieces.
    """
    data_line_val = line.split(";")
    data_set = _KEY_SPLITTER.split(data_line_val[4])
    # Piece 2 is deliberately left out of the TD string.
    # NOTE(review): presumably that piece is the ServerIP value, assuming the
    # keys appear in the order listed in the pattern - confirm against the data.
    # The leading "url" (not "URL:") is kept exactly as in the original because
    # it is part of the hashed text.
    td_string = "".join([
        "url", data_set[1],
        "MediaType:", data_set[3],
        "MediaLen:", data_set[4],
        "Etag:", data_set[5],
        "LastModify:", data_set[6],
        "td_data_md5_32k:", data_line_val[16],
    ])
    new_td = get_md5_value(td_string)
    # NOTE: if field 19 is the last field of the line it still carries the
    # line's own newline, so the record then ends with two newlines. This is
    # unchanged from the original behavior.
    return td_string + ";" + new_td + ";" + data_line_val[19] + "\n"


def main():
    """Read the configured raw file and write one TD record per valid line."""
    config = ConfigParser.RawConfigParser()
    config.read("file_digest.conf")
    raw_file_address = config.get("new_td", "raw_file_address")
    ripe_files_address = config.get("new_td", "ripe_files_address")
    print("%s %s" % (raw_file_address, ripe_files_address))

    skipped = 0
    with open(raw_file_address, 'r') as infile:
        with open(ripe_files_address, 'w') as outfile:
            for i, line in enumerate(infile, 1):
                if i % _PROGRESS_EVERY == 0:
                    print(i)
                try:
                    record = build_td_record(line)
                except IndexError:
                    # A short or malformed line (e.g. a trailing blank line)
                    # must not abort the run and truncate the output.
                    skipped += 1
                    continue
                outfile.write(record)
    if skipped:
        sys.stderr.write("skipped %d malformed line(s)\n" % skipped)


if __name__ == "__main__":
    main()
\ No newline at end of file |
