diff options
Diffstat (limited to 'src/get_td_mistake_lost')
| -rw-r--r-- | src/get_td_mistake_lost/CMakeLists.txt | 11 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/file_digest.conf | 6 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/file_digest.py | 104 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/get_TD_SFH.c | 162 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/get_lost_rate.c | 210 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/get_mistake_level.c | 366 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/get_td_mistake_lost.sh | 5 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/gram_index_engine.c | 1354 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/new_TD.conf | 3 | ||||
| -rw-r--r-- | src/get_td_mistake_lost/new_TD.py | 34 |
10 files changed, 2255 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Filter raw digest records, keeping only the well-formed lines.

Each input line is a ``;``-separated record that must have exactly 21 fields.
A record is kept when every field passes the checks in ``term`` and the
``[start:end]`` ranges found in field 19 (the SFH string) give a hashed length
greater than 99.9% of the number in field 3.  Input and output paths are read
from the ``[file_digest]`` section of ``file_digest.conf``.

The script was written for Python 2; this version also runs on Python 3.
The filtering only runs when the file is executed as a script, so the classes
can be imported without touching the configuration or data files.
"""
import re
import random
import bisect
import os
import hashlib

try:  # Python 2 module name
    import ConfigParser
except ImportError:  # Python 3 renamed the module
    import configparser as ConfigParser

try:  # Python 2 only; kept because other parts of the project may expect it
    import commands
except ImportError:
    commands = None


class data_line(object):
    """Validation helpers for one ``;``-separated record."""

    def __init__(self):
        super(data_line, self).__init__()

    @staticmethod
    def if_error(data_line_str):
        """Validate one raw record.

        Returns the list of fields when the record is acceptable, otherwise
        ``-1``.  Records with the wrong number of fields or a non-numeric
        field 3 are rejected instead of raising.
        """
        fields = re.split(r';', data_line_str)
        # Check the field count first so the indexed accesses below are safe.
        if not term['data_num'](fields):
            return -1
        try:
            well_formed = (
                all(term['not_null'](fields[i]) for i in (0, 1, 2, 4, 5, 18, 20))
                and term['ysp_len'](fields[3])
                and all(term['td_len'](fields[i]) for i in range(6, 18))
                and term['sfh_len'](fields[19])
            )
            if not well_formed:
                return -1
            hashed_len = sfh_fingerprint.get_hashed_len(fields[19])
            covered = hashed_len / float(fields[3]) > 0.999
        except ValueError:
            # Field 3 is not a number.
            return -1
        return fields if covered else -1


class TD_fingerprint(object):
    """Holder for a TD value and the string it was derived from."""

    def __init__(self, td=None, td_string=None):
        self.td = td
        self.td_string = td_string

    @staticmethod
    def td_generate(td_string):
        """Return the hexadecimal MD5 digest of *td_string* (UTF-8 encoded)."""
        if not isinstance(td_string, bytes):
            td_string = td_string.encode('utf-8')
        return hashlib.md5(td_string).hexdigest()


class sfh_fingerprint(object):
    """Wrapper around an SFH string."""

    # One "[start:end]" range inside an SFH string.
    _RANGE = re.compile(r"\[+(\d+):+(\d+)\]")

    def __init__(self, sfh):
        self.sfh = sfh

    @staticmethod
    def get_hashed_len(sfh):
        """Return the summed width of the ranges in *sfh* divided by their count.

        The width of a range is ``end - start``.  The division is an integer
        division.  Returns ``-1`` when *sfh* contains no ``[start:end]`` range.
        """
        ranges = sfh_fingerprint._RANGE.findall(sfh)
        if not ranges:
            return -1
        total = sum(int(end) - int(start) for start, end in ranges)
        return total // len(ranges)


# Field predicates used by data_line.if_error.
term = {'td_len': (lambda x: len(x) == 32),
        'data_num': (lambda x: len(x) == 21),
        # NOTE(review): returns the index from str.find (-1 when absent), not a
        # boolean; the entry is unused here, so confirm the intended check.
        'url': (lambda x: x.find('NUll')),
        'sfh_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100)),
        'not_null': (lambda x: len(x) != 0),
        'ysp_len': (lambda x: int(x) != 0),
        'line_len': (lambda x: len(x) > 20 and len(x) < (10 * 1024 - 100))}

c_func = "./"
ripe_files = []


def main():
    """Copy every valid record from the raw file to the ripe file."""
    config = ConfigParser.RawConfigParser()
    config.read("file_digest.conf")
    raw_file_address = config.get("file_digest", "raw_file_address")
    ripe_files_address = config.get("file_digest", "ripe_files_address")
    print("%s %s" % (raw_file_address, ripe_files_address))

    with open(raw_file_address, 'r') as infile, \
            open(ripe_files_address, 'w') as outfile:
        for line_no, line in enumerate(infile, 1):
            if line_no % 10000 == 0:
                print(line_no)  # progress indicator
            if data_line.if_error(line) != -1:
                outfile.write(line)


if __name__ == "__main__":
    main()
\ No newline at end of file diff --git a/src/get_td_mistake_lost/get_TD_SFH.c b/src/get_td_mistake_lost/get_TD_SFH.c new file mode 100644 index 0000000..2ed3ecd --- /dev/null +++ b/src/get_td_mistake_lost/get_TD_SFH.c @@ -0,0 +1,162 @@ +/* +gcc -g get_TD_SFH.c -o get_TD_SFH -lmaatframe -lMESA_htable -I../include +./get_mistake_level ../data/ripe_data/td_data_20171207/all_av_digest_mistake_level +*/ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <MESA/MESA_htable.h> +#include <assert.h> +#include <ctype.h> +#define BUFFER_LEN (15*1024) +#define SFH_LEN (10*1024) +#define TD_LEN 33 +#define THREAD_SAFE 0 +#define SLOT_SIZE (1024*1024*16) +#define TD_STR_LEN (10*1024) +#define TIME_STR_LEN 128 + +typedef struct sfh_link +{ + // char *time_str; + char *sfh_str; + char *td_ori; + // char *md5_32k; + int similiar; + int all_similiar; + // long hash_len; + struct sfh_link *next; +}sfh_link; + +typedef struct sfh +{ + int all_num; + int all_similiar; + char *sfh_str; + // long hash_len; + sfh_link *sfh_link_items; +}sfh; + +void print_td_sfh(const uchar *key,uint size,void *data,void *arg) +{ + FILE *ripe_file=(FILE*)arg; + sfh *temp_sfh=(sfh*)data; + fprintf(ripe_file,"%s;%s;%s",key,temp_sfh->sfh_link_items->td_ori,temp_sfh->sfh_str); +} + +int main() +{ + FILE *raw_file; + FILE *ripe_file; + char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt"; + char *ripe_file_dir="../../data/ripe_data/td_data_20171207/TD_SFH_3"; + raw_file = fopen(raw_file_dir,"r+"); + ripe_file = fopen(ripe_file_dir,"w+"); + if(raw_file==NULL) + { + printf("open all_av_digest error\n"); + return -1; + } + if(ripe_file==NULL) + { + printf("open all_av_digest_mistake_level error"); + return -1; + } + MESA_htable_handle htable=NULL; + char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL,*md5_32k_str=NULL,*time_str=NULL; + int i=0,thread_safe=THREAD_SAFE,ret=0,temp_mistake=0,temp_similiar=0,temp_all_similiar=0; + unsigned int 
slot_size=SLOT_SIZE; + sfh *temp_sfh=NULL; + sfh_link *temp_sfh_link=NULL; + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + td = (char*)calloc(TD_LEN,sizeof(char)); + td[32]='\0'; + td_str = (char*)calloc(TD_STR_LEN,sizeof(char)); + // md5_32k_str = (char*)calloc(TD_LEN,sizeof(char)); + // time_str = (char*)calloc(TIME_STR_LEN,sizeof(char)); + // time_str[TIME_STR_LEN-1]='\0'; + // md5_32k_str[32]='\0'; + htable=MESA_htable_born(); + MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int)); + MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int)); + MESA_htable_mature(htable); + while(feof(raw_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %[^;];%[^;];%*[^;];%[^;];%*[^;]",time_str,td_str,md5_32k_str,td,sfh_str); + // assert(ret==5); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + td[32]='\0'; + // md5_32k_str[32]='\0'; + if((temp_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL) + { + temp_sfh=(sfh*)calloc(1,sizeof(sfh)); + temp_sfh->all_num=1; + temp_sfh->all_similiar=0; + temp_sfh->sfh_str=strdup(sfh_str); + temp_sfh->sfh_link_items=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_sfh->sfh_link_items->sfh_str=strdup(sfh_str); + temp_sfh->sfh_link_items->td_ori=strdup(td_str); + // temp_sfh->sfh_link_items->md5_32k=strdup(md5_32k_str); + // temp_sfh->sfh_link_items->time_str=strdup(time_str); + temp_sfh->sfh_link_items->similiar=0; + temp_sfh->sfh_link_items->all_similiar=0; + temp_sfh->sfh_link_items->next=NULL; + ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_sfh); + assert(ret>0); + } + else + { + 
temp_similiar=GIE_sfh_similiarity(temp_sfh->sfh_str,(int)strlen(temp_sfh->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_sfh->all_similiar+=temp_similiar; + temp_sfh_link=temp_sfh->sfh_link_items; + for(temp_all_similiar=0;;temp_sfh_link=temp_sfh_link->next) + { + temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_sfh_link->all_similiar+=temp_similiar; + temp_all_similiar+=temp_similiar; + if(temp_sfh_link->all_similiar>temp_sfh->all_similiar) + { + free(temp_sfh->sfh_str); + temp_sfh->sfh_str=strdup(temp_sfh_link->sfh_str); + temp_sfh->all_similiar=temp_sfh_link->all_similiar; + } + if(temp_sfh_link->next==NULL) + { + break; + } + } + temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_sfh_link->next->sfh_str=strdup(sfh_str); + temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->time_str=strdup(time_str); + temp_sfh_link->next->similiar=0; + temp_sfh_link->next->all_similiar=temp_all_similiar; + temp_sfh_link->next->next=NULL; + temp_sfh->all_num+=1; + } + } + fclose(raw_file); + MESA_htable_iterate(htable,print_td_sfh,ripe_file); + free(sfh_str); + free(td); + free(td_str); + // free(md5_32k_str); + MESA_htable_destroy(htable,NULL); + // fclose(raw_file); + fclose(ripe_file); + return 0; +}
\ No newline at end of file diff --git a/src/get_td_mistake_lost/get_lost_rate.c b/src/get_td_mistake_lost/get_lost_rate.c new file mode 100644 index 0000000..d983a00 --- /dev/null +++ b/src/get_td_mistake_lost/get_lost_rate.c @@ -0,0 +1,210 @@ +/* +gcc -g get_lost_rate.c -o get_lost_rate -lmaatframe -I../include +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <assert.h> +#include <ctype.h> +#define BUFFER_LEN (10*1024) +#define CACHE_SIZE 2000000 +#define SFH_LEN (10*1024) +#define TD_LEN 33 +#define RESULT_NUM 10000 +#define TIME_STR_LEN 128 +#define TD_STR_LEN (10*1024) + +typedef struct cache +{ + GIE_digest_t ** GIE_cache; + long cache_size; + long len; +}cache; + +typedef struct GIE_tag +{ + char *td; + char *td_str; + char *sfh_str; +}GIE_tag; + +int main() +{ + FILE *td_sfh_file; + FILE *raw_file; + FILE *ripe_file; + const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt"; + const char *td_sfh_file_dir="../../data/ripe_data/td_data_20171207/TD_SFH_1"; + const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/get_lost_ripe_data_1"; + td_sfh_file = fopen(td_sfh_file_dir,"r+"); + raw_file = fopen(raw_file_dir,"r+"); + ripe_file = fopen(ripe_file_dir,"w+"); + char *buffer=NULL,*sfh_str=NULL,*td=NULL,*time_str=NULL,*td_str=NULL; + GIE_create_para_t *query_para=NULL; + GIE_handle_t *query_handle=NULL; + GIE_result_t *query_result = NULL; + cache *GIE_digest_cache = NULL; + GIE_digest_t *sfh_td = NULL; + int i=0,w=0,ret=0,lost=0,j=0,update=0,resultnum=0,temp_len=0; + GIE_tag *temp_tag =NULL; + if(td_sfh_file == NULL) + { + printf("open td_sfh_file_dir error\n"); + return -1; + } + if(raw_file == NULL) + { + printf("open raw_file_dir error\n"); + return -1; + } + if(ripe_file == NULL) + { + printf("open ripe_file_dir error\n"); + return -1; + } + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + sfh_str[SFH_LEN-1]='\0'; + td = (char*)calloc(TD_LEN,sizeof(char)); + td[32]='\0'; 
+ time_str = (char*)calloc(TIME_STR_LEN,sizeof(char)); + time_str[TIME_STR_LEN-1]='\0'; + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + td_str = (char*)calloc(TD_STR_LEN,sizeof(char)); + query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t)); + query_para->gram_value = 7; + query_para->position_accuracy = 5; + query_para->ED_reexamine=1; + query_para->format=GIE_INPUT_FORMAT_SFH; + query_handle=GIE_create((const GIE_create_para_t *)query_para); + free(query_para); + query_result = (GIE_result_t*)calloc(RESULT_NUM,sizeof(GIE_result_t)); + GIE_digest_cache =(cache*)calloc(1,sizeof(cache)); + GIE_digest_cache->cache_size = CACHE_SIZE; + GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*)); + GIE_digest_cache->len = 0; + if(query_handle==NULL) + { + printf("create GIE handle error\n"); + return -1; + } + while(feof(td_sfh_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,td_sfh_file); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td,td_str,sfh_str); + assert(ret==3); + td[32]='\0'; + sfh_td = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t)); + sfh_td->id=i; + temp_len=strlen(sfh_str); + sfh_td->sfh_length=temp_len; + sfh_str[temp_len-1]='\0'; + sfh_td->operation=GIE_INSERT_OPT; + sfh_td->cfds_lvl=5; + sfh_td->sfh=strdup(sfh_str); + temp_tag=(GIE_tag*)calloc(1,sizeof(GIE_tag)); + temp_tag->td=strdup(td); + temp_tag->td_str=strdup(td_str); + temp_tag->sfh_str=strdup(sfh_str); + sfh_td->tag=(void*)temp_tag; + GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_td; + GIE_digest_cache->len++; + // resultnum = GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,RESULT_NUM); + // if(resultnum==0) + // { + // sfh_td = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t)); + // sfh_td->id=i; + // sfh_td->sfh_length=strlen(sfh_str); + // sfh_td->operation=GIE_INSERT_OPT; + // sfh_td->cfds_lvl=5; + // sfh_td->sfh=strdup(sfh_str); + // 
sfh_td->tag=(void*)strdup(td); + // GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_td; + // GIE_digest_cache->len++; + // } + // else + // { + // for(j=0;j<resultnum;j++) + // { + // if(strcmp((char*)((query_result+j)->tag),td)!=0) + // { + // lost++; + // fprintf(ripe_file,"%s,%s,%s\n",(char*)((query_result+j)->tag),td,sfh_str); + // } + // } + // continue; + // } + // if(GIE_digest_cache->len==GIE_digest_cache->cache_size) + // { + // update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size); + // assert(update==GIE_digest_cache->len); + // GIE_digest_cache->len=0; + // for(j=0;j<GIE_digest_cache->cache_size;j++) + // { + // free(GIE_digest_cache->GIE_cache[j]->sfh); + // GIE_digest_cache->GIE_cache[j]->sfh=NULL; + // free(GIE_digest_cache->GIE_cache[j]); + // GIE_digest_cache->GIE_cache[j]=NULL; + // } + // } + // resultnum = GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,RESULT_NUM); + // for(i=0;i<resultnum;i++) + // { + // if(strcmp((char*)query_result[i]->tag,td)!=0) + // { + // lost++; + // } + // } + } + fclose(td_sfh_file); + update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->len); + for(j=0;j<GIE_digest_cache->len;j++) + { + free(GIE_digest_cache->GIE_cache[j]->sfh); + GIE_digest_cache->GIE_cache[j]->sfh=NULL; + free(GIE_digest_cache->GIE_cache[j]); + GIE_digest_cache->GIE_cache[j]=NULL; + } + i=0; + while(feof(raw_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + // ret=sscanf(buffer,"%[^;];%[^;]",td,sfh_str); + // assert(ret==2); + // ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %*[^;];%[^;];%*[^;];%[^;];%*[^;]",td_str,td,sfh_str); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + resultnum = GIE_query(query_handle,(const char *)sfh_str,(const long 
long)strlen(sfh_str),query_result,RESULT_NUM); + if(resultnum>1) + { + for(j=0;j<resultnum;j++) + { + if(strcmp(((GIE_tag*)(query_result+j)->tag)->td,td)!=0) + { + w=1; + fprintf(ripe_file,"%u,%s,%s,%s,%s,%s,%s\n",(query_result+j)->id,((GIE_tag*)((query_result+j)->tag))->td_str,((GIE_tag*)((query_result+j)->tag))->td,((GIE_tag*)((query_result+j)->tag))->sfh_str,td_str,td,sfh_str); + } + } + lost+=w; + w=0; + } + + } + printf("%d;%d\n",lost,i); + free(sfh_str); + free(td); + free(time_str); + free(td_str); +}
\ No newline at end of file diff --git a/src/get_td_mistake_lost/get_mistake_level.c b/src/get_td_mistake_lost/get_mistake_level.c new file mode 100644 index 0000000..5f03974 --- /dev/null +++ b/src/get_td_mistake_lost/get_mistake_level.c @@ -0,0 +1,366 @@ +/* +gcc -g get_mistake_level.c -o get_mistake_level -lMESA_htable -lmaatframe -I../../include +./get_mistake_level ../data/ripe_data/td_data_20171207/all_av_digest_mistake_level +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <MESA/MESA_htable.h> +#include <assert.h> +#include <ctype.h> +#define THREAD_SAFE 0 +#define SLOT_SIZE (1024*1024*16) +#define SIMILIAR_RATE 90 +#define TD_STR_LEN (10*1024) +#define TIME_STR_LEN 128 +#define RAODONG_RATE 0.1 +#define BUFFER_LEN (15*1024) +#define SFH_LEN (10*1024) +#define TD_LEN 33 + +typedef struct sfh_link +{ + // char *time_str; + char *sfh_str; + char *td_ori; + // char *md5_32k; + int similiar; + int all_similiar; + // long hash_len; + struct sfh_link *next; +}sfh_link; + +typedef struct mistake_sfh +{ + int mistake_num; + int all_num; + int all_similiar; + char *sfh_str; + // long hash_len; + sfh_link *sfh_link_items; +}mistake_sfh; + +typedef struct temp_parameter +{ + int mistake_num; + FILE *ripe_file; +}temp_parameter; + +long get_hashed_len(const char* sfh) +{ + char *data=(char*)malloc(strlen(sfh)+1); + memcpy(data,sfh, strlen(sfh)); + data[strlen(sfh)]='\0'; + char *token=NULL,*sub_token=NULL,*saveptr; + long left_offset=0,right_offset=0,hashed_length=0; + int ret=0,first=0; + for (token = data; ; token= NULL) + { + sub_token= strtok_r(token,"[", &saveptr); + if (sub_token == NULL) + { + break; + } + if(first==0)//jump over the first sub string. 
+ { + first=1; + continue; + } + ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset); + if(ret!=2) + { + return 0; + } + assert(ret==2); + hashed_length+=right_offset-left_offset+1; + } + //printf("hashed length=%ld\n",hashed_length); + free(data); + return hashed_length/2; +} + +void print_mistake_td(const uchar *key,uint size,void *data,void *arg) +{ + temp_parameter *parameter = (temp_parameter*)arg; + mistake_sfh *temp_mistake_sfh=(mistake_sfh*)data; + float temp_rate=0; + temp_rate=(float)temp_mistake_sfh->mistake_num/(float)temp_mistake_sfh->all_num; + if(temp_rate>RAODONG_RATE) + { + parameter->mistake_num+=temp_mistake_sfh->mistake_num; + fprintf(parameter->ripe_file,"%d;%s\n",temp_mistake_sfh->mistake_num,temp_mistake_sfh->sfh_str); + sfh_link *temp_sfh_link=temp_mistake_sfh->sfh_link_items; + for(;;temp_sfh_link=temp_sfh_link->next) + { + if(temp_sfh_link==NULL) + { + break; + } + temp_sfh_link->similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str)); + // fprintf(parameter->ripe_file,"%s,%d;%s;%s;%s\n",temp_sfh_link->time_str,temp_sfh_link->similiar,temp_sfh_link->sfh_str,temp_sfh_link->td_ori,temp_sfh_link->md5_32k); + fprintf(parameter->ripe_file,"%d;%s;%s\n",temp_sfh_link->similiar,temp_sfh_link->sfh_str,temp_sfh_link->td_ori); + } + fprintf(parameter->ripe_file,"\n"); + } +} + +int main(int argc,char *argv[]) +{ + FILE *raw_file; + FILE *ripe_file; + char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt"; + char *ripe_file_dir="../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level_3"; + char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL,*md5_32k_str=NULL,*time_str=NULL; + raw_file = fopen(raw_file_dir,"r+"); + ripe_file = fopen(ripe_file_dir,"w+"); + int i=0,thread_safe=THREAD_SAFE,ret=0,temp_mistake=0,temp_similiar=0,temp_all_similiar=0; + long temp_hash_len=0; + unsigned int slot_size=SLOT_SIZE; + mistake_sfh 
*temp_mistake_sfh=NULL; + sfh_link *temp_sfh_link=NULL; + MESA_htable_handle htable=NULL; + temp_parameter *parameter=NULL; + if(raw_file==NULL) + { + printf("open all_av_digest error\n"); + return -1; + } + + + if(ripe_file==NULL) + { + printf("open all_av_digest_mistake_level error"); + return -1; + } + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + td = (char*)calloc(TD_LEN,sizeof(char)); + td[32]='\0'; + td_str = (char*)calloc(TD_STR_LEN,sizeof(char)); + // md5_32k_str = (char*)calloc(TD_LEN,sizeof(char)); + // time_str = (char*)calloc(TIME_STR_LEN,sizeof(char)); + // time_str[TIME_STR_LEN-1]='\0'; + // md5_32k_str[32]='\0'; + htable=MESA_htable_born(); + MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int)); + MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int)); + MESA_htable_mature(htable); + parameter=(temp_parameter*)calloc(1,sizeof(temp_parameter)); + parameter->mistake_num=0; + parameter->ripe_file=ripe_file; + while(feof(raw_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %[^;];%[^;];%*[^;];%[^;];%*[^;]",time_str,td_str,md5_32k_str,td,sfh_str); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + td[32]='\0'; + // md5_32k_str[32]='\0'; + if((temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL) + { + temp_mistake_sfh=(mistake_sfh*)calloc(1,sizeof(mistake_sfh)); + temp_mistake_sfh->mistake_num=0; + temp_mistake_sfh->all_num=1; + temp_mistake_sfh->all_similiar=0; + // temp_mistake_sfh->hash_len=get_hashed_len(sfh_str); + temp_mistake_sfh->sfh_str=strdup(sfh_str); + temp_mistake_sfh->sfh_link_items=(sfh_link*)calloc(1,sizeof(sfh_link)); + 
temp_mistake_sfh->sfh_link_items->sfh_str=strdup(sfh_str); + temp_mistake_sfh->sfh_link_items->td_ori=strdup(td_str); + // temp_mistake_sfh->sfh_link_items->md5_32k=strdup(md5_32k_str); + // temp_mistake_sfh->sfh_link_items->time_str=strdup(time_str); + temp_mistake_sfh->sfh_link_items->similiar=0; + temp_mistake_sfh->sfh_link_items->all_similiar=0; + temp_mistake_sfh->sfh_link_items->next=NULL; + ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_mistake_sfh); + assert(ret>0); + } + else + { + temp_similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_mistake_sfh->all_similiar+=temp_similiar; + temp_sfh_link=temp_mistake_sfh->sfh_link_items; + for(temp_all_similiar=0;;temp_sfh_link=temp_sfh_link->next) + { + // if(GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str))<SIMILIAR_RATE) + // { + // temp_mistake=1; + // } + temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_sfh_link->all_similiar+=temp_similiar; + temp_all_similiar+=temp_similiar; + if(temp_sfh_link->all_similiar>temp_mistake_sfh->all_similiar) + { + free(temp_mistake_sfh->sfh_str); + temp_mistake_sfh->sfh_str=strdup(temp_sfh_link->sfh_str); + temp_mistake_sfh->all_similiar=temp_sfh_link->all_similiar; + } + if(temp_sfh_link->next==NULL) + { + break; + } + } + // if(temp_hash_len>temp_mistake_sfh->hash_len) + // { + // temp_mistake_sfh->hash_len=temp_hash_len; + // free(temp_mistake_sfh->sfh_str); + // temp_mistake_sfh->sfh_str=strdup(sfh_str); + // } + temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_sfh_link->next->sfh_str=strdup(sfh_str); + temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->time_str=strdup(time_str); + temp_sfh_link->next->similiar=0; + 
temp_sfh_link->next->all_similiar=temp_all_similiar; + temp_sfh_link->next->next=NULL; + temp_mistake_sfh->all_num+=1; + } + } + fclose(raw_file); + raw_file = fopen(raw_file_dir,"r+"); + if(raw_file==NULL) + { + printf("open all_av_digest error\n"); + return -1; + } + i=0; + while(feof(raw_file)==0) + { + i++; + if(i%10000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %[^;];%[^;];%*[^;];%[^;];%*[^;]",time_str,td_str,md5_32k_str,td,sfh_str); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + td[32]='\0'; + // md5_32k_str[32]='\0'; + temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN); + assert(temp_mistake_sfh!=NULL); + // if((temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL) + // { + // temp_mistake_sfh=(mistake_sfh*)calloc(1,sizeof(mistake_sfh)); + // temp_mistake_sfh->num=0; + // temp_mistake_sfh->hash_len=get_hashed_len(sfh_str); + // temp_mistake_sfh->sfh_str=strdup(sfh_str); + // temp_sfh_link=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_sfh_link->sfh_str=strdup(sfh_str); + // temp_sfh_link->td_ori=strdup(td_str); + // temp_sfh_link->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->time_str=strdup(time_str); + // temp_sfh_link->next=NULL; + // temp_mistake_sfh->sfh_link_items=temp_sfh_link; + // ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_mistake_sfh); + // assert(ret>0); + // } + // else + // { + // temp_hash_len=get_hashed_len(sfh_str); + // if(temp_hash_len>temp_mistake_sfh->hash_len) + // { + // temp_sfh_link->hash_len=get_hashed_len(); + // free(temp_sfh_link->sfh_str); + // temp_sfh_link->sfh_str=strdup(sfh_str); + // } + temp_similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),sfh_str,(int)strlen(sfh_str)); + if(temp_similiar<SIMILIAR_RATE) + { + 
temp_mistake_sfh->mistake_num+=1; + } + // if(temp_mistake_sfh->sfh_link_items!=NULL) + // { + // temp_sfh_link=temp_mistake_sfh->sfh_link_items; + // for(;;temp_sfh_link=temp_sfh_link->next) + // { + // // if(GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str))<SIMILIAR_RATE) + // // { + // // temp_mistake=1; + // // } + // if(temp_sfh_link->next==NULL) + // { + // break; + // } + // } + // temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_sfh_link->next->sfh_str=strdup(sfh_str); + // temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->time_str=strdup(time_str); + // temp_sfh_link->next->similiar=temp_similiar; + // temp_sfh_link->next->next=NULL; + // } + // else + // { + // temp_mistake_sfh->sfh_link_items=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_mistake_sfh->sfh_link_items->sfh_str=strdup(sfh_str); + // temp_mistake_sfh->sfh_link_items->td_ori=strdup(td_str); + // temp_mistake_sfh->sfh_link_items->md5_32k=strdup(md5_32k_str); + // temp_mistake_sfh->sfh_link_items->time_str=strdup(time_str); + // temp_mistake_sfh->sfh_link_items->similiar=temp_similiar; + // temp_mistake_sfh->sfh_link_items->next=NULL; + // } + // if(temp_mistake==1) + // { + // temp_mistake_sfh->num+=temp_mistake; + // temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_sfh_link->next->sfh_str=strdup(sfh_str); + // temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->next=NULL; + // temp_mistake=0; + // } + } + fclose(raw_file); + // raw_file=NULL; + // raw_file = fopen(raw_file_dir,"r+"); + // if(raw_file==NULL) + // { + // printf("open all_av_digest error\n"); + // return -1; + // } + // i=0; + // while(feof(raw_file)==0) + // { + // i++; + // if(i%10000==0) + // { + // printf("%d\n",i); + // } + // fgets(buffer,BUFFER_LEN-1,raw_file); + // 
buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %*[^;];%[^;];%*[^;];%*[^;];%*[^;]",td); + // assert(ret==1); + // if((temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN))!=NULL) + // { + // fprintf(ripe_file,"%d;%s",temp_mistake_sfh->num,buffer); + // } + // } + MESA_htable_iterate(htable,print_mistake_td,(void*)parameter); + printf("%d,%d\n",parameter->mistake_num,i); + free(buffer); + free(sfh_str); + free(td); + free(td_str); + // free(md5_32k_str); + MESA_htable_destroy(htable,NULL); + // fclose(raw_file); + fclose(ripe_file); + return 0; +}
\ No newline at end of file diff --git a/src/get_td_mistake_lost/get_td_mistake_lost.sh b/src/get_td_mistake_lost/get_td_mistake_lost.sh new file mode 100644 index 0000000..7c851b8 --- /dev/null +++ b/src/get_td_mistake_lost/get_td_mistake_lost.sh @@ -0,0 +1,5 @@ +#!/bin/bash +python new_TD.py +./get_mistake_level +./get_TD_SFH +./get_lost_rate diff --git a/src/get_td_mistake_lost/gram_index_engine.c b/src/get_td_mistake_lost/gram_index_engine.c new file mode 100644 index 0000000..0f503db --- /dev/null +++ b/src/get_td_mistake_lost/gram_index_engine.c @@ -0,0 +1,1354 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<math.h> +#include<assert.h> +#include<MESA/MESA_htable.h> +#include<unistd.h> + +#include "gram_index_engine.h" +#include "queue.h" + +#define HTABLE_SIZE 1024 *1024 +#define GRAM_CNT_MAX 2 +#define GRAM_MAX 128 +#define TOLERENCE_SIZE 0 +#define UNION_INIT_SIZE 1000 +#define BLOCKSIZE_MIN 3 +#define MEM_OCCUPY 1 +#define CNT_MAX 10 +#define GRAM_CNT_THRESHOLD 10 +#define QUERY_LEN_ACCURACY 0.1 +#define HTABLE_NUM 8 +//#define GIE_INPUT_FORMAT_SFH 1 +//#define GIE_INPUT_FORMAT_PLAIN 0 +#define MAX_LENGTH 10000 +#define KEY_MAX_LENGTH 10 +#define EDIT_DISTN_INSERT_COST 1 +#define EDIT_DISTN_REMOVE_COST 1 +#define EDIT_DISTN_REPLACE_COST 2 +#define MIN(x,y) ((x)<(y)?(x):(y)) + +int before(unsigned int off1, unsigned int off2) +{ + return (signed int)(off1-off2)<0; +} +#define after(off2,off1) before(off1,off2) + +typedef struct +{ + unsigned int user_gram_value; + unsigned int user_position_accuracy; + short ED_reexamine; + short input_format; + MESA_htable_handle id_table; + MESA_htable_handle index_table[HTABLE_NUM]; + unsigned long long mem_occupy; + unsigned long long hash_cnt; +}GIE_handle_inner_t; + + +struct linklist_node +{ + short * position; + struct id_table_data * basicinfo; + short size; + short index; + unsigned long long blocksize; + TAILQ_ENTRY(linklist_node) listentry; +}; + + +struct index_table_data +{ + struct TQ 
* listhead; + int cnt; +}; + + +struct id_table_data +{ + unsigned int id; + short sfh_length; + short gram_cnt; + unsigned long long blocksize; + char * sfh; + void * tag; + char cfds_lvl; +}; + + +struct htable_handle +{ + MESA_htable_handle runtime_table; + MESA_htable_handle para; +}; + +struct key_list_node +{ + char * key; + int digest_id; + int pos; + unsigned long long blocksize; + TAILQ_ENTRY(key_list_node) keylistentry; +}; + + +unsigned long long hash_cnt; +unsigned long long cnt_sum; + +TAILQ_HEAD(TQ, linklist_node); +TAILQ_HEAD(KL, key_list_node); + +void idtable_free(void * data); +void indextable_free(void * data); +int key_compare(const uchar * key1, uint size1, const uchar * key2, uint size2); +int GIE_insert_indextable(MESA_htable_handle handle, struct id_table_data * info, char * key, unsigned int index,unsigned long long blocksize); + +int GIE_delete_from_indextable_by_key(MESA_htable_handle handle, char * key, unsigned int id); +int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t * digest); +int GIE_cmp(const void * a, const void * b); +inline unsigned int get_real_length(const char * string, unsigned int length); +void print_item_iterate(const uchar * key, unsigned int size, void * data, void * user); +inline unsigned long long calc_fh_blocksize(unsigned long long orilen); +inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, unsigned int str_len); + +MESA_htable_handle copy_htable(void * htable_para,void (* func)(const uchar * key, uint size, void * data, void *user),void (*free_fuc)(void * data)); +void copy_idtable_item_iterate(const uchar * key, uint size, void * data, void * user); +void copy_indextable_item_iterate(const uchar * key, uint size, void * data, void * user); + +GIE_handle_t * GIE_create(const GIE_create_para_t * para) +{ + int i = 0; + GIE_handle_inner_t * handle = (GIE_handle_inner_t *)calloc(1, sizeof(GIE_handle_inner_t)); + handle->mem_occupy = 0; + handle->mem_occupy += 
sizeof(GIE_handle_inner_t); + + handle->user_gram_value = para->gram_value; + handle->user_position_accuracy = para->position_accuracy; + handle->input_format = para->format; + //handle->user_cmp = GIE_INPUT_FORMAT_PLAIN; + handle->ED_reexamine = para->ED_reexamine; + handle->hash_cnt = 0; + + + MESA_htable_create_args_t idtable_args,indextable_args[HTABLE_NUM]; + memset(&idtable_args, 0, sizeof(idtable_args)); + idtable_args.thread_safe = 0; + idtable_args.hash_slot_size = HTABLE_SIZE; + idtable_args.max_elem_num = 0; + idtable_args.expire_time = 0; + idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO; + idtable_args.key_comp = NULL; + idtable_args.key2index = NULL; + idtable_args.data_free = idtable_free; + idtable_args.data_expire_with_condition = NULL; + idtable_args.recursive = 0; + handle->id_table = MESA_htable_create(&idtable_args, sizeof(idtable_args)); + + for(i = 0;i < HTABLE_NUM;i++) + { + memset(&indextable_args[i], 0, sizeof(indextable_args[i])); + indextable_args[i].thread_safe = 0; + indextable_args[i].hash_slot_size = HTABLE_SIZE; + indextable_args[i].max_elem_num = 0; + indextable_args[i].expire_time = 0; + indextable_args[i].eliminate_type = HASH_ELIMINATE_ALGO_FIFO; + indextable_args[i].key_comp = key_compare; + indextable_args[i].key2index = NULL; + indextable_args[i].data_free = indextable_free; + indextable_args[i].data_expire_with_condition = NULL; + indextable_args[i].recursive = 0; + handle->index_table[i] = MESA_htable_create(&indextable_args[i], sizeof(indextable_args[i])); + } + + return (GIE_handle_t *)(handle); +} + +int key_compare(const uchar * key1, uint size1, const uchar * key2, uint size2) +{ + return ( (*(long*)key1) - (*(long*)key2)); +} + + +void idtable_free(void * data) +{ + struct id_table_data * tmp = (struct id_table_data *)data; + free(tmp->sfh); + tmp->sfh = NULL; + tmp->tag = NULL; + free(tmp); + tmp = NULL; + + return; +} + +void indextable_delete_with_threshold(MESA_htable_handle * htable_handle, struct 
index_table_data * tmp, char * key) +{ + int key_length = strnlen(key,KEY_MAX_LENGTH); + struct linklist_node * tmp_node = TAILQ_FIRST(tmp->listhead); + while(tmp_node != NULL) + { + struct linklist_node * linklist_tmp = TAILQ_NEXT(tmp_node,listentry); + if(tmp_node->basicinfo->gram_cnt <= GRAM_CNT_THRESHOLD) + { + tmp_node = linklist_tmp; + continue; + } + TAILQ_REMOVE(tmp->listhead, tmp_node, listentry); + tmp_node->basicinfo->gram_cnt--; + tmp->cnt--; + if(TAILQ_EMPTY(tmp->listhead) == 1) + { + //_handle->hash_cnt--; + //_handle->mem_occupy -= (sizeof(struct index_table_data) + sizeof(struct TQ)); + if(MESA_htable_del(htable_handle, (const uchar *)(key), key_length, indextable_free) < 0) + { + printf("indextable backtrack delete error!\n"); + assert(0); + return; + } + } + //_handle->mem_occupy -= (sizeof(struct linklist_node) + sizeof(short)*(tmp_node->size)); + free(tmp_node->position); + tmp_node->position = NULL; + free(tmp_node); + tmp_node = NULL; + tmp_node = linklist_tmp; + + } + return; +} + + +void indextable_free(void * data) +{ + struct index_table_data * tmp = (struct index_table_data *)data; + struct linklist_node * tmp_node = TAILQ_FIRST(tmp->listhead); + while(tmp_node != NULL) + { + struct linklist_node * linklist_tmp = TAILQ_NEXT(tmp_node, listentry); + TAILQ_REMOVE(tmp->listhead, tmp_node, listentry); + tmp->cnt--; + free(tmp_node->position); + tmp_node->position = NULL; + free(tmp_node); + tmp_node = NULL; + tmp_node = linklist_tmp; + } + free(tmp->listhead); + tmp->listhead = NULL; + free(tmp); + tmp = NULL; + return; +} + + +void indextable_free_cnt(void * data) +{ + struct index_table_data * tmp = (struct index_table_data *)data; + hash_cnt++; + cnt_sum += tmp->cnt; + struct linklist_node * tmp_node = TAILQ_FIRST(tmp->listhead); + while(tmp_node != NULL) + { + struct linklist_node * linklist_tmp = TAILQ_NEXT(tmp_node, listentry); + TAILQ_REMOVE(tmp->listhead, tmp_node, listentry); + tmp->cnt--; + free(tmp_node->position); + 
tmp_node->position = NULL; + free(tmp_node); + tmp_node = NULL; + tmp_node = linklist_tmp; + } + free(tmp->listhead); + tmp->listhead = NULL; + free(tmp); + tmp = NULL; + return; +} + +void print_item_iterate_idtable(const uchar * key, uint size, void * data, void * user) +{ + struct id_table_data * id_data = (struct id_table_data *)data; + printf("id:%u\n",id_data->id); +} + + + +void print_item_iterate(const uchar * key, uint size, void * data, void * user) +{ + struct index_table_data * index_data = (struct index_table_data *)data; + printf("%s %d\n", (char *)key, index_data->cnt); + struct linklist_node * tmp_node = NULL; + int i = 0; + TAILQ_FOREACH(tmp_node, index_data->listhead, listentry) + { + printf("id = %u\n",tmp_node->basicinfo->id); + printf("position is :\n"); + for(i = 0;i < tmp_node->index;i++) + { + printf("%d ",tmp_node->position[i]); + } + printf("\n"); + } + printf("\n"); +} + +int edit_distn(const char *s1, int s1len, const char *s2, int s2len) +{ + long int max_len = 0; + if(s1len >= s2len) + { + max_len = s1len; + } + else + { + max_len = s2len; + } + int **t = (int **)malloc(2*sizeof(int *)); + t[0] = (int *)malloc((max_len +1)*sizeof(int)); + t[1] = (int *)malloc((max_len +1)*sizeof(int)); + //int t[2][EDIT_DISTN_MAXLEN+1]; + int *t1 = t[0]; + int *t2 = t[1]; + int *t3; + size_t i1, i2; + for (i2 = 0; i2 <= s2len; i2++) + t[0][i2] = i2 * EDIT_DISTN_REMOVE_COST; + for (i1 = 0; i1 < s1len; i1++) { + t2[0] = (i1 + 1) * EDIT_DISTN_INSERT_COST; + for (i2 = 0; i2 < s2len; i2++) { + int cost_a = t1[i2+1] + EDIT_DISTN_INSERT_COST; + int cost_d = t2[i2] + EDIT_DISTN_REMOVE_COST; + int cost_r = t1[i2] + (s1[i1] == s2[i2] ? 
0 : EDIT_DISTN_REPLACE_COST); + t2[i2+1] = MIN(MIN(cost_a, cost_d), cost_r); + } + t3 = t1; + t1 = t2; + t2 = t3; + } + long int ret = t1[s2len]; + free(t[0]); + free(t[1]); + free(t); + return ret; + //return t1[s2len]; +} + + +void GIE_destory(GIE_handle_t * handle) +{ + GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle); + //printf("hash_cnt:%llu\n",_handle->hash_cnt); + //printf("mem_occupy:%llu\n",_handle->mem_occupy); + int i = 0; + for(i = 0;i < HTABLE_NUM;i++) + { + MESA_htable_destroy(_handle->index_table[i], indextable_free_cnt); + } + MESA_htable_destroy(_handle->id_table, idtable_free); + //printf("index_free hash_cnt :%llu\n", hash_cnt); + //printf("cnt sum :%llu\n",cnt_sum); + free(_handle); + _handle = NULL; +} + + +int grab_key_set(char * str_begin,short str_length,int i,unsigned int gram_value,short * gram_cnt,struct KL** to_process_list, unsigned long long blocksize) +{ + int k = 0,j = 0; + char * tmp_gram = str_begin; + char key[gram_value+1]; + int sum = 0,htable_index = 0; + if(str_length < gram_value) + { + return 0; + } + str_length = MIN(str_length,strnlen(str_begin,str_length)); + *gram_cnt = str_length - gram_value + 1; + //printf("str_length:%d\n",str_length); + for(k = 0; k < str_length - gram_value + 1; k++) + { + sum = 0; + memset(key,'\0', gram_value+1); + memcpy(key, tmp_gram++, gram_value); + //printf("k:%d key:%s\n",k,key); + for(j = 0; j < gram_value; j++) + { + sum += key[j]; + } + htable_index = sum%HTABLE_NUM; + struct key_list_node *tmp_node = (struct key_list_node *)calloc(1,sizeof(struct key_list_node)); + tmp_node->key = (char *)calloc(gram_value+1,sizeof(char)); + memcpy(tmp_node->key,key,gram_value); + tmp_node->digest_id = i; + tmp_node->pos = k; + tmp_node->blocksize = blocksize; + TAILQ_INSERT_TAIL(to_process_list[htable_index], tmp_node, keylistentry); + } + return 1; +} +int sfh_grab_key_set(char *sfh,short sfh_length,int i,unsigned int gram_value,short * gram_cnt,struct KL** to_process_list) +{ + int t = 
0; + char * tmp_gram = sfh; + unsigned long long blocksize = 0; + for(t = 0; t < 2;t++) + { + blocksize = get_blocksize_from_head(tmp_gram, sfh_length); + while(*tmp_gram != '\0') + { + if(*tmp_gram == ':') + { + tmp_gram++; + break; + } + tmp_gram++; + } + unsigned int real_length = get_real_length(tmp_gram, sfh_length); + if(real_length < gram_value) + { + if(t==0) + { + return 0; + } + else + { + continue; + } + } + grab_key_set(tmp_gram, real_length, i, gram_value, gram_cnt, to_process_list, blocksize); + while(*tmp_gram != '\0') + { + if(*tmp_gram == '#') + { + tmp_gram++; + break; + } + tmp_gram++; + } + } + return 1; +} + +void free_key_set(struct KL ** to_process_list,int size) +{ + int i = 0; + for(i = 0;i < size;i++) + { + struct key_list_node *tmp_node = TAILQ_FIRST(to_process_list[i]); + while(tmp_node != NULL) + { + struct key_list_node *key_list_tmp = TAILQ_NEXT(tmp_node, keylistentry); + TAILQ_REMOVE(to_process_list[i], tmp_node, keylistentry); + free(tmp_node->key); + tmp_node->key = NULL; + free(tmp_node); + tmp_node = NULL; + tmp_node = key_list_tmp; + } + free(to_process_list[i]); + to_process_list[i]= NULL; + } +} + +int GIE_update(GIE_handle_t * handle,GIE_digest_t * * digests,int size) +{ + GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle); + struct id_table_data * info = NULL; + int success_cnt = 0; + int m = 0, i = 0, grab_ret = 0; + short gram_cnt = 0; + unsigned int input_fh_len = 0; + unsigned int gram_value = _handle->user_gram_value; + struct KL* to_process_list[HTABLE_NUM]; + + MESA_htable_handle htable_index_copy; + MESA_htable_handle htable_id_copy; + MESA_htable_handle htable_tmp_index=NULL,htable_tmp_id=NULL; + struct htable_handle * htable_copied_id_para = (struct htable_handle *)calloc(1,sizeof(struct htable_handle)); + struct htable_handle * htable_copied_index_para = (struct htable_handle *)calloc(1,sizeof(struct htable_handle)); + + htable_copied_id_para->runtime_table = _handle->id_table; + 
htable_copied_id_para->para = NULL; + htable_id_copy = copy_htable((void *)htable_copied_id_para, copy_idtable_item_iterate,idtable_free); + + MESA_htable_handle garbage_htable[HTABLE_NUM]; + /*if(MESA_htable_iterate(htable_id_copy, print_item_iterate_idtable, NULL) == -1) + { + printf("iterate error!\n"); + } + printf("size:%u\n",id_size);*/ + + for(m = 0;m < HTABLE_NUM;m++) + { + to_process_list[m]=(struct KL*)calloc(1,sizeof(struct KL)); + TAILQ_INIT(to_process_list[m]); + } + + for(i = 0; i < size; i++) + { + switch(digests[i]->operation) + { + case GIE_INSERT_OPT: + { + assert(digests[i]->tag!=NULL); + if(_handle->input_format == GIE_INPUT_FORMAT_SFH) + { + grab_ret = sfh_grab_key_set(digests[i]->sfh,digests[i]->sfh_length,i,gram_value,&gram_cnt,to_process_list); + } + else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN) + { + + grab_ret = grab_key_set(digests[i]->sfh,digests[i]->sfh_length,i,gram_value,&gram_cnt,to_process_list,0); + } + if(grab_ret == 0) + { + continue; + } + else + { + info = (struct id_table_data *)calloc(1,sizeof(struct id_table_data)); + input_fh_len = digests[i]->sfh_length; + info->sfh = (char *)calloc(input_fh_len + 1,sizeof(char)); + memcpy(info->sfh, digests[i]->sfh, input_fh_len); + _handle->mem_occupy += sizeof(struct id_table_data) + sizeof(char)*(input_fh_len+1); + info->sfh_length = digests[i]->sfh_length; + info->gram_cnt = gram_cnt; + + /*int tag_len = strnlen(digests[i]->tag,MAX_LENGTH); + info->tag = (char *)calloc(tag_len+1,sizeof(char)); + memcpy(info->tag,digests[i]->tag,tag_len);*/ + info->tag = digests[i]->tag; + + info->id = digests[i]->id; + info->cfds_lvl = digests[i]->cfds_lvl; + if(_handle->input_format == GIE_INPUT_FORMAT_SFH) + { + info->blocksize = get_blocksize_from_head(digests[i]->sfh, digests[i]->sfh_length); + } + else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN) + { + info->blocksize = 0; + } + + if(MESA_htable_add(htable_id_copy, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), 
(const void *)info) < 0) + { + _handle->mem_occupy -= (sizeof(struct id_table_data) + sizeof(char)*(input_fh_len+1)); + free(info->sfh); + info->sfh = NULL; + free(info); + info = NULL; + continue; + } + } + success_cnt ++; + break; + } + + case GIE_DELETE_OPT: + { + + struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(htable_id_copy, \ + (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id)); + if(ret!= NULL) + { + if(_handle->input_format == GIE_INPUT_FORMAT_SFH) + { + success_cnt += sfh_grab_key_set(ret->sfh,ret->sfh_length,i,gram_value,&gram_cnt,to_process_list); + } + else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN) + { + + success_cnt += grab_key_set(ret->sfh,ret->sfh_length,i,gram_value,&gram_cnt,to_process_list,0); + } + } + else + { + break; + } + if(MESA_htable_del(htable_id_copy, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free) < 0) + { + printf("delete id failed!"); + assert(0); + } + //success_cnt += GIE_delete(_handle, digests[i]); + break; + } + + default: + break; + } + + } + unsigned int digest_id = 0; + struct id_table_data * tmp_info= NULL; + + for(i = 0;i < HTABLE_NUM;i++) + { + htable_copied_index_para->runtime_table = _handle->index_table[i]; + htable_copied_index_para->para = htable_id_copy; + htable_index_copy = copy_htable((void *)htable_copied_index_para,copy_indextable_item_iterate,indextable_free); + struct key_list_node * tmp_node; + TAILQ_FOREACH(tmp_node, to_process_list[i], keylistentry) + { + digest_id = tmp_node->digest_id; + if(digests[digest_id]->operation == GIE_INSERT_OPT) + { + tmp_info =(struct id_table_data *)MESA_htable_search(htable_id_copy, (const uchar *)(&(digests[digest_id])->id), \ + sizeof((digests[digest_id])->id)); + if(tmp_info == NULL) + { + printf("id %u not insert\n",digests[digest_id]->id); + } + if(GIE_insert_indextable(htable_index_copy, tmp_info, tmp_node->key, tmp_node->pos,tmp_node->blocksize) < 0) + { + printf("insert %d indextable 
failed!\n",digests[digest_id]->id); + continue; + } + } + else if(digests[digest_id]->operation == GIE_DELETE_OPT) + { + if(GIE_delete_from_indextable_by_key(htable_index_copy, tmp_node->key, (digests[digest_id])->id) < 0) + { + printf("delete %d indextable failed!\n",digests[digest_id]->id); + continue; + } + } + } + htable_tmp_index= _handle->index_table[i]; + _handle->index_table[i] = htable_index_copy; + garbage_htable[i]=htable_tmp_index; + } + + htable_tmp_id = _handle->id_table; + _handle->id_table = htable_id_copy; + usleep(200); + MESA_htable_destroy(htable_tmp_id, idtable_free); + /*if(MESA_htable_iterate(_handle->index_table, print_item_iterate, NULL) == -1) + { + printf("iterate error!\n"); + }*/ + for(i=0;i<HTABLE_NUM;i++) + { + MESA_htable_destroy(garbage_htable[i], indextable_free_cnt); + + } + free_key_set(to_process_list,HTABLE_NUM); + free(htable_copied_id_para); + htable_copied_id_para = NULL; + free(htable_copied_index_para); + htable_copied_index_para = NULL; + return success_cnt; +} + + +MESA_htable_handle copy_htable(void * htable_para,void (* func)(const uchar * key, uint size, void * data, void *user),void (*free_fuc)(void * data)) +{ + MESA_htable_create_args_t copy_table_args; + memset(©_table_args, 0, sizeof(copy_table_args)); + copy_table_args.thread_safe = 0; + copy_table_args.hash_slot_size = HTABLE_SIZE; + copy_table_args.max_elem_num = 0; + copy_table_args.expire_time = 0; + copy_table_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO; + copy_table_args.key_comp = NULL; + copy_table_args.key2index = NULL; + copy_table_args.data_free = free_fuc; + copy_table_args.data_expire_with_condition = NULL; + copy_table_args.recursive = 0; + MESA_htable_handle copy_htable_handle = MESA_htable_create(©_table_args, sizeof(copy_table_args)); + + struct htable_handle * htable_copied_para = (struct htable_handle *)htable_para; + struct htable_handle * htable_iterate_para = (struct htable_handle *)calloc(1,sizeof(struct htable_handle)); + 
htable_iterate_para->runtime_table = copy_htable_handle; + htable_iterate_para->para = htable_copied_para->para; + + if(MESA_htable_iterate(htable_copied_para->runtime_table, func, htable_iterate_para) == -1) + { + printf("iterate error!\n"); + } + free(htable_iterate_para); + htable_copied_para=NULL; + return copy_htable_handle; +} + +void copy_indextable_item_iterate(const uchar * key, uint size, void * data, void * user) +{ + struct index_table_data * index_data = (struct index_table_data *)data; + struct htable_handle * htable_copied_para = (struct htable_handle *)user; + + struct index_table_data * index_data_copy = (struct index_table_data *)calloc(1, sizeof(struct index_table_data)); + struct TQ * head = (struct TQ *)calloc(1, sizeof(struct TQ)); + index_data_copy->listhead = head; + index_data_copy->cnt = index_data->cnt; + + TAILQ_INIT(head); + struct linklist_node * tmp_node = NULL; + struct id_table_data * ret = NULL; + int i = 0; + + TAILQ_FOREACH(tmp_node, index_data->listhead, listentry) + { + struct linklist_node * node_data = (struct linklist_node *)calloc(1,sizeof(struct linklist_node)); + node_data->size = tmp_node->size; + node_data->position = (short *)calloc(node_data->size, sizeof(short)); + for(i = 0;i < tmp_node->index;i++) + { + node_data->position[i] = tmp_node->position[i]; + } + ret = (struct id_table_data *)MESA_htable_search(htable_copied_para->para, (const uchar *)(&(tmp_node->basicinfo->id)), sizeof(tmp_node->basicinfo->id)); + if(ret == NULL) + { + //printf("copy id %u not exist\n",tmp_node->basicinfo->id); + free(node_data->position); + node_data->position = NULL; + free(node_data); + node_data = NULL; + continue; + } + node_data->basicinfo = ret; + node_data->index = tmp_node->index; + node_data->blocksize = tmp_node->blocksize; + TAILQ_INSERT_TAIL(head, node_data, listentry); + } + MESA_htable_add(htable_copied_para->runtime_table, key, size, (const void *)index_data_copy); +} +//TODO: Using the orginal value instead of make a 
duplication to be faster. +void copy_idtable_item_iterate(const uchar * key, uint size, void * data, void * user) +{ + struct id_table_data * id_data = (struct id_table_data *)data; + struct htable_handle * htable_para = (struct htable_handle *)user; + struct id_table_data * id_data_copy = (struct id_table_data *)calloc(1, sizeof(struct id_table_data)); + assert(id_data->tag!=NULL); + memcpy(id_data_copy,id_data,sizeof(struct id_table_data)); + id_data_copy->sfh = (char *)calloc(id_data_copy->sfh_length,sizeof(char)); + memcpy(id_data_copy->sfh,id_data->sfh,id_data_copy->sfh_length); + + MESA_htable_add(htable_para->runtime_table, (const uchar *)(&(id_data_copy->id)), sizeof(id_data_copy->id), (const void *)id_data_copy); +} + + + + +int GIE_insert_indextable(MESA_htable_handle htable_copy, struct id_table_data * info, char * key, unsigned int index, unsigned long long blocksize) +{ + int key_length = strnlen(key,KEY_MAX_LENGTH); + struct linklist_node * node_data = (struct linklist_node *)calloc(1,sizeof(struct linklist_node)); + node_data->size = GRAM_CNT_MAX; + node_data->position = (short *)calloc(node_data->size, sizeof(short)); + node_data->basicinfo = info; + node_data->index = 0; + node_data->position[(node_data->index)++] = index; + node_data->blocksize = blocksize; + + //_handle->mem_occupy += sizeof(struct linklist_node) + sizeof(short)*(node_data->size); + + struct index_table_data * ret = (struct index_table_data *)(MESA_htable_search(htable_copy, \ + (const uchar *)(key), key_length)); + + + if(ret != NULL) + { + struct linklist_node * tmp = NULL; + TAILQ_FOREACH(tmp, ret->listhead, listentry) + { + if(tmp->basicinfo->id > node_data->basicinfo->id) + { + TAILQ_INSERT_BEFORE(tmp, node_data, listentry); + ret->cnt ++; + if(ret->cnt >= CNT_MAX) + { + indextable_delete_with_threshold(htable_copy,ret,key); + } + return 0; + } + if(tmp->basicinfo->id == node_data->basicinfo->id && tmp->blocksize == blocksize) + { + if(tmp->index >= tmp->size) + { + 
tmp->size *= 2; + tmp->position = realloc(tmp->position, (tmp->size)*sizeof(short)); + } + tmp->position[(tmp->index)++] = index; + //_handle->mem_occupy -= (sizeof(struct linklist_node) + sizeof(short)*(node_data->size)); + free(node_data->position); + node_data->position = NULL; + free(node_data); + node_data = NULL; + return 0; + } + } + TAILQ_INSERT_TAIL(ret->listhead, node_data, listentry); + ret->cnt ++; + if(ret->cnt >= CNT_MAX) + { + indextable_delete_with_threshold(htable_copy,ret,key); + } + } + + else + { + struct index_table_data * index_data = (struct index_table_data *)calloc(1, sizeof(struct index_table_data)); + struct TQ * head = (struct TQ *)calloc(1, sizeof(struct TQ)); + //_handle->mem_occupy += sizeof(struct index_table_data) + sizeof(struct TQ); + + index_data->listhead = head; + index_data->cnt = 0; + + TAILQ_INIT(head); + TAILQ_INSERT_TAIL(head, node_data, listentry); + index_data->cnt++; + //_handle->hash_cnt++; + if(MESA_htable_add(htable_copy, (const uchar *)(key), key_length, (const void *)index_data) < 0) + { + printf("add index_table failed!\n"); + assert(0); + return -1; + } + } + return 0; + +} + + + +int GIE_delete(GIE_handle_inner_t * _handle, GIE_digest_t * digest) +{ + int success_cnt = 0; + struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(_handle->id_table, \ + (const uchar *)(&(digest->id)), sizeof(digest->id)); + if(ret == NULL) + { + printf("del %d doesn't exist!\n",digest->id); + return -1; + } + else + { + int gram_value = _handle->user_gram_value; + char key[gram_value+1]; + char * tmp_gram = ret->sfh; + while(*tmp_gram != '\0') + { + if(*tmp_gram == ':') + { + tmp_gram++; + break; + } + tmp_gram++; + } + unsigned int real_length = get_real_length(tmp_gram, ret->sfh_length); + int gram_cnt = real_length - gram_value + 1; + int k = 0; + for(k = 0; k < gram_cnt; k++) + { + memset(key, '\0', gram_value+1); + memcpy(key, tmp_gram++, gram_value); + if(GIE_delete_from_indextable_by_key(_handle, key, 
digest->id) < 0) + { + printf("delete %d indextable failed!\n",digest->id); + continue; + } + } + success_cnt++; + } + + return success_cnt; +} + + + +int GIE_delete_from_indextable_by_key(MESA_htable_handle htable, char * key, unsigned int id) +{ + int key_length = strnlen(key,KEY_MAX_LENGTH); + struct index_table_data * ret = (struct index_table_data *)(MESA_htable_search(htable, \ + (const uchar *)(key), key_length)); + if(ret == NULL) + { + return 0; + } + + + struct linklist_node * tmp = TAILQ_FIRST(ret->listhead); + while(tmp != NULL) + { + struct linklist_node * linklist_tmp = TAILQ_NEXT(tmp, listentry); + if(tmp->basicinfo->id != id) + { + tmp=linklist_tmp; + continue; + } + TAILQ_REMOVE(ret->listhead, tmp, listentry); + ret->cnt--; + //_handle->mem_occupy -= (sizeof(struct linklist_node) + sizeof(short)*(tmp->size)); + free(tmp->position); + tmp->position = NULL; + free(tmp); + tmp = NULL; + if(TAILQ_EMPTY(ret->listhead) == 1) + { + //_handle->mem_occupy -= (sizeof(struct index_table_data) + sizeof(struct TQ)); + int ret = MESA_htable_del(htable, (const uchar *)(key), key_length, indextable_free); + if(ret < 0) + { + printf("indextable backtrack delete error!\n"); + assert(0); + return -1; + } + + } + } + return 0; +} + + + + +int GIE_cmp(const void * a, const void * b) +{ + unsigned int tmp_a = *(unsigned int *)a; + unsigned int tmp_b = *(unsigned int *)b; + if(before(tmp_a, tmp_b)) + { + return -1; + } + else if(after(tmp_a, tmp_b)) + { + return 1; + } + else + { + return 0; + } +} + + +inline unsigned int get_real_length(const char * string, unsigned int length) +{ + unsigned int ret = 0; + const char * tmp_str = string; + while(*tmp_str != '\0') + { + if(*tmp_str == '[') + { + break; + } + tmp_str++; + ret ++; + } + return ret; +} + + +inline int GIE_part_query(GIE_handle_inner_t * _handle, const char * query_string, int index_begin, int part_query_len,unsigned int ** id_union, unsigned int * union_index, unsigned int * union_size, unsigned long long 
blocksize) +{ + unsigned int gram_value = _handle->user_gram_value; + + unsigned int real_length = part_query_len; + unsigned int chunk_count_max = 0; + if(real_length < gram_value) + { + return 0; + } + else + { + chunk_count_max = real_length/gram_value; + } + char key[gram_value+1]; + struct index_table_data * ret = NULL; + struct linklist_node * tmp_node_t = NULL; + + unsigned int position_accuracy = _handle->user_position_accuracy; + + int i=0,j=0,k=0; + unsigned int tmp_min = 0; + int sum = 0, htable_index = 0; + for(i = index_begin; i < chunk_count_max + index_begin; i++) + { + sum = 0; + memset(key,'\0',gram_value+1); + memcpy(key, query_string, gram_value); + for(k = 0; k < gram_value; k++) + { + sum += key[k]; + } + htable_index = sum%HTABLE_NUM; + ret = (struct index_table_data *) MESA_htable_search(_handle->index_table[htable_index], \ + (const uchar *)(key), strnlen(key,gram_value)); + query_string = query_string + gram_value; + + if(ret ==NULL) + { + break; + } + + tmp_node_t = NULL; + TAILQ_FOREACH(tmp_node_t, ret->listhead, listentry) + { + tmp_min = 0; + if(i*gram_value >= position_accuracy) + { + tmp_min = i*gram_value - position_accuracy; + } + for(j = 0; j < tmp_node_t->index; j++) + { + if((blocksize == tmp_node_t->basicinfo->blocksize) && (tmp_node_t->position[j] >= tmp_min) && (tmp_node_t->position[j] <= i*gram_value + position_accuracy)) + //if(blocksize == tmp_node_t->basicinfo->blocksize) + { + if((*union_index) >= (*union_size)) + { + *union_size = (*union_size) * 2; + *id_union = (unsigned int *)realloc(*id_union, (*union_size)*sizeof(unsigned int)); + } + (*id_union)[(*union_index)] = tmp_node_t->basicinfo->id; + (*union_index)++; + break; + } + } + } + } + return chunk_count_max; +} + +inline int GIE_gram_with_position(GIE_handle_inner_t * _handle, unsigned long long query_blocksize, const char * fuzzy_string, unsigned int ** id_union, + unsigned int * union_index,unsigned int * union_size, unsigned int * chunk_cnt) +{ + const char * 
/*
 * Parse the decimal block size that precedes the first ':' of a fuzzy-hash
 * segment ("<blocksize>:<hash>...").
 *
 * fuzzy_string - start of the segment.
 * str_len      - maximum number of bytes that may be read from fuzzy_string.
 *
 * Reading stops at ':', at a NUL, after str_len bytes, or when the local
 * buffer is full. Returns the parsed value, or 0 when no digits are found.
 *
 * Fixes over the previous version:
 *  - the copy loop allowed all 100 bytes of blk to be filled, leaving no
 *    terminator for the conversion; one byte is now reserved for '\0';
 *  - atoi() is undefined for values that do not fit in int although the
 *    result is an unsigned long long; strtoull() converts the full range and
 *    yields the same result for everything atoi() could represent;
 *  - a NULL string yields 0 instead of a crash.
 *
 * The definition is deliberately not marked `inline`: with only inline
 * declarations C99 and later emit no external definition, so a call that is
 * not inlined (e.g. at -O0) fails to link.
 */
unsigned long long get_blocksize_from_head(const char * fuzzy_string, unsigned int str_len)
{
    char blk[100];
    size_t used = 0;

    if(fuzzy_string == NULL)
    {
        return 0;
    }

    while(used < sizeof(blk) - 1 && str_len != 0
          && fuzzy_string[used] != '\0' && fuzzy_string[used] != ':')
    {
        blk[used] = fuzzy_string[used];
        used++;
        str_len--;
    }
    blk[used] = '\0';

    return strtoull(blk, NULL, 10);
}
+ return conf; +} + +int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2) +{ + int j = 0, t = 0; + unsigned long long query_blocksize = 0, index_blocksize = 0; + unsigned int query_real_length = 0, index_real_length = 0; + const char *query_gram_begin = sfh1; + const char *index_gram_begin = sfh2; + char *splice_str = (char *)malloc(sizeof(char)*len1); + memset(splice_str,'\0',len1); + char *spli_str_begin = splice_str; + int edit_distance = 0; + int ret = 0; + char *p = NULL; + int splice_len = 0; + + for(j = 0; j < 2; j++) + { + index_blocksize = get_blocksize_from_head(index_gram_begin, len2); + while((*index_gram_begin) != '\0') + { + if((*index_gram_begin) == ':') + { + index_gram_begin++; + break; + } + index_gram_begin++; + } + index_real_length = get_real_length(index_gram_begin, len2); + query_gram_begin = sfh1; + for(t = 0; t < 2; t++) + { + query_blocksize = get_blocksize_from_head(query_gram_begin, len1); + //printf("gram_begin:%c\n",*index_gram_begin); + //printf("gram_str:%s\n",index_gram_begin); + while((*query_gram_begin) != '\0') + { + if((*query_gram_begin) == ':') + { + query_gram_begin++; + break; + } + query_gram_begin++; + } + //printf("query_blocksize:%lld, index_blocksize:%lld\n",query_blocksize,index_blocksize); + //index_real_length = get_real_length(index_gram_begin, len1); + if(query_blocksize == index_blocksize) + { + while((*query_gram_begin) != '#' && (*query_gram_begin) != '\0') + { + p=strchr(query_gram_begin,'['); + if(p!=NULL) + { + query_real_length = p-query_gram_begin; + p=strchr(p,']'); + if(p != NULL && (*p) != '\0') + { + + memcpy(spli_str_begin,query_gram_begin,query_real_length); + spli_str_begin += query_real_length; + //edit_distance += edit_distn(query_gram_begin, query_real_length, index_gram_begin, index_real_length); + query_gram_begin = p+1; + } + else + { + break; + } + } + else + { + break; + } + } + splice_len = strnlen(splice_str,len1); + edit_distance = edit_distn(index_gram_begin, 
index_real_length, splice_str, splice_len); + //printf("query_real_length:%d splice_length:%d edit_distance:%d\n",query_real_length,splice_len,edit_distance); + ret = 100-(edit_distance*100)/(index_real_length + splice_len); + //ret = (100*ret)/SPAM_LENGTH; + //ret = 100-ret; + //ret = 100 - (100*edit_distance)/(query_real_length); + free(splice_str); + return ret; + } + while(*query_gram_begin != '\0') + { + if(*query_gram_begin == '#') + { + query_gram_begin++; + break; + } + query_gram_begin++; + } + + } + while(*index_gram_begin != '\0') + { + if(*index_gram_begin == '#') + { + index_gram_begin++; + break; + } + index_gram_begin++; + } + } + //printf("no blocksize:query_real_length:%d splice_length:%d edit_distance:%d\n",query_real_length,splice_len,edit_distance); + free(splice_str); + return 0; +} + + + + +int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size) +{ + GIE_handle_inner_t * _handle = (GIE_handle_inner_t *) handle; + int i = 0, j = 0; + unsigned int union_index = 0; + unsigned int gram_value = _handle->user_gram_value; + unsigned int query_actual_len = 0; + unsigned int union_size = UNION_INIT_SIZE; + unsigned int chunk_cnt = 0; + const char *fuzzy_string_begin = data; + unsigned int * id_union =(unsigned int *)calloc(union_size, sizeof(unsigned int)); + unsigned long long query_blocksize = 0; + unsigned int fuzzy_string_len = (unsigned int)data_len; + + if(_handle->input_format == GIE_INPUT_FORMAT_SFH) + { + for(j = 0;j < 2;j++) + { + query_blocksize = get_blocksize_from_head(fuzzy_string_begin, fuzzy_string_len); + if(query_blocksize == 0) + { + return 0; + } + query_actual_len += GIE_gram_with_position(_handle, query_blocksize, fuzzy_string_begin, &id_union, &union_index, &union_size, &chunk_cnt); + while(*fuzzy_string_begin != '#' && *fuzzy_string_begin != '\0') + { + fuzzy_string_begin++; + } + if(*fuzzy_string_begin == '#') + { + fuzzy_string_begin++; + } + } + } + else 
if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN) + { + query_actual_len = fuzzy_string_len; + chunk_cnt = GIE_part_query(_handle, fuzzy_string_begin, 0, query_actual_len, &id_union, &union_index, &union_size, 0); + } + + if(union_index == 0) + { + free(id_union); + id_union = NULL; + return 0; + } + + qsort(id_union, union_index, sizeof(id_union[0]), GIE_cmp); + + unsigned int current_id = id_union[0]; + unsigned int * tmp_id = id_union; + unsigned int count = 0; + struct id_table_data * ret_tmp = NULL; + short conf = 0; + int ret_size = 0; + for(i = 0; i <= union_index; i++) + { + if( i == union_index || *tmp_id != current_id ) + { + ret_tmp = (struct id_table_data *) MESA_htable_search(_handle->id_table, \ + (const uchar *)(&(current_id)), sizeof(current_id)); + + if(ret_tmp == NULL) + { + break; + } + char * tmp_gram = ret_tmp->sfh; + int length = ret_tmp->sfh_length; + if(ret_tmp->gram_cnt == 0||chunk_cnt == 0) + { + conf = 0; + } + else + { + conf = (count*(query_actual_len-gram_value+1)*10)/(chunk_cnt*(ret_tmp->gram_cnt)); + } + + if(_handle->ED_reexamine == 1) + { + if(_handle->input_format == GIE_INPUT_FORMAT_SFH) + { + conf = GIE_sfh_similiarity(data, fuzzy_string_len, tmp_gram, length); + } + else + { + conf=GIE_string_similiarity(data, fuzzy_string_len, tmp_gram, length); + } + } + + if(conf >= ret_tmp->cfds_lvl) + { + results[ret_size].cfds_lvl = conf; + results[ret_size].id = current_id; + /*results[ret_size].tag = (char *)malloc((ret_tmp->sfh_length + 1)*sizeof(char)); + memset(results[ret_size].tag,'\0',(ret_tmp->sfh_length+1)); + memcpy(results[ret_size].tag, ret_tmp->sfh,ret_tmp->sfh_length);*/ + results[ret_size].tag = ret_tmp->tag; + ret_size++; + } + + if(ret_size == result_size) + { + break; + } + + current_id = *tmp_id; + count = 1; + + } + else + { + count++; + } + + tmp_id ++; + } + + free(id_union); + id_union = NULL; + return ret_size; +} + + +unsigned long long GIE_status(GIE_handle_t * handle, int type) +{ + unsigned long long length; 
def get_md5_value(td_string):
    """Return the hexadecimal MD5 digest of *td_string* as a string."""
    digest = hashlib.md5(td_string)
    return str(digest.hexdigest())
\ No newline at end of file |
