diff options
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
|---|---|---|
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/get_td_mistake_lost/get_lost_rate.c | |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'src/get_td_mistake_lost/get_lost_rate.c')
| -rw-r--r-- | src/get_td_mistake_lost/get_lost_rate.c | 210 |
1 files changed, 210 insertions, 0 deletions
diff --git a/src/get_td_mistake_lost/get_lost_rate.c b/src/get_td_mistake_lost/get_lost_rate.c new file mode 100644 index 0000000..d983a00 --- /dev/null +++ b/src/get_td_mistake_lost/get_lost_rate.c @@ -0,0 +1,210 @@ +/* +gcc -g get_lost_rate.c -o get_lost_rate -lmaatframe -I../include +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <assert.h> +#include <ctype.h> +#define BUFFER_LEN (10*1024) +#define CACHE_SIZE 2000000 +#define SFH_LEN (10*1024) +#define TD_LEN 33 +#define RESULT_NUM 10000 +#define TIME_STR_LEN 128 +#define TD_STR_LEN (10*1024) + +typedef struct cache +{ + GIE_digest_t ** GIE_cache; + long cache_size; + long len; +}cache; + +typedef struct GIE_tag +{ + char *td; + char *td_str; + char *sfh_str; +}GIE_tag; + +int main() +{ + FILE *td_sfh_file; + FILE *raw_file; + FILE *ripe_file; + const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt"; + const char *td_sfh_file_dir="../../data/ripe_data/td_data_20171207/TD_SFH_1"; + const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/get_lost_ripe_data_1"; + td_sfh_file = fopen(td_sfh_file_dir,"r+"); + raw_file = fopen(raw_file_dir,"r+"); + ripe_file = fopen(ripe_file_dir,"w+"); + char *buffer=NULL,*sfh_str=NULL,*td=NULL,*time_str=NULL,*td_str=NULL; + GIE_create_para_t *query_para=NULL; + GIE_handle_t *query_handle=NULL; + GIE_result_t *query_result = NULL; + cache *GIE_digest_cache = NULL; + GIE_digest_t *sfh_td = NULL; + int i=0,w=0,ret=0,lost=0,j=0,update=0,resultnum=0,temp_len=0; + GIE_tag *temp_tag =NULL; + if(td_sfh_file == NULL) + { + printf("open td_sfh_file_dir error\n"); + return -1; + } + if(raw_file == NULL) + { + printf("open raw_file_dir error\n"); + return -1; + } + if(ripe_file == NULL) + { + printf("open ripe_file_dir error\n"); + return -1; + } + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + sfh_str[SFH_LEN-1]='\0'; + td = (char*)calloc(TD_LEN,sizeof(char)); + td[32]='\0'; + time_str = (char*)calloc(TIME_STR_LEN,sizeof(char)); + time_str[TIME_STR_LEN-1]='\0'; + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + td_str = (char*)calloc(TD_STR_LEN,sizeof(char)); + query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t)); + query_para->gram_value = 7; + query_para->position_accuracy = 5; + query_para->ED_reexamine=1; + query_para->format=GIE_INPUT_FORMAT_SFH; + query_handle=GIE_create((const GIE_create_para_t *)query_para); + free(query_para); + query_result = (GIE_result_t*)calloc(RESULT_NUM,sizeof(GIE_result_t)); + GIE_digest_cache =(cache*)calloc(1,sizeof(cache)); + GIE_digest_cache->cache_size = CACHE_SIZE; + GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*)); + GIE_digest_cache->len = 0; + if(query_handle==NULL) + { + printf("create GIE handle error\n"); + return -1; + } + while(feof(td_sfh_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,td_sfh_file); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td,td_str,sfh_str); + assert(ret==3); + td[32]='\0'; + sfh_td = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t)); + sfh_td->id=i; + temp_len=strlen(sfh_str); + sfh_td->sfh_length=temp_len; + sfh_str[temp_len-1]='\0'; + sfh_td->operation=GIE_INSERT_OPT; + sfh_td->cfds_lvl=5; + sfh_td->sfh=strdup(sfh_str); + temp_tag=(GIE_tag*)calloc(1,sizeof(GIE_tag)); + temp_tag->td=strdup(td); + temp_tag->td_str=strdup(td_str); + temp_tag->sfh_str=strdup(sfh_str); + sfh_td->tag=(void*)temp_tag; + GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_td; + GIE_digest_cache->len++; + // resultnum = GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,RESULT_NUM); + // if(resultnum==0) + // { + // sfh_td = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t)); + // sfh_td->id=i; + // sfh_td->sfh_length=strlen(sfh_str); + // sfh_td->operation=GIE_INSERT_OPT; + // sfh_td->cfds_lvl=5; + // sfh_td->sfh=strdup(sfh_str); + // sfh_td->tag=(void*)strdup(td); + // GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_td; + // GIE_digest_cache->len++; + // } + // else + // { + // for(j=0;j<resultnum;j++) + // { + // if(strcmp((char*)((query_result+j)->tag),td)!=0) + // { + // lost++; + // fprintf(ripe_file,"%s,%s,%s\n",(char*)((query_result+j)->tag),td,sfh_str); + // } + // } + // continue; + // } + // if(GIE_digest_cache->len==GIE_digest_cache->cache_size) + // { + // update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size); + // assert(update==GIE_digest_cache->len); + // GIE_digest_cache->len=0; + // for(j=0;j<GIE_digest_cache->cache_size;j++) + // { + // free(GIE_digest_cache->GIE_cache[j]->sfh); + // GIE_digest_cache->GIE_cache[j]->sfh=NULL; + // free(GIE_digest_cache->GIE_cache[j]); + // GIE_digest_cache->GIE_cache[j]=NULL; + // } + // } + // resultnum = GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,RESULT_NUM); + // for(i=0;i<resultnum;i++) + // { + // if(strcmp((char*)query_result[i]->tag,td)!=0) + // { + // lost++; + // } + // } + } + fclose(td_sfh_file); + update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->len); + for(j=0;j<GIE_digest_cache->len;j++) + { + free(GIE_digest_cache->GIE_cache[j]->sfh); + GIE_digest_cache->GIE_cache[j]->sfh=NULL; + free(GIE_digest_cache->GIE_cache[j]); + GIE_digest_cache->GIE_cache[j]=NULL; + } + i=0; + while(feof(raw_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + // ret=sscanf(buffer,"%[^;];%[^;]",td,sfh_str); + // assert(ret==2); + // ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %*[^;];%[^;];%*[^;];%[^;];%*[^;]",td_str,td,sfh_str); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + resultnum = GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,RESULT_NUM); + if(resultnum>1) + { + for(j=0;j<resultnum;j++) + { + if(strcmp(((GIE_tag*)(query_result+j)->tag)->td,td)!=0) + { + w=1; + fprintf(ripe_file,"%u,%s,%s,%s,%s,%s,%s\n",(query_result+j)->id,((GIE_tag*)((query_result+j)->tag))->td_str,((GIE_tag*)((query_result+j)->tag))->td,((GIE_tag*)((query_result+j)->tag))->sfh_str,td_str,td,sfh_str); + } + } + lost+=w; + w=0; + } + + } + printf("%d;%d\n",lost,i); + free(sfh_str); + free(td); + free(time_str); + free(td_str); +}
\ No newline at end of file |
