summary | refs | log | tree | commit | diff
path: root/src/get_td_mistake_lost/get_lost_rate.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/get_td_mistake_lost/get_lost_rate.c')
-rw-r--r-- src/get_td_mistake_lost/get_lost_rate.c 210
1 files changed, 210 insertions, 0 deletions
diff --git a/src/get_td_mistake_lost/get_lost_rate.c b/src/get_td_mistake_lost/get_lost_rate.c
new file mode 100644
index 0000000..d983a00
--- /dev/null
+++ b/src/get_td_mistake_lost/get_lost_rate.c
@@ -0,0 +1,210 @@
+/*
+gcc -g get_lost_rate.c -o get_lost_rate -lmaatframe -I../include
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <assert.h>
+#include <ctype.h>
+#define BUFFER_LEN (10*1024)
+#define CACHE_SIZE 2000000
+#define SFH_LEN (10*1024)
+#define TD_LEN 33
+#define RESULT_NUM 10000
+#define TIME_STR_LEN 128
+#define TD_STR_LEN (10*1024)
+
+typedef struct cache /* Batch of digest pointers that main() collects while
+                      * reading the TD/SFH file and then passes to GIE_update(). */
+{
+ GIE_digest_t ** GIE_cache; /* array of cache_size digest pointers, calloc'd in main() */
+ long cache_size; /* capacity of GIE_cache; main() sets it to CACHE_SIZE */
+ long len; /* number of slots currently filled; starts at 0, bumped on every insert */
+}cache;
+
+typedef struct GIE_tag /* Per-record payload that main() attaches to a digest via
+                        * its tag pointer and reads back from the query results. */
+{
+ char *td; /* strdup'd copy of the first field of a TD/SFH record (at most 32 chars + NUL) */
+ char *td_str; /* strdup'd copy of the second field of the same record */
+ char *sfh_str; /* strdup'd copy of the SFH string that was inserted into the index */
+}GIE_tag;
+
+/* Cut s at the first '\r' or '\n' and return the resulting length. */
+static size_t strip_eol(char *s)
+{
+    size_t n = strcspn(s, "\r\n");
+
+    s[n] = '\0';
+    return n;
+}
+
+/*
+ * Release one digest and its SFH copy. When with_tag is non-zero the attached
+ * GIE_tag and its three strings are released as well.
+ */
+static void free_digest(GIE_digest_t *digest, int with_tag)
+{
+    if (digest == NULL) {
+        return;
+    }
+    if (with_tag && digest->tag != NULL) {
+        GIE_tag *tag = (GIE_tag *)digest->tag;
+
+        free(tag->td);
+        free(tag->td_str);
+        free(tag->sfh_str);
+        free(tag);
+    }
+    free(digest->sfh);
+    free(digest);
+}
+
+/*
+ * Build an INSERT digest for one "td;td_str;sfh" record.
+ * Returns NULL (with nothing leaked) when any allocation fails.
+ */
+static GIE_digest_t *make_digest(int id, const char *td, const char *td_str,
+                                 const char *sfh_str)
+{
+    GIE_digest_t *digest = calloc(1, sizeof *digest);
+    GIE_tag *tag = calloc(1, sizeof *tag);
+
+    if (digest == NULL || tag == NULL) {
+        free(tag);
+        free(digest);
+        return NULL;
+    }
+    digest->tag = (void *)tag;
+    digest->id = id;
+    /* Length of the string actually stored in ->sfh (no line terminator). */
+    digest->sfh_length = strlen(sfh_str);
+    digest->operation = GIE_INSERT_OPT;
+    digest->cfds_lvl = 5;
+    digest->sfh = strdup(sfh_str);
+    tag->td = strdup(td);
+    tag->td_str = strdup(td_str);
+    tag->sfh_str = strdup(sfh_str);
+    if (digest->sfh == NULL || tag->td == NULL || tag->td_str == NULL ||
+        tag->sfh_str == NULL) {
+        free_digest(digest, 1);
+        return NULL;
+    }
+    return digest;
+}
+
+/*
+ * Hand every pending digest to GIE_update(), then release the digests and
+ * reset the batch. The tags are deliberately kept alive: main() dereferences
+ * them later through the tag pointer of each query result.
+ */
+static void flush_cache(GIE_handle_t *handle, cache *pending)
+{
+    long k;
+    int applied;
+
+    if (pending->len == 0) {
+        return;
+    }
+    applied = GIE_update(handle, pending->GIE_cache, pending->len);
+    /* NOTE(review): assumes GIE_update() returns the number of digests it
+     * applied - confirm against gram_index_engine.h. */
+    if (applied != pending->len) {
+        fprintf(stderr, "GIE_update applied %d of %ld digests\n", applied,
+                pending->len);
+    }
+    for (k = 0; k < pending->len; k++) {
+        free_digest(pending->GIE_cache[k], 0);
+        pending->GIE_cache[k] = NULL;
+    }
+    pending->len = 0;
+}
+
+/*
+ * Index every "td;td_str;sfh" record of TD_SFH_1, then query the index with
+ * the SFH of every "td_str;td;sfh" record of new_TD.txt. For each query that
+ * yields more than one hit, every hit whose td differs from the queried td is
+ * written to the output file; such a query counts once towards "lost".
+ * Finally prints "<lost>;<lines read from new_TD.txt>".
+ *
+ * Returns 0 on success and -1 on any I/O, allocation or parse error.
+ */
+int main(void)
+{
+    const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt";
+    const char *td_sfh_file_dir="../../data/ripe_data/td_data_20171207/TD_SFH_1";
+    const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/get_lost_ripe_data_1";
+    /* Inputs are only read and the output only written, so plain "r"/"w"
+     * suffice (and work on read-only input files). */
+    FILE *td_sfh_file = fopen(td_sfh_file_dir, "r");
+    FILE *raw_file = fopen(raw_file_dir, "r");
+    FILE *ripe_file = fopen(ripe_file_dir, "w");
+    char *buffer = NULL, *sfh_str = NULL, *td = NULL, *td_str = NULL;
+    GIE_create_para_t query_para;
+    GIE_handle_t *query_handle = NULL;
+    GIE_result_t *query_result = NULL;
+    cache *GIE_digest_cache = NULL;
+    GIE_digest_t *sfh_td = NULL;
+    int i = 0, j = 0, w = 0, lost = 0, resultnum = 0;
+    int rc = -1;
+    long k;
+
+    if (td_sfh_file == NULL) {
+        printf("open td_sfh_file_dir error\n");
+        goto cleanup;
+    }
+    if (raw_file == NULL) {
+        printf("open raw_file_dir error\n");
+        goto cleanup;
+    }
+    if (ripe_file == NULL) {
+        printf("open ripe_file_dir error\n");
+        goto cleanup;
+    }
+
+    buffer = calloc(BUFFER_LEN, sizeof *buffer);
+    td = calloc(TD_LEN, sizeof *td);
+    td_str = calloc(TD_STR_LEN, sizeof *td_str);
+    sfh_str = calloc(SFH_LEN, sizeof *sfh_str);
+    query_result = calloc(RESULT_NUM, sizeof *query_result);
+    GIE_digest_cache = calloc(1, sizeof *GIE_digest_cache);
+    if (buffer == NULL || td == NULL || td_str == NULL || sfh_str == NULL ||
+        query_result == NULL || GIE_digest_cache == NULL) {
+        fprintf(stderr, "out of memory\n");
+        goto cleanup;
+    }
+    GIE_digest_cache->cache_size = CACHE_SIZE;
+    GIE_digest_cache->len = 0;
+    GIE_digest_cache->GIE_cache = calloc(GIE_digest_cache->cache_size,
+                                         sizeof *GIE_digest_cache->GIE_cache);
+    if (GIE_digest_cache->GIE_cache == NULL) {
+        fprintf(stderr, "out of memory\n");
+        goto cleanup;
+    }
+
+    memset(&query_para, 0, sizeof query_para);
+    query_para.gram_value = 7;
+    query_para.position_accuracy = 5;
+    query_para.ED_reexamine = 1;
+    query_para.format = GIE_INPUT_FORMAT_SFH;
+    query_handle = GIE_create(&query_para);
+    if (query_handle == NULL) {
+        printf("create GIE handle error\n");
+        goto cleanup;
+    }
+
+    /* Phase 1: load the index. Looping on fgets() itself (not on feof())
+     * keeps the last record from being processed twice. */
+    while (fgets(buffer, BUFFER_LEN, td_sfh_file) != NULL) {
+        i++;
+        if (i % 100000 == 0) {
+            printf("%d\n", i);
+        }
+        if (strip_eol(buffer) == 0) {
+            continue; /* blank line, e.g. at the end of the file */
+        }
+        /* %32 keeps td inside its TD_LEN (33) byte buffer; the other two
+         * buffers are as large as the line buffer itself. */
+        if (sscanf(buffer, "%32[^;];%[^;];%[^;]", td, td_str, sfh_str) != 3) {
+            fprintf(stderr, "%s: line %d: expected td;td_str;sfh\n",
+                    td_sfh_file_dir, i);
+            goto cleanup;
+        }
+        sfh_td = make_digest(i, td, td_str, sfh_str);
+        if (sfh_td == NULL) {
+            fprintf(stderr, "out of memory\n");
+            goto cleanup;
+        }
+        /* Push a full batch before it can overflow the pointer array. */
+        if (GIE_digest_cache->len == GIE_digest_cache->cache_size) {
+            flush_cache(query_handle, GIE_digest_cache);
+        }
+        GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_td;
+        GIE_digest_cache->len++;
+    }
+    flush_cache(query_handle, GIE_digest_cache);
+
+    /* Phase 2: query the index with every raw record. */
+    i = 0;
+    while (fgets(buffer, BUFFER_LEN, raw_file) != NULL) {
+        i++;
+        if (i % 100000 == 0) {
+            printf("%d\n", i);
+        }
+        if (strip_eol(buffer) == 0) {
+            continue;
+        }
+        if (sscanf(buffer, "%[^;];%32[^;];%[^;]", td_str, td, sfh_str) != 3) {
+            fprintf(stderr, "%s: line %d: expected td_str;td;sfh\n",
+                    raw_file_dir, i);
+            goto cleanup;
+        }
+        resultnum = GIE_query(query_handle, (const char *)sfh_str,
+                              (const long long)strlen(sfh_str), query_result,
+                              RESULT_NUM);
+        /* NOTE(review): a query with exactly one hit is never inspected;
+         * threshold kept as found - confirm that this is intended. */
+        if (resultnum > 1) {
+            w = 0;
+            for (j = 0; j < resultnum; j++) {
+                GIE_tag *hit = (GIE_tag *)(query_result + j)->tag;
+
+                if (strcmp(hit->td, td) != 0) {
+                    w = 1;
+                    fprintf(ripe_file, "%u,%s,%s,%s,%s,%s,%s\n",
+                            (query_result + j)->id, hit->td_str, hit->td,
+                            hit->sfh_str, td_str, td, sfh_str);
+                }
+            }
+            lost += w; /* one "lost" per query, however many hits differ */
+        }
+    }
+    printf("%d;%d\n", lost, i);
+    rc = 0;
+
+cleanup:
+    /* NOTE(review): query_handle and the tags owned by already-indexed
+     * digests are not released here; no release function for the handle is
+     * visible in this file. */
+    if (GIE_digest_cache != NULL) {
+        if (GIE_digest_cache->GIE_cache != NULL) {
+            /* Non-empty only when loading was aborted mid-batch: these
+             * digests never reached the index, so their tags go too. */
+            for (k = 0; k < GIE_digest_cache->len; k++) {
+                free_digest(GIE_digest_cache->GIE_cache[k], 1);
+            }
+            free(GIE_digest_cache->GIE_cache);
+        }
+        free(GIE_digest_cache);
+    }
+    free(query_result);
+    free(sfh_str);
+    free(td_str);
+    free(td);
+    free(buffer);
+    if (td_sfh_file != NULL) {
+        fclose(td_sfh_file);
+    }
+    if (raw_file != NULL) {
+        fclose(raw_file);
+    }
+    if (ripe_file != NULL && fclose(ripe_file) != 0) {
+        fprintf(stderr, "error while closing %s\n", ripe_file_dir);
+        rc = -1;
+    }
+    return rc;
+}
\ No newline at end of file