diff options
Diffstat (limited to 'src/get_td_mistake_lost/get_mistake_level.c')
| -rw-r--r-- | src/get_td_mistake_lost/get_mistake_level.c | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/src/get_td_mistake_lost/get_mistake_level.c b/src/get_td_mistake_lost/get_mistake_level.c new file mode 100644 index 0000000..5f03974 --- /dev/null +++ b/src/get_td_mistake_lost/get_mistake_level.c @@ -0,0 +1,366 @@ +/* +gcc -g get_mistake_level.c -o get_mistake_level -lMESA_htable -lmaatframe -I../../include +./get_mistake_level ../data/ripe_data/td_data_20171207/all_av_digest_mistake_level +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <MESA/MESA_htable.h> +#include <assert.h> +#include <ctype.h> +#define THREAD_SAFE 0 +#define SLOT_SIZE (1024*1024*16) +#define SIMILIAR_RATE 90 +#define TD_STR_LEN (10*1024) +#define TIME_STR_LEN 128 +#define RAODONG_RATE 0.1 +#define BUFFER_LEN (15*1024) +#define SFH_LEN (10*1024) +#define TD_LEN 33 + +typedef struct sfh_link +{ + // char *time_str; + char *sfh_str; + char *td_ori; + // char *md5_32k; + int similiar; + int all_similiar; + // long hash_len; + struct sfh_link *next; +}sfh_link; + +typedef struct mistake_sfh +{ + int mistake_num; + int all_num; + int all_similiar; + char *sfh_str; + // long hash_len; + sfh_link *sfh_link_items; +}mistake_sfh; + +typedef struct temp_parameter +{ + int mistake_num; + FILE *ripe_file; +}temp_parameter; + +long get_hashed_len(const char* sfh) +{ + char *data=(char*)malloc(strlen(sfh)+1); + memcpy(data,sfh, strlen(sfh)); + data[strlen(sfh)]='\0'; + char *token=NULL,*sub_token=NULL,*saveptr; + long left_offset=0,right_offset=0,hashed_length=0; + int ret=0,first=0; + for (token = data; ; token= NULL) + { + sub_token= strtok_r(token,"[", &saveptr); + if (sub_token == NULL) + { + break; + } + if(first==0)//jump over the first sub string. + { + first=1; + continue; + } + ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset); + if(ret!=2) + { + return 0; + } + assert(ret==2); + hashed_length+=right_offset-left_offset+1; + } + //printf("hashed length=%ld\n",hashed_length); + free(data); + return hashed_length/2; +} + +void print_mistake_td(const uchar *key,uint size,void *data,void *arg) +{ + temp_parameter *parameter = (temp_parameter*)arg; + mistake_sfh *temp_mistake_sfh=(mistake_sfh*)data; + float temp_rate=0; + temp_rate=(float)temp_mistake_sfh->mistake_num/(float)temp_mistake_sfh->all_num; + if(temp_rate>RAODONG_RATE) + { + parameter->mistake_num+=temp_mistake_sfh->mistake_num; + fprintf(parameter->ripe_file,"%d;%s\n",temp_mistake_sfh->mistake_num,temp_mistake_sfh->sfh_str); + sfh_link *temp_sfh_link=temp_mistake_sfh->sfh_link_items; + for(;;temp_sfh_link=temp_sfh_link->next) + { + if(temp_sfh_link==NULL) + { + break; + } + temp_sfh_link->similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str)); + // fprintf(parameter->ripe_file,"%s,%d;%s;%s;%s\n",temp_sfh_link->time_str,temp_sfh_link->similiar,temp_sfh_link->sfh_str,temp_sfh_link->td_ori,temp_sfh_link->md5_32k); + fprintf(parameter->ripe_file,"%d;%s;%s\n",temp_sfh_link->similiar,temp_sfh_link->sfh_str,temp_sfh_link->td_ori); + } + fprintf(parameter->ripe_file,"\n"); + } +} + +int main(int argc,char *argv[]) +{ + FILE *raw_file; + FILE *ripe_file; + char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt"; + char *ripe_file_dir="../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level_3"; + char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL,*md5_32k_str=NULL,*time_str=NULL; + raw_file = fopen(raw_file_dir,"r+"); + ripe_file = fopen(ripe_file_dir,"w+"); + int i=0,thread_safe=THREAD_SAFE,ret=0,temp_mistake=0,temp_similiar=0,temp_all_similiar=0; + long temp_hash_len=0; + unsigned int slot_size=SLOT_SIZE; + mistake_sfh *temp_mistake_sfh=NULL; + sfh_link *temp_sfh_link=NULL; + MESA_htable_handle htable=NULL; + temp_parameter *parameter=NULL; + if(raw_file==NULL) + { + printf("open all_av_digest error\n"); + return -1; + } + + + if(ripe_file==NULL) + { + printf("open all_av_digest_mistake_level error"); + return -1; + } + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + td = (char*)calloc(TD_LEN,sizeof(char)); + td[32]='\0'; + td_str = (char*)calloc(TD_STR_LEN,sizeof(char)); + // md5_32k_str = (char*)calloc(TD_LEN,sizeof(char)); + // time_str = (char*)calloc(TIME_STR_LEN,sizeof(char)); + // time_str[TIME_STR_LEN-1]='\0'; + // md5_32k_str[32]='\0'; + htable=MESA_htable_born(); + MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int)); + MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int)); + MESA_htable_mature(htable); + parameter=(temp_parameter*)calloc(1,sizeof(temp_parameter)); + parameter->mistake_num=0; + parameter->ripe_file=ripe_file; + while(feof(raw_file)==0) + { + i++; + if(i%100000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %[^;];%[^;];%*[^;];%[^;];%*[^;]",time_str,td_str,md5_32k_str,td,sfh_str); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + td[32]='\0'; + // md5_32k_str[32]='\0'; + if((temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL) + { + temp_mistake_sfh=(mistake_sfh*)calloc(1,sizeof(mistake_sfh)); + temp_mistake_sfh->mistake_num=0; + temp_mistake_sfh->all_num=1; + temp_mistake_sfh->all_similiar=0; + // temp_mistake_sfh->hash_len=get_hashed_len(sfh_str); + temp_mistake_sfh->sfh_str=strdup(sfh_str); + temp_mistake_sfh->sfh_link_items=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_mistake_sfh->sfh_link_items->sfh_str=strdup(sfh_str); + temp_mistake_sfh->sfh_link_items->td_ori=strdup(td_str); + // temp_mistake_sfh->sfh_link_items->md5_32k=strdup(md5_32k_str); + // temp_mistake_sfh->sfh_link_items->time_str=strdup(time_str); + temp_mistake_sfh->sfh_link_items->similiar=0; + temp_mistake_sfh->sfh_link_items->all_similiar=0; + temp_mistake_sfh->sfh_link_items->next=NULL; + ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_mistake_sfh); + assert(ret>0); + } + else + { + temp_similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_mistake_sfh->all_similiar+=temp_similiar; + temp_sfh_link=temp_mistake_sfh->sfh_link_items; + for(temp_all_similiar=0;;temp_sfh_link=temp_sfh_link->next) + { + // if(GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str))<SIMILIAR_RATE) + // { + // temp_mistake=1; + // } + temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_sfh_link->all_similiar+=temp_similiar; + temp_all_similiar+=temp_similiar; + if(temp_sfh_link->all_similiar>temp_mistake_sfh->all_similiar) + { + free(temp_mistake_sfh->sfh_str); + temp_mistake_sfh->sfh_str=strdup(temp_sfh_link->sfh_str); + temp_mistake_sfh->all_similiar=temp_sfh_link->all_similiar; + } + if(temp_sfh_link->next==NULL) + { + break; + } + } + // if(temp_hash_len>temp_mistake_sfh->hash_len) + // { + // temp_mistake_sfh->hash_len=temp_hash_len; + // free(temp_mistake_sfh->sfh_str); + // temp_mistake_sfh->sfh_str=strdup(sfh_str); + // } + temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_sfh_link->next->sfh_str=strdup(sfh_str); + temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->time_str=strdup(time_str); + temp_sfh_link->next->similiar=0; + temp_sfh_link->next->all_similiar=temp_all_similiar; + temp_sfh_link->next->next=NULL; + temp_mistake_sfh->all_num+=1; + } + } + fclose(raw_file); + raw_file = fopen(raw_file_dir,"r+"); + if(raw_file==NULL) + { + printf("open all_av_digest error\n"); + return -1; + } + i=0; + while(feof(raw_file)==0) + { + i++; + if(i%10000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%[^;];%*[^;];%*[^;];%*[^;];%[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %[^;];%[^;];%*[^;];%[^;];%*[^;]",time_str,td_str,md5_32k_str,td,sfh_str); + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==3); + td[32]='\0'; + // md5_32k_str[32]='\0'; + temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN); + assert(temp_mistake_sfh!=NULL); + // if((temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL) + // { + // temp_mistake_sfh=(mistake_sfh*)calloc(1,sizeof(mistake_sfh)); + // temp_mistake_sfh->num=0; + // temp_mistake_sfh->hash_len=get_hashed_len(sfh_str); + // temp_mistake_sfh->sfh_str=strdup(sfh_str); + // temp_sfh_link=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_sfh_link->sfh_str=strdup(sfh_str); + // temp_sfh_link->td_ori=strdup(td_str); + // temp_sfh_link->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->time_str=strdup(time_str); + // temp_sfh_link->next=NULL; + // temp_mistake_sfh->sfh_link_items=temp_sfh_link; + // ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_mistake_sfh); + // assert(ret>0); + // } + // else + // { + // temp_hash_len=get_hashed_len(sfh_str); + // if(temp_hash_len>temp_mistake_sfh->hash_len) + // { + // temp_sfh_link->hash_len=get_hashed_len(); + // free(temp_sfh_link->sfh_str); + // temp_sfh_link->sfh_str=strdup(sfh_str); + // } + temp_similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),sfh_str,(int)strlen(sfh_str)); + if(temp_similiar<SIMILIAR_RATE) + { + temp_mistake_sfh->mistake_num+=1; + } + // if(temp_mistake_sfh->sfh_link_items!=NULL) + // { + // temp_sfh_link=temp_mistake_sfh->sfh_link_items; + // for(;;temp_sfh_link=temp_sfh_link->next) + // { + // // if(GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str))<SIMILIAR_RATE) + // // { + // // temp_mistake=1; + // // } + // if(temp_sfh_link->next==NULL) + // { + // break; + // } + // } + // temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_sfh_link->next->sfh_str=strdup(sfh_str); + // temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->time_str=strdup(time_str); + // temp_sfh_link->next->similiar=temp_similiar; + // temp_sfh_link->next->next=NULL; + // } + // else + // { + // temp_mistake_sfh->sfh_link_items=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_mistake_sfh->sfh_link_items->sfh_str=strdup(sfh_str); + // temp_mistake_sfh->sfh_link_items->td_ori=strdup(td_str); + // temp_mistake_sfh->sfh_link_items->md5_32k=strdup(md5_32k_str); + // temp_mistake_sfh->sfh_link_items->time_str=strdup(time_str); + // temp_mistake_sfh->sfh_link_items->similiar=temp_similiar; + // temp_mistake_sfh->sfh_link_items->next=NULL; + // } + // if(temp_mistake==1) + // { + // temp_mistake_sfh->num+=temp_mistake; + // temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + // temp_sfh_link->next->sfh_str=strdup(sfh_str); + // temp_sfh_link->next->td_ori=strdup(td_str); + // temp_sfh_link->next->md5_32k=strdup(md5_32k_str); + // temp_sfh_link->next->next=NULL; + // temp_mistake=0; + // } + } + fclose(raw_file); + // raw_file=NULL; + // raw_file = fopen(raw_file_dir,"r+"); + // if(raw_file==NULL) + // { + // printf("open all_av_digest error\n"); + // return -1; + // } + // i=0; + // while(feof(raw_file)==0) + // { + // i++; + // if(i%10000==0) + // { + // printf("%d\n",i); + // } + // fgets(buffer,BUFFER_LEN-1,raw_file); + // buffer[BUFFER_LEN-1]='\0'; + // ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + // %*[^;];%[^;];%*[^;];%*[^;];%*[^;]",td); + // assert(ret==1); + // if((temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN))!=NULL) + // { + // fprintf(ripe_file,"%d;%s",temp_mistake_sfh->num,buffer); + // } + // } + MESA_htable_iterate(htable,print_mistake_td,(void*)parameter); + printf("%d,%d\n",parameter->mistake_num,i); + free(buffer); + free(sfh_str); + free(td); + free(td_str); + // free(md5_32k_str); + MESA_htable_destroy(htable,NULL); + // fclose(raw_file); + fclose(ripe_file); + return 0; +}
\ No newline at end of file |
