summary | refs | log | tree | commit | diff
path: root/src/get_td_mistake_lost/get_mistake_level.c
diff options
context:
space:
mode:
author: 陈冠林 <[email protected]> 2019-06-18 10:44:20 +0800
committer: 陈冠林 <[email protected]> 2019-06-18 10:44:20 +0800
commit: b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch)
tree: b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/get_td_mistake_lost/get_mistake_level.c
parent: b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff)
添加inc和src (HEAD, master)
Diffstat (limited to 'src/get_td_mistake_lost/get_mistake_level.c')
-rw-r--r-- src/get_td_mistake_lost/get_mistake_level.c 366
1 files changed, 366 insertions, 0 deletions
diff --git a/src/get_td_mistake_lost/get_mistake_level.c b/src/get_td_mistake_lost/get_mistake_level.c
new file mode 100644
index 0000000..5f03974
--- /dev/null
+++ b/src/get_td_mistake_lost/get_mistake_level.c
@@ -0,0 +1,366 @@
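+/*
+Build:
+  gcc -g get_mistake_level.c -o get_mistake_level -lMESA_htable -lmaatframe -I../../include
+Run:
+  ./get_mistake_level ../data/ripe_data/td_data_20171207/all_av_digest_mistake_level
+Note: main() does not read its command-line arguments; the input and output
+paths are hard-coded inside main().
+*/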
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define THREAD_SAFE 0 /* value main() passes for the MHO_SCREEN_PRINT_CTRL hash-table option */
+#define SLOT_SIZE (1024*1024*16) /* value main() passes for the MHO_HASH_SLOT_SIZE hash-table option */
+#define SIMILIAR_RATE 90 /* a record whose similarity to its group's representative SFH is below this is counted as a mistake */
+#define TD_STR_LEN (10*1024) /* size of the buffer that receives the first ';'-separated field (td_str) */
+#define TIME_STR_LEN 128 /* not used by any live code in this file */
+#define RAODONG_RATE 0.1 /* a group is reported only when mistake_num/all_num exceeds this ratio */
+#define BUFFER_LEN (15*1024) /* size of the line buffer used when reading the input file */
+#define SFH_LEN (10*1024) /* size of the buffer that receives the third field (sfh_str) */
+#define TD_LEN 33 /* size of the td buffer and key length handed to the hash table; td[32] holds the terminator */
+
+typedef struct sfh_link /* one input record, chained with the other records that share its TD key */
+{
+ // char *time_str; (field currently disabled)
+ char *sfh_str; /* heap copy (strdup) of the record's SFH field */
+ char *td_ori; /* heap copy (strdup) of the record's first field (td_str in main) */
+ // char *md5_32k; (field currently disabled)
+ int similiar; /* written by print_mistake_td(): similarity of sfh_str to the group's representative SFH */
+ int all_similiar; /* running sum of this record's similarity scores against the other records of its group */
+ // long hash_len; (field currently disabled)
+ struct sfh_link *next; /* next record of the same group; NULL at the tail */
+}sfh_link;
+
+typedef struct mistake_sfh /* per-TD group; stored as the value in the hash table keyed by td */
+{
+ int mistake_num; /* records whose similarity to sfh_str is below SIMILIAR_RATE (counted in main's second pass) */
+ int all_num; /* number of records collected for this TD */
+ int all_similiar; /* similarity total of the current representative; a record whose own total exceeds it replaces the representative */
+ char *sfh_str; /* heap copy of the group's current representative SFH */
+ // long hash_len; (field currently disabled)
+ sfh_link *sfh_link_items; /* head of the list of records in this group */
+}mistake_sfh;
+
+typedef struct temp_parameter /* context handed to print_mistake_td() via MESA_htable_iterate() */
+{
+ int mistake_num; /* sum of mistake_num over all groups that were reported */
+ FILE *ripe_file; /* stream the report is written to */
+}temp_parameter;
+
+/*
+ * Sum the lengths of the "[left:right" ranges found in an SFH string and
+ * return half of that sum.
+ *
+ * The string is treated as a sequence of segments separated by '['; empty
+ * segments (consecutive '[') are ignored. The first segment is skipped, and
+ * every later segment must start with "<left>:<right>"; each contributes
+ * right-left+1 to the total.
+ *
+ * sfh: NUL-terminated string; it is only read, never modified.
+ * Returns: total/2, or 0 when sfh is NULL, holds no range, or any segment
+ *          after the first does not start with two ':'-separated integers.
+ *
+ * The string is scanned in place. The earlier implementation tokenised a heap
+ * copy and leaked that copy on the parse-failure return path.
+ */
+long get_hashed_len(const char* sfh)
+{
+    long hashed_length=0;
+    const char *cursor=NULL;
+
+    if(sfh==NULL)
+    {
+        return 0;
+    }
+    /* Skip leading '[' characters and then the whole first segment. */
+    cursor=sfh+strspn(sfh,"[");
+    cursor+=strcspn(cursor,"[");
+    for(;;)
+    {
+        long left_offset=0,right_offset=0;
+
+        cursor+=strspn(cursor,"[");
+        if(*cursor=='\0')
+        {
+            break;
+        }
+        /* '[' can never be consumed by "%ld:%ld", so parsing the unterminated
+           segment gives the same result as parsing an isolated token. */
+        if(sscanf(cursor,"%ld:%ld",&left_offset,&right_offset)!=2)
+        {
+            return 0;
+        }
+        hashed_length+=right_offset-left_offset+1;
+        cursor+=strcspn(cursor,"[");
+    }
+    return hashed_length/2;
+}
+
+/*
+ * Hash-table iteration callback that reports one TD group.
+ *
+ * key, size: the entry's key and key length (not used here).
+ * data:      the entry's value, a mistake_sfh group.
+ * arg:       a temp_parameter carrying the output stream and a running total.
+ *
+ * Groups whose mistake_num/all_num ratio does not exceed RAODONG_RATE are
+ * ignored. For a reported group the function adds its mistake_num to the
+ * running total, writes "mistake_num;representative_sfh", then one
+ * "similarity;sfh;td_ori" line per record (storing the similarity in the
+ * record as it goes), and finally an empty line.
+ */
+void print_mistake_td(const uchar *key,uint size,void *data,void *arg)
+{
+    temp_parameter *ctx=(temp_parameter*)arg;
+    mistake_sfh *group=(mistake_sfh*)data;
+    float mistake_ratio=(float)group->mistake_num/(float)group->all_num;
+    char *representative=NULL;
+    int representative_len=0;
+    sfh_link *item=NULL;
+
+    if(!(mistake_ratio>RAODONG_RATE))
+    {
+        return;
+    }
+
+    ctx->mistake_num+=group->mistake_num;
+    fprintf(ctx->ripe_file,"%d;%s\n",group->mistake_num,group->sfh_str);
+
+    representative=group->sfh_str;
+    representative_len=(int)strlen(representative);
+    item=group->sfh_link_items;
+    while(item!=NULL)
+    {
+        item->similiar=GIE_sfh_similiarity(representative,representative_len,item->sfh_str,(int)strlen(item->sfh_str));
+        fprintf(ctx->ripe_file,"%d;%s;%s\n",item->similiar,item->sfh_str,item->td_ori);
+        item=item->next;
+    }
+    fprintf(ctx->ripe_file,"\n");
+}
+
+/*
+ * Parse one "td_ori;td;sfh" input line into the caller's buffers, whose sizes
+ * are TD_STR_LEN, TD_LEN and SFH_LEN. Returns 1 on success, 0 if the line does
+ * not contain three non-empty fields or a field does not fit its buffer.
+ */
+static int parse_td_record(const char *line,char *td_str,char *td,char *sfh_str)
+{
+    static char record_fmt[64];
+
+    if(record_fmt[0]=='\0')
+    {
+        /* Bound every %[ conversion by its buffer size so an over-long field
+           cannot write past the end of the buffer. */
+        snprintf(record_fmt,sizeof(record_fmt),"%%%d[^;];%%%d[^;];%%%d[^;]",TD_STR_LEN-1,TD_LEN-1,SFH_LEN-1);
+    }
+    /* All TD_LEN bytes of td are used as the hash key, so clear whatever the
+       previous line left behind the terminator. */
+    memset(td,0,TD_LEN);
+    return sscanf(line,record_fmt,td_str,td,sfh_str)==3;
+}
+
+/* Release one list node together with the strings it owns; NULL is accepted. */
+static void free_sfh_link(sfh_link *link)
+{
+    if(link==NULL)
+    {
+        return;
+    }
+    free(link->sfh_str);
+    free(link->td_ori);
+    free(link);
+}
+
+/* Allocate a list node holding copies of sfh_str and td_str; NULL when out of memory. */
+static sfh_link *new_sfh_link(const char *sfh_str,const char *td_str,int all_similiar)
+{
+    sfh_link *link=calloc(1,sizeof(*link));
+
+    if(link==NULL)
+    {
+        return NULL;
+    }
+    link->sfh_str=strdup(sfh_str);
+    link->td_ori=strdup(td_str);
+    if(link->sfh_str==NULL||link->td_ori==NULL)
+    {
+        free_sfh_link(link);
+        return NULL;
+    }
+    link->similiar=0;
+    link->all_similiar=all_similiar;
+    link->next=NULL;
+    return link;
+}
+
+/* Create the group for a TD seen for the first time; its first record is also
+   its initial representative. Returns NULL when out of memory. */
+static mistake_sfh *new_mistake_group(const char *sfh_str,const char *td_str)
+{
+    mistake_sfh *group=calloc(1,sizeof(*group));
+
+    if(group==NULL)
+    {
+        return NULL;
+    }
+    group->mistake_num=0;
+    group->all_num=1;
+    group->all_similiar=0;
+    group->sfh_str=strdup(sfh_str);
+    group->sfh_link_items=new_sfh_link(sfh_str,td_str,0);
+    if(group->sfh_str==NULL||group->sfh_link_items==NULL)
+    {
+        free_sfh_link(group->sfh_link_items);
+        free(group->sfh_str);
+        free(group);
+        return NULL;
+    }
+    return group;
+}
+
+/*
+ * Add a further record to an existing group. The new SFH is scored against
+ * the current representative and against every stored record; a stored record
+ * whose similarity total rises above the group's total becomes the new
+ * representative. Returns 0 on success, -1 when out of memory.
+ */
+static int append_to_group(mistake_sfh *group,char *sfh_str,const char *td_str)
+{
+    int sfh_len=(int)strlen(sfh_str);
+    int similiar_sum=0;
+    sfh_link *item=NULL;
+    sfh_link *tail=NULL;
+    sfh_link *added=NULL;
+
+    group->all_similiar+=GIE_sfh_similiarity(group->sfh_str,(int)strlen(group->sfh_str),sfh_str,sfh_len);
+    for(item=group->sfh_link_items;item!=NULL;item=item->next)
+    {
+        int similiar=GIE_sfh_similiarity(item->sfh_str,(int)strlen(item->sfh_str),sfh_str,sfh_len);
+
+        item->all_similiar+=similiar;
+        similiar_sum+=similiar;
+        if(item->all_similiar>group->all_similiar)
+        {
+            char *better=strdup(item->sfh_str);
+
+            if(better==NULL)
+            {
+                return -1;
+            }
+            free(group->sfh_str);
+            group->sfh_str=better;
+            group->all_similiar=item->all_similiar;
+        }
+        tail=item;
+    }
+    added=new_sfh_link(sfh_str,td_str,similiar_sum);
+    if(added==NULL)
+    {
+        return -1;
+    }
+    if(tail==NULL)
+    {
+        group->sfh_link_items=added;
+    }
+    else
+    {
+        tail->next=added;
+    }
+    group->all_num+=1;
+    return 0;
+}
+
+/*
+ * Reads the hard-coded input file twice and writes a report to the hard-coded
+ * output file.
+ *
+ * Pass 1 groups the records by their td field in a hash table and selects a
+ * representative SFH per group.
+ * Pass 2 counts, per group, the records whose similarity to the representative
+ * is below SIMILIAR_RATE.
+ * Finally every group is handed to print_mistake_td() and the total mistake
+ * count plus the number of lines read in pass 2 are printed.
+ *
+ * argc/argv are not used. Returns 0 on success and -1 on any I/O, parse or
+ * allocation failure.
+ */
+int main(int argc,char *argv[])
+{
+    const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt";
+    const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level_3";
+    FILE *raw_file=NULL;
+    FILE *ripe_file=NULL;
+    char *buffer=NULL,*sfh_str=NULL,*td=NULL,*td_str=NULL;
+    /* NOTE(review): the macro is named THREAD_SAFE but its value is passed as
+       MHO_SCREEN_PRINT_CTRL; kept as in the original - confirm the intent. */
+    int screen_print=THREAD_SAFE;
+    unsigned int slot_size=SLOT_SIZE;
+    int line_no=0;
+    int ret=0;
+    int exit_code=-1;
+    mistake_sfh *group=NULL;
+    MESA_htable_handle htable=NULL;
+    temp_parameter parameter;
+
+    (void)argc;
+    (void)argv;
+    parameter.mistake_num=0;
+    parameter.ripe_file=NULL;
+
+    /* The input is only read, so do not demand write permission on it. */
+    raw_file=fopen(raw_file_dir,"r");
+    if(raw_file==NULL)
+    {
+        fprintf(stderr,"open all_av_digest error\n");
+        goto cleanup;
+    }
+    ripe_file=fopen(ripe_file_dir,"w+");
+    if(ripe_file==NULL)
+    {
+        fprintf(stderr,"open all_av_digest_mistake_level error\n");
+        goto cleanup;
+    }
+    parameter.ripe_file=ripe_file;
+
+    buffer=calloc(BUFFER_LEN,sizeof(char));
+    sfh_str=calloc(SFH_LEN,sizeof(char));
+    td=calloc(TD_LEN,sizeof(char));
+    td_str=calloc(TD_STR_LEN,sizeof(char));
+    if(buffer==NULL||sfh_str==NULL||td==NULL||td_str==NULL)
+    {
+        fprintf(stderr,"out of memory\n");
+        goto cleanup;
+    }
+
+    htable=MESA_htable_born();
+    if(htable==NULL)
+    {
+        fprintf(stderr,"MESA_htable_born failed\n");
+        goto cleanup;
+    }
+    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&screen_print,sizeof(unsigned int));
+    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int));
+    MESA_htable_mature(htable);
+
+    /* Pass 1: build the per-TD groups. Looping on fgets() rather than feof()
+       keeps the last line from being processed a second time. */
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        line_no++;
+        if(line_no%100000==0)
+        {
+            printf("%d\n",line_no);
+        }
+        if(!parse_td_record(buffer,td_str,td,sfh_str))
+        {
+            fprintf(stderr,"malformed record at line %d\n",line_no);
+            goto cleanup;
+        }
+        group=MESA_htable_search(htable,td,TD_LEN);
+        if(group==NULL)
+        {
+            group=new_mistake_group(sfh_str,td_str);
+            if(group==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            ret=MESA_htable_add(htable,td,TD_LEN,(void *)group);
+            if(ret<=0)
+            {
+                fprintf(stderr,"MESA_htable_add failed at line %d\n",line_no);
+                free_sfh_link(group->sfh_link_items);
+                free(group->sfh_str);
+                free(group);
+                goto cleanup;
+            }
+        }
+        else if(append_to_group(group,sfh_str,td_str)!=0)
+        {
+            fprintf(stderr,"out of memory\n");
+            goto cleanup;
+        }
+    }
+    if(ferror(raw_file))
+    {
+        fprintf(stderr,"read all_av_digest error\n");
+        goto cleanup;
+    }
+
+    /* Pass 2: re-read the same file and count the records that differ too
+       much from their group's final representative. */
+    rewind(raw_file);
+    line_no=0;
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        line_no++;
+        if(line_no%10000==0)
+        {
+            printf("%d\n",line_no);
+        }
+        if(!parse_td_record(buffer,td_str,td,sfh_str))
+        {
+            fprintf(stderr,"malformed record at line %d\n",line_no);
+            goto cleanup;
+        }
+        group=MESA_htable_search(htable,td,TD_LEN);
+        if(group==NULL)
+        {
+            /* Every td was inserted in pass 1, so this means the file changed. */
+            fprintf(stderr,"unknown td at line %d\n",line_no);
+            goto cleanup;
+        }
+        if(GIE_sfh_similiarity(group->sfh_str,(int)strlen(group->sfh_str),sfh_str,(int)strlen(sfh_str))<SIMILIAR_RATE)
+        {
+            group->mistake_num+=1;
+        }
+    }
+    if(ferror(raw_file))
+    {
+        fprintf(stderr,"read all_av_digest error\n");
+        goto cleanup;
+    }
+
+    MESA_htable_iterate(htable,print_mistake_td,(void*)&parameter);
+    printf("%d,%d\n",parameter.mistake_num,line_no);
+    exit_code=0;
+
+cleanup:
+    free(buffer);
+    free(sfh_str);
+    free(td);
+    free(td_str);
+    if(htable!=NULL)
+    {
+        /* As in the original, no per-entry free callback is supplied, so the
+           groups themselves are reclaimed only at process exit. */
+        MESA_htable_destroy(htable,NULL);
+    }
+    if(raw_file!=NULL)
+    {
+        fclose(raw_file);
+    }
+    if(ripe_file!=NULL&&fclose(ripe_file)!=0)
+    {
+        fprintf(stderr,"write all_av_digest_mistake_level error\n");
+        exit_code=-1;
+    }
+    return exit_code;
+}
\ No newline at end of file