diff options
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
|---|---|---|
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/sfh_integrate/SFH_function.c | |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'src/sfh_integrate/SFH_function.c')
| -rw-r--r-- | src/sfh_integrate/SFH_function.c | 177 |
1 files changed, 177 insertions, 0 deletions
diff --git a/src/sfh_integrate/SFH_function.c b/src/sfh_integrate/SFH_function.c new file mode 100644 index 0000000..a311f9c --- /dev/null +++ b/src/sfh_integrate/SFH_function.c @@ -0,0 +1,177 @@ +/* +gcc -g SFH_function.c -o SFH_function -lmaatframe -lMESA_htable -I../include +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <MESA/MESA_htable.h> +#include <assert.h> +#include <ctype.h> +#define SLOT_SIZE (1024*1024*16) +#define THREAD_SAFE 0 +#define BUFFER_LEN (10*1024) +#define SFH_LEN (10*1024) +#define TD_LEN 33 + +typedef struct sfh_link +{ + char *sfh_str; + int similiar; + int all_similiar; + long hash_len; + struct sfh_link *next; +}sfh_link; + +typedef struct top_similiar_sfh +{ + int all_num; + int all_similiar; + char *sfh_str; + long hash_len; + sfh_link *sfh_link_items; +}top_similiar_sfh; + +long get_hashed_len(const char* sfh) +{ + char *data=(char*)malloc(strlen(sfh)+1); + memcpy(data,sfh, strlen(sfh)); + data[strlen(sfh)]='\0'; + char *token=NULL,*sub_token=NULL,*saveptr; + long left_offset=0,right_offset=0,hashed_length=0; + int ret=0,first=0; + for (token = data; ; token= NULL) + { + sub_token= strtok_r(token,"[", &saveptr); + if (sub_token == NULL) + { + break; + } + if(first==0)//jump over the first sub string. + { + first=1; + continue; + } + ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset); + if(ret!=2) + { + return 0; + } + assert(ret==2); + hashed_length+=right_offset-left_offset+1; + } + //printf("hashed length=%ld\n",hashed_length); + free(data); + return hashed_length/2; +} + +void print_mistake_td(const uchar *key,uint size,void *data,void *arg) +{ + FILE *ripe_file = (FILE*)arg; + top_similiar_sfh *temp_top_similiar_sfh=(top_similiar_sfh*)data; + fprintf(ripe_file,"%s,%s\n",key,temp_top_similiar_sfh->sfh_str); + sfh_link *temp_sfh_link=temp_top_similiar_sfh->sfh_link_items; + for(;;temp_sfh_link=temp_sfh_link->next) + { + if(temp_sfh_link==NULL) + { + break; + } + fprintf(ripe_file,"%d;%s;%d\n",temp_sfh_link->similiar,temp_sfh_link->sfh_str,temp_sfh_link->hash_len); + } + fprintf(ripe_file,"\n"); +} +int main() +{ + FILE *raw_file; + FILE *ripe_file; + char *raw_file_dir="../data/td_data_set/td_data_20171207/video_id_raw_data"; + char *ripe_file_dir="../data/ripe_data/td_data_20171207/all_av_digest_mistake_level_2"; + char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL; + raw_file = fopen(raw_file_dir,"r+"); + ripe_file = fopen(ripe_file_dir,"w+"); + long temp_hash_len=0; + unsigned int slot_size=SLOT_SIZE; + int i=0,thread_safe=THREAD_SAFE,ret=0,temp_similiar=0,temp_all_similiar=0; + top_similiar_sfh *temp_top_similiar_sfh=NULL; + sfh_link *temp_sfh_link=NULL; + MESA_htable_handle htable=NULL; + if(raw_file==NULL) + { + printf("open all_av_digest error\n"); + return -1; + } + + if(ripe_file==NULL) + { + printf("open all_av_digest_mistake_level error"); + return -1; + } + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + td = (char*)calloc(TD_LEN,sizeof(char)); + td[32]='\0'; + htable=MESA_htable_born(); + MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int)); + MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int)); + MESA_htable_mature(htable); + while(feof(raw_file)==0) + { + i++; + if(i%10000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,raw_file); + buffer[BUFFER_LEN-1]='\0'; + ret=sscanf(buffer,"%[^;];%[^;];%[^;]",td_str,td,sfh_str); + assert(ret==2); + td[32]='\0'; + if((temp_top_similiar_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL) + { + temp_top_similiar_sfh=(top_similiar_sfh*)calloc(1,sizeof(top_similiar_sfh)); + temp_top_similiar_sfh->all_num=1; + temp_top_similiar_sfh->all_similiar=0; + temp_top_similiar_sfh->hash_len=get_hashed_len(sfh_str); + temp_top_similiar_sfh->sfh_str=strdup(sfh_str); + temp_top_similiar_sfh->sfh_link_items=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_top_similiar_sfh->sfh_link_items->sfh_str=strdup(sfh_str); + temp_top_similiar_sfh->sfh_link_items->similiar=0; + temp_top_similiar_sfh->sfh_link_items->all_similiar=0; + temp_top_similiar_sfh->sfh_link_items->next=NULL; + ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_top_similiar_sfh); + assert(ret>0); + } + else + { + temp_similiar=GIE_sfh_similiarity(temp_top_similiar_sfh->sfh_str,(int)strlen(temp_top_similiar_sfh->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_top_similiar_sfh->all_similiar+=temp_similiar; + temp_sfh_link=temp_top_similiar_sfh->sfh_link_items; + for(temp_all_similiar=0;;temp_sfh_link=temp_sfh_link->next) + { + temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str)); + temp_sfh_link->all_similiar+=temp_similiar; + temp_all_similiar+=temp_similiar; + if(temp_sfh_link->all_similiar>temp_top_similiar_sfh->all_similiar) + { + free(temp_top_similiar_sfh->sfh_str); + temp_top_similiar_sfh->sfh_str=strdup(temp_sfh_link->sfh_str); + temp_top_similiar_sfh->all_similiar=temp_sfh_link->all_similiar; + } + if(temp_sfh_link->next==NULL) + { + break; + } + } + temp_sfh_link->next=(sfh_link*)calloc(1,sizeof(sfh_link)); + temp_sfh_link->next->sfh_str=strdup(sfh_str); + temp_sfh_link->next->hash_len=get_hashed_len(sfh_str); + temp_sfh_link->next->similiar=0; + temp_sfh_link->next->all_similiar=temp_all_similiar; + temp_sfh_link->next->next=NULL; + temp_top_similiar_sfh->all_num+=1; + } + } + fclose(raw_file); + MESA_htable_iterate(htable,print_mistake_td,ripe_file); +}
\ No newline at end of file |
