summaryrefslogtreecommitdiff
path: root/src/sfh_integrate/SFH_function.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/sfh_integrate/SFH_function.c')
-rw-r--r--src/sfh_integrate/SFH_function.c177
1 files changed, 177 insertions, 0 deletions
diff --git a/src/sfh_integrate/SFH_function.c b/src/sfh_integrate/SFH_function.c
new file mode 100644
index 0000000..a311f9c
--- /dev/null
+++ b/src/sfh_integrate/SFH_function.c
@@ -0,0 +1,177 @@
+/*
+Build: gcc -g SFH_function.c -o SFH_function -lmaatframe -lMESA_htable -I../include
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define SLOT_SIZE (1024*1024*16)
+#define THREAD_SAFE 0
+#define BUFFER_LEN (10*1024)
+#define SFH_LEN (10*1024)
+#define TD_LEN 33
+
+/*
+ * One node of the singly linked list of SFH strings collected for a key.
+ * main() creates a node for every record it reads and appends it to the
+ * list owned by the matching top_similiar_sfh entry.
+ */
+struct sfh_link
+{
+    char *sfh_str;            /* heap copy (strdup) of the record's SFH string */
+    int similiar;             /* written as 0 when the node is created */
+    int all_similiar;         /* sum of similarity scores accumulated for this node */
+    long hash_len;            /* value returned by get_hashed_len() for sfh_str */
+    struct sfh_link *next;    /* following node, NULL at the tail */
+};
+typedef struct sfh_link sfh_link;
+
+/*
+ * Hash-table value kept per key: the currently selected representative SFH
+ * string plus the list of every SFH string seen for that key.
+ */
+struct top_similiar_sfh
+{
+    int all_num;                 /* number of records stored for this key */
+    int all_similiar;            /* accumulated similarity of the representative */
+    char *sfh_str;               /* heap copy of the representative SFH string */
+    long hash_len;               /* get_hashed_len() of the first SFH stored for the key */
+    sfh_link *sfh_link_items;    /* head of the list of all SFH strings for the key */
+};
+typedef struct top_similiar_sfh top_similiar_sfh;
+
+/*
+ * Sums the byte ranges written as "[left:right" inside an SFH string and
+ * returns half of that total.
+ *
+ * The string is split on '[' exactly the way strtok_r(.., "[", ..) would
+ * split it: runs of '[' count as one separator and the first piece (the text
+ * in front of the first range) is ignored.  Every following piece must start
+ * with "<left>:<right>"; it contributes right-left+1 to the total.
+ *
+ * sfh:     NUL-terminated SFH string; NULL is treated as "no ranges".
+ * returns: total/2, or 0 when sfh is NULL, holds no range, or any piece after
+ *          the first does not start with two ':'-separated integers.
+ *
+ * The input is scanned in place, so nothing is allocated and nothing can
+ * leak on the early-return path.
+ */
+long get_hashed_len(const char* sfh)
+{
+    const char *cursor=NULL;
+    long left_offset=0,right_offset=0,hashed_length=0;
+
+    if(sfh==NULL)
+    {
+        return 0;
+    }
+    /* Jump over the first sub string (and any '[' in front of it). */
+    cursor=sfh+strspn(sfh,"[");
+    cursor+=strcspn(cursor,"[");
+    for(;;)
+    {
+        cursor+=strspn(cursor,"[");
+        if(*cursor=='\0')
+        {
+            break;
+        }
+        /* '[' can never be part of a number, so parsing the unterminated
+           piece gives the same result as parsing a NUL-terminated token. */
+        if(sscanf(cursor,"%ld:%ld",&left_offset,&right_offset)!=2)
+        {
+            return 0;
+        }
+        hashed_length+=right_offset-left_offset+1;
+        cursor+=strcspn(cursor,"[");
+    }
+    return hashed_length/2;
+}
+
+/*
+ * Callback handed to MESA_htable_iterate() by main(): writes one table entry
+ * to the output file.
+ *
+ * Output layout:
+ *   <key>,<representative sfh>
+ *   <similiar>;<sfh>;<hash_len>      one line per list node
+ *   (empty line)
+ *
+ * key:  key bytes of the entry; at most `size` bytes are printed, stopping
+ *       early at a NUL byte.
+ * size: key length in bytes.
+ * data: the stored value, a top_similiar_sfh*.
+ * arg:  the FILE* to write to.
+ *
+ * Nothing is written when key, data or arg is NULL.
+ */
+void print_mistake_td(const uchar *key,uint size,void *data,void *arg)
+{
+    FILE *ripe_file=(FILE*)arg;
+    const top_similiar_sfh *entry=(const top_similiar_sfh*)data;
+    const sfh_link *link=NULL;
+
+    if(key==NULL||entry==NULL||ripe_file==NULL)
+    {
+        return;
+    }
+    fprintf(ripe_file,"%.*s,%s\n",(int)size,(const char*)key,entry->sfh_str);
+    for(link=entry->sfh_link_items;link!=NULL;link=link->next)
+    {
+        /* hash_len is a long, so it needs %ld; %d is undefined behaviour. */
+        fprintf(ripe_file,"%d;%s;%ld\n",link->similiar,link->sfh_str,link->hash_len);
+    }
+    fprintf(ripe_file,"\n");
+}
+/*
+ * Allocates a list node holding a private copy of sfh.
+ * Returns NULL when memory cannot be allocated.
+ */
+static sfh_link *new_sfh_link(const char *sfh,long hash_len,int all_similiar)
+{
+    sfh_link *node=(sfh_link*)calloc(1,sizeof(sfh_link));
+    if(node==NULL)
+    {
+        return NULL;
+    }
+    node->sfh_str=strdup(sfh);
+    if(node->sfh_str==NULL)
+    {
+        free(node);
+        return NULL;
+    }
+    node->similiar=0;
+    node->all_similiar=all_similiar;
+    node->hash_len=hash_len;
+    node->next=NULL;
+    return node;
+}
+
+/*
+ * Reads "<field>;<td>;<sfh>" records from the raw data file, groups the SFH
+ * strings by their td key in a MESA hash table, keeps for every key the SFH
+ * whose accumulated similarity to the others is highest, and finally dumps
+ * every table entry to the output file through print_mistake_td().
+ *
+ * Returns 0 on success and -1 when a file cannot be opened, memory runs out,
+ * the hash table cannot be created, or the output file cannot be closed.
+ */
+int main()
+{
+    FILE *raw_file=NULL;
+    FILE *ripe_file=NULL;
+    char *raw_file_dir="../data/td_data_set/td_data_20171207/video_id_raw_data";
+    char *ripe_file_dir="../data/ripe_data/td_data_20171207/all_av_digest_mistake_level_2";
+    char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL,*best_copy=NULL;
+    long temp_hash_len=0;
+    unsigned int slot_size=SLOT_SIZE;
+    int i=0,thread_safe=THREAD_SAFE,ret=0,temp_similiar=0,temp_all_similiar=0;
+    int sfh_len=0,exit_code=-1;
+    top_similiar_sfh *temp_top_similiar_sfh=NULL;
+    sfh_link *temp_sfh_link=NULL,*new_link=NULL;
+    MESA_htable_handle htable=NULL;
+
+    /* The input is only read, so "r" is enough.  It is opened and checked
+       first so that a missing input does not truncate the output file. */
+    raw_file = fopen(raw_file_dir,"r");
+    if(raw_file==NULL)
+    {
+        printf("open all_av_digest error\n");
+        return -1;
+    }
+    ripe_file = fopen(ripe_file_dir,"w+");
+    if(ripe_file==NULL)
+    {
+        printf("open all_av_digest_mistake_level error");
+        fclose(raw_file);
+        return -1;
+    }
+
+    buffer = (char*)calloc(BUFFER_LEN,sizeof(char));
+    td_str = (char*)calloc(BUFFER_LEN,sizeof(char));
+    sfh_str = (char*)calloc(SFH_LEN,sizeof(char));
+    td = (char*)calloc(TD_LEN,sizeof(char));
+    if(buffer==NULL||td_str==NULL||sfh_str==NULL||td==NULL)
+    {
+        fprintf(stderr,"out of memory\n");
+        goto cleanup;
+    }
+
+    htable=MESA_htable_born();
+    if(htable==NULL)
+    {
+        fprintf(stderr,"create hash table error\n");
+        goto cleanup;
+    }
+    /* NOTE(review): the THREAD_SAFE value is passed to the
+       MHO_SCREEN_PRINT_CTRL option - confirm this is the intended option. */
+    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
+    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int));
+    MESA_htable_mature(htable);
+
+    /* Loop on fgets() itself: testing feof() before reading would process
+       the last line a second time. */
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        i++;
+        if(i%10000==0)
+        {
+            printf("%d\n",i);
+        }
+        /* The whole TD_LEN-byte buffer is the hash key, so clear leftovers
+           of a previous, longer value before parsing. */
+        memset(td,0,TD_LEN);
+        /* Field widths are the destination sizes minus one:
+           BUFFER_LEN-1, TD_LEN-1 and SFH_LEN-1.  The trailing newline kept
+           by fgets() stays part of the third field, as before. */
+        ret=sscanf(buffer,"%10239[^;];%32[^;];%10239[^;]",td_str,td,sfh_str);
+        if(ret!=3)
+        {
+            fprintf(stderr,"skip malformed record %d\n",i);
+            continue;
+        }
+        sfh_len=(int)strlen(sfh_str);
+        if((temp_top_similiar_sfh=MESA_htable_search(htable,td,TD_LEN))==NULL)
+        {
+            /* First record for this key: it becomes the representative. */
+            temp_hash_len=get_hashed_len(sfh_str);
+            temp_top_similiar_sfh=(top_similiar_sfh*)calloc(1,sizeof(top_similiar_sfh));
+            if(temp_top_similiar_sfh==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            temp_top_similiar_sfh->all_num=1;
+            temp_top_similiar_sfh->all_similiar=0;
+            temp_top_similiar_sfh->hash_len=temp_hash_len;
+            temp_top_similiar_sfh->sfh_str=strdup(sfh_str);
+            /* The first node gets its hash_len too, like every later node. */
+            temp_top_similiar_sfh->sfh_link_items=new_sfh_link(sfh_str,temp_hash_len,0);
+            if(temp_top_similiar_sfh->sfh_str==NULL||temp_top_similiar_sfh->sfh_link_items==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_top_similiar_sfh);
+            assert(ret>0);
+        }
+        else
+        {
+            temp_similiar=GIE_sfh_similiarity(temp_top_similiar_sfh->sfh_str,(int)strlen(temp_top_similiar_sfh->sfh_str),sfh_str,sfh_len);
+            temp_top_similiar_sfh->all_similiar+=temp_similiar;
+            temp_all_similiar=0;
+            /* Score the new SFH against every stored one; the list is never
+               empty here, so the loop ends on the tail node. */
+            for(temp_sfh_link=temp_top_similiar_sfh->sfh_link_items;;temp_sfh_link=temp_sfh_link->next)
+            {
+                temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,sfh_len);
+                temp_sfh_link->all_similiar+=temp_similiar;
+                temp_all_similiar+=temp_similiar;
+                if(temp_sfh_link->all_similiar>temp_top_similiar_sfh->all_similiar)
+                {
+                    /* Copy first, so the old representative survives a
+                       failed allocation. */
+                    best_copy=strdup(temp_sfh_link->sfh_str);
+                    if(best_copy==NULL)
+                    {
+                        fprintf(stderr,"out of memory\n");
+                        goto cleanup;
+                    }
+                    free(temp_top_similiar_sfh->sfh_str);
+                    temp_top_similiar_sfh->sfh_str=best_copy;
+                    temp_top_similiar_sfh->all_similiar=temp_sfh_link->all_similiar;
+                }
+                if(temp_sfh_link->next==NULL)
+                {
+                    break;
+                }
+            }
+            new_link=new_sfh_link(sfh_str,get_hashed_len(sfh_str),temp_all_similiar);
+            if(new_link==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            temp_sfh_link->next=new_link;
+            temp_top_similiar_sfh->all_num+=1;
+        }
+    }
+    MESA_htable_iterate(htable,print_mistake_td,ripe_file);
+    exit_code=0;
+
+cleanup:
+    free(buffer);
+    free(td_str);
+    free(sfh_str);
+    free(td);
+    fclose(raw_file);
+    /* fclose() flushes the report; a failure means it may be incomplete. */
+    if(fclose(ripe_file)!=0)
+    {
+        exit_code=-1;
+    }
+    return exit_code;
+}
\ No newline at end of file