diff options
| author | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
|---|---|---|
| committer | 陈冠林 <[email protected]> | 2019-06-18 10:44:20 +0800 |
| commit | b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch) | |
| tree | b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/vedio_id_build.c | |
| parent | b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff) | |
Diffstat (limited to 'src/dataset_build/vedio_id_build.c')
| -rw-r--r-- | src/dataset_build/vedio_id_build.c | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/src/dataset_build/vedio_id_build.c b/src/dataset_build/vedio_id_build.c new file mode 100644 index 0000000..9faaa64 --- /dev/null +++ b/src/dataset_build/vedio_id_build.c @@ -0,0 +1,171 @@ +/* +gcc -g vedio_id_build.c -o vedio_id_build -lmaatframe -I../../inc +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include "gram_index_engine.h" +#include <MESA/MESA_htable.h> +#include <assert.h> +#include <ctype.h> +#define BUFFER_LEN (10*1024) +#define SFH_PASS_RATE 0.9 +#define SFH_LEN (10*1024) +#define URL_LEN (10*1024) + +typedef struct video_id +{ + long id; + char *sfh; +}video_id; + +typedef struct cache +{ + GIE_digest_t ** GIE_cache; + long cache_size; + long len; +}cache; + +long get_hashed_len(const char* sfh) +{ + char *data=(char*)malloc(strlen(sfh)+1); + memcpy(data,sfh, strlen(sfh)); + data[strlen(sfh)]='\0'; + char *token=NULL,*sub_token=NULL,*saveptr; + long left_offset=0,right_offset=0,hashed_length=0; + int ret=0,first=0; + for (token = data; ;token= NULL) + { + sub_token= strtok_r(token,"[", &saveptr); + if (sub_token == NULL) + { + break; + } + if(first==0)//jump over the first sub string. + { + first=1; + continue; + } + ret=sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset); + if(ret!=2) + { + return 0; + } + assert(ret==2); + hashed_length+=right_offset-left_offset+1; + } + //printf("hashed length=%ld\n",hashed_length); + free(data); + return hashed_length/2; +} + +int main(int argc,char *argv[]) +{ + FILE *video_id_sets_file; + FILE *new_sfh_file; + const char *video_id_sets_file_dir="../../data/td_data_set/td_data_20171207/video_id_raw_data"; + const char *new_sfh_file_dir="../../data/ripe_data/td_data_20171207/video_id.txt"; + char *buffer=NULL; + int ret = 0,hashed_len = 0,total_len = 0,resultnum = 0,i = 0; + int update = 0,video_id = 0,j = 0; + int* temp_int = NULL; + float temp_sfh_pass = 0; + char *sfh_str,*url_str; + GIE_digest_t *sfh_video_id = NULL; + GIE_result_t *query_result = NULL; + cache *GIE_digest_cache = NULL; + video_id_sets_file = fopen(video_id_sets_file_dir,"r+"); + new_sfh_file = fopen(new_sfh_file_dir,"w"); + if(video_id_sets_file == NULL) + { + printf("open video_id_sets_file error\n"); + return -1; + } + if(new_sfh_file == NULL) + { + printf("open new_sfh_file error\n"); + return -1; + } + buffer = (char*)calloc(BUFFER_LEN,sizeof(char)); + GIE_create_para_t *query_para = (GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t)); + query_result = (GIE_result_t*)calloc(1,sizeof(GIE_result_t)); + GIE_handle_t *query_handle; + query_para->gram_value = 7; + query_para->position_accuracy = 5; + query_handle=GIE_create((const GIE_create_para_t *)query_para); + free(query_para); + if(query_handle==NULL) + { + printf("create GIE handle error\n"); + return -1; + } + sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t)); + sfh_str = (char*)calloc(SFH_LEN,sizeof(char)); + url_str = (char*)calloc(URL_LEN,sizeof(char)); + i=0; + GIE_digest_cache =(cache*)calloc(1,sizeof(cache)); + GIE_digest_cache->cache_size = 1000; + GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*)); + GIE_digest_cache->len = 0; + while(feof(video_id_sets_file)==0) + { + i++; + if(i%10000==0) + { + printf("%d\n",i); + } + fgets(buffer,BUFFER_LEN-1,video_id_sets_file); + ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\ + %*[^;];%*[^;];%*[^;];%[^;];%[^;]",sfh_str,url_str); + if(ret!=2) + { + continue; + } + hashed_len = get_hashed_len((const char*)sfh_str); + temp_sfh_pass = (float)hashed_len/total_len; + if(temp_sfh_pass<SFH_PASS_RATE) + { + continue; + } + resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,1); + if(resultnum == 0) + { + temp_int=(int*)calloc(1,sizeof(int)); + *temp_int=i; + sfh_video_id = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t)); + sfh_video_id->id=i; + sfh_video_id->sfh_length=strlen(sfh_str); + sfh_video_id->operation=GIE_INSERT_OPT; + sfh_video_id->cfds_lvl=5; + sfh_video_id->sfh=strdup(sfh_str); + sfh_video_id->tag=temp_int; + GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = sfh_video_id; + GIE_digest_cache->len++; + if(GIE_digest_cache->len==GIE_digest_cache->cache_size) + { + update=GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size); + GIE_digest_cache->len=0; + for(j=0;j<GIE_digest_cache->cache_size;j++) + { + free(GIE_digest_cache->GIE_cache[j]->sfh); + GIE_digest_cache->GIE_cache[j]->sfh=NULL; + free(GIE_digest_cache->GIE_cache[j]); + GIE_digest_cache->GIE_cache[j]=NULL; + } + } + fprintf(new_sfh_file,"%d,%s",i,buffer); + } + else + { + fprintf(new_sfh_file,"%d,%s",*((int*)query_result->tag),buffer); + } + } + free(buffer); + free(query_result); + free(sfh_video_id); + free(url_str); + free(sfh_str); + free(GIE_digest_cache); + return 0; +}
\ No newline at end of file |
