summary refs log tree commit diff
path: root/src/dataset_build/vedio_id_build.c
diff options
context:
space:
mode:
author 陈冠林 <[email protected]> 2019-06-18 10:44:20 +0800
committer 陈冠林 <[email protected]> 2019-06-18 10:44:20 +0800
commit b2a2f39d89b3bd154da10eb619f8a40c7c6b15d2 (patch)
tree b7a7d489030cfcc3b2fa878520d8c5d42dc5fce6 /src/dataset_build/vedio_id_build.c
parent b026525362d7f3b0ad58fb74362bf7f95ab515e8 (diff)
添加inc和src HEAD master
Diffstat (limited to 'src/dataset_build/vedio_id_build.c')
-rw-r--r-- src/dataset_build/vedio_id_build.c 171
1 files changed, 171 insertions, 0 deletions
diff --git a/src/dataset_build/vedio_id_build.c b/src/dataset_build/vedio_id_build.c
new file mode 100644
index 0000000..9faaa64
--- /dev/null
+++ b/src/dataset_build/vedio_id_build.c
@@ -0,0 +1,171 @@
+/*
+gcc -g vedio_id_build.c -o vedio_id_build -lmaatframe -I../../inc
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define BUFFER_LEN (10*1024) /* size of the line buffer main() allocates and passes to fgets */
+#define SFH_PASS_RATE 0.9 /* threshold main() compares hashed_len/total_len against; records below it are skipped */
+#define SFH_LEN (10*1024) /* allocation size of the sfh_str buffer in main() */
+#define URL_LEN (10*1024) /* allocation size of the url_str buffer in main() */
+
+/*
+ * Pairs a numeric id with a character pointer named sfh (presumably the SFH
+ * string that belongs to that id -- confirm against callers). The type is not
+ * referenced by the code visible in this file.
+ */
+struct video_id
+{
+    long id;    /* numeric identifier */
+    char *sfh;  /* character data for this id; ownership is not established here */
+};
+typedef struct video_id video_id;
+
+/*
+ * Batch of digests waiting to be handed to GIE_update(): main() appends to
+ * GIE_cache until len reaches cache_size, then flushes the batch.
+ */
+struct cache
+{
+    GIE_digest_t ** GIE_cache;  /* array of cache_size digest pointers */
+    long cache_size;            /* capacity of GIE_cache, in entries */
+    long len;                   /* number of entries currently filled */
+};
+typedef struct cache cache;
+
+/*
+ * Sums the byte ranges written as "[left:right" inside an SFH string and
+ * returns half of that sum.
+ *
+ * The text before the first '[' separator is skipped; every later
+ * '['-separated piece must begin with "<long>:<long>" and contributes
+ * right-left+1 to the total. Runs of '[' count as a single separator and a
+ * leading '[' is ignored, so for a string that starts with '[' the first
+ * bracketed piece is the one that gets skipped.
+ *
+ * sfh:     NUL-terminated SFH string; NULL is treated like an empty string.
+ * returns: total/2, or 0 when there is no range or any piece fails to parse.
+ *          NOTE(review): the halving presumably compensates for every range
+ *          being listed twice in an SFH -- confirm against the SFH format.
+ *
+ * The string is scanned in place, so nothing is allocated and there is
+ * nothing to leak on the parse-error return.
+ */
+long get_hashed_len(const char* sfh)
+{
+    const char *cursor;
+    long left_offset=0,right_offset=0,hashed_length=0;
+
+    if(sfh==NULL)
+    {
+        return 0;
+    }
+    /* Ignore leading separators, then jump over the first sub string. */
+    cursor=sfh+strspn(sfh,"[");
+    cursor+=strcspn(cursor,"[");
+    for(;;)
+    {
+        /* Step over the separator run in front of the next piece. */
+        cursor+=strspn(cursor,"[");
+        if(*cursor=='\0')
+        {
+            break;
+        }
+        /* sscanf stops at the first character that is not part of the
+           "left:right" pair, so the piece needs no NUL terminator. */
+        if(sscanf(cursor,"%ld:%ld",&left_offset,&right_offset)!=2)
+        {
+            return 0;
+        }
+        hashed_length+=right_offset-left_offset+1;
+        cursor+=strcspn(cursor,"[");
+    }
+    //printf("hashed length=%ld\n",hashed_length);
+    return hashed_length/2;
+}
+
+/*
+ * Frees every digest held in the cache together with its sfh copy and resets
+ * the fill level to 0. The tag pointers are left alone: main() dereferences
+ * GIE_result_t.tag after queries, so they are presumably still referenced by
+ * the engine -- TODO confirm ownership against the GIE API.
+ */
+static void release_cached_digests(cache *digest_cache)
+{
+    long k;
+
+    for(k=0;k<digest_cache->len;k++)
+    {
+        free(digest_cache->GIE_cache[k]->sfh);
+        free(digest_cache->GIE_cache[k]);
+        digest_cache->GIE_cache[k]=NULL;
+    }
+    digest_cache->len=0;
+}
+
+/*
+ * Reads ';'-separated records from the raw video-id data file, captures
+ * fields 20 and 21 of each record into sfh_str and url_str, and writes every
+ * accepted record to the output file prefixed with an id:
+ *   - GIE_query() returns 0: the 1-based record number becomes the id and the
+ *     SFH is queued for insertion into the index;
+ *   - otherwise the int behind the query result's tag is used as the id.
+ * Queued digests are handed to GIE_update() in batches of cache_size, so a
+ * digest presumably cannot be matched by GIE_query() before its batch has
+ * been flushed -- confirm whether that is intended.
+ * Records whose first 21 fields do not all parse (e.g. an empty field) are
+ * skipped.
+ *
+ * argc/argv are unused; both file paths are hard-coded.
+ * returns: 0 on success, -1 when a file cannot be opened or closed, the GIE
+ *          handle cannot be created, or an allocation fails.
+ */
+int main(int argc,char *argv[])
+{
+    const char *video_id_sets_file_dir="../../data/td_data_set/td_data_20171207/video_id_raw_data";
+    const char *new_sfh_file_dir="../../data/ripe_data/td_data_20171207/video_id.txt";
+    FILE *video_id_sets_file = NULL;
+    FILE *new_sfh_file = NULL;
+    char *buffer = NULL;
+    char *sfh_str = NULL;
+    char *url_str = NULL;
+    GIE_create_para_t query_para;
+    GIE_handle_t *query_handle = NULL;
+    GIE_result_t *query_result = NULL;
+    cache *GIE_digest_cache = NULL;
+    int ret = 0,total_len = 0,resultnum = 0,i = 0;
+    long hashed_len = 0;
+    float temp_sfh_pass = 0;
+    int exit_code = -1;
+
+    (void)argc;
+    (void)argv;
+
+    /* The input is only read, so plain "r" is enough. It is opened and
+       checked before the output so that a missing input does not truncate an
+       existing output file. */
+    video_id_sets_file = fopen(video_id_sets_file_dir,"r");
+    if(video_id_sets_file == NULL)
+    {
+        printf("open video_id_sets_file error\n");
+        goto cleanup;
+    }
+    new_sfh_file = fopen(new_sfh_file_dir,"w");
+    if(new_sfh_file == NULL)
+    {
+        printf("open new_sfh_file error\n");
+        goto cleanup;
+    }
+
+    buffer = (char*)calloc(BUFFER_LEN,sizeof(char));
+    sfh_str = (char*)calloc(SFH_LEN,sizeof(char));
+    url_str = (char*)calloc(URL_LEN,sizeof(char));
+    query_result = (GIE_result_t*)calloc(1,sizeof(GIE_result_t));
+    GIE_digest_cache = (cache*)calloc(1,sizeof(cache));
+    if(buffer == NULL || sfh_str == NULL || url_str == NULL
+        || query_result == NULL || GIE_digest_cache == NULL)
+    {
+        printf("alloc memory error\n");
+        goto cleanup;
+    }
+    GIE_digest_cache->cache_size = 1000;
+    GIE_digest_cache->len = 0;
+    GIE_digest_cache->GIE_cache = (GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*));
+    if(GIE_digest_cache->GIE_cache == NULL)
+    {
+        printf("alloc memory error\n");
+        goto cleanup;
+    }
+
+    /* All parameters other than the two set below stay zeroed. */
+    memset(&query_para,0,sizeof(query_para));
+    query_para.gram_value = 7;
+    query_para.position_accuracy = 5;
+    query_handle=GIE_create((const GIE_create_para_t *)&query_para);
+    if(query_handle==NULL)
+    {
+        printf("create GIE handle error\n");
+        goto cleanup;
+    }
+
+    /* Loop on the fgets result: testing feof() before reading would process
+       the last line a second time once the read finally fails. */
+    while(fgets(buffer,BUFFER_LEN,video_id_sets_file)!=NULL)
+    {
+        i++;
+        if(i%10000==0)
+        {
+            printf("%d\n",i);
+        }
+        /* Skip 19 fields, then capture fields 20 and 21. Every field fits in
+           its destination because a line never exceeds BUFFER_LEN, which is
+           equal to SFH_LEN and URL_LEN. */
+        ret=sscanf(buffer,"%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
+ %*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];%*[^;];\
+ %*[^;];%*[^;];%*[^;];%[^;];%[^;]",sfh_str,url_str);
+        if(ret!=2)
+        {
+            continue;
+        }
+        hashed_len = get_hashed_len((const char*)sfh_str);
+        /* NOTE(review): total_len is never assigned in this file, so the
+           pass-rate filter is effectively disabled. The guard only removes
+           the division by zero; total_len still has to be filled in from the
+           record for the filter to do anything. */
+        if(total_len>0)
+        {
+            temp_sfh_pass = (float)hashed_len/total_len;
+            if(temp_sfh_pass<SFH_PASS_RATE)
+            {
+                continue;
+            }
+        }
+        resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,1);
+        if(resultnum == 0)
+        {
+            GIE_digest_t *digest = (GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
+            int *record_tag = (int*)calloc(1,sizeof(int));
+            char *sfh_copy = strdup(sfh_str);
+
+            if(digest == NULL || record_tag == NULL || sfh_copy == NULL)
+            {
+                free(digest);
+                free(record_tag);
+                free(sfh_copy);
+                printf("alloc memory error\n");
+                goto cleanup;
+            }
+            *record_tag=i;
+            digest->id=i;
+            digest->sfh_length=strlen(sfh_str);
+            digest->operation=GIE_INSERT_OPT;
+            digest->cfds_lvl=5;
+            digest->sfh=sfh_copy;
+            digest->tag=record_tag;
+            GIE_digest_cache->GIE_cache[GIE_digest_cache->len] = digest;
+            GIE_digest_cache->len++;
+            if(GIE_digest_cache->len==GIE_digest_cache->cache_size)
+            {
+                /* NOTE(review): the GIE_update result was never inspected;
+                   its failure value is not visible here, so it stays
+                   ignored. */
+                (void)GIE_update(query_handle,GIE_digest_cache->GIE_cache,GIE_digest_cache->cache_size);
+                release_cached_digests(GIE_digest_cache);
+            }
+            fprintf(new_sfh_file,"%d,%s",i,buffer);
+        }
+        else if(query_result->tag != NULL)
+        {
+            fprintf(new_sfh_file,"%d,%s",*((int*)query_result->tag),buffer);
+        }
+    }
+    exit_code = 0;
+
+cleanup:
+    /* NOTE(review): query_handle is never destroyed because the matching GIE
+       teardown call is not visible in this file. */
+    if(GIE_digest_cache != NULL)
+    {
+        if(GIE_digest_cache->GIE_cache != NULL)
+        {
+            /* Digests of an unfinished batch are simply dropped: the index
+               is not queried again after the loop. */
+            release_cached_digests(GIE_digest_cache);
+            free(GIE_digest_cache->GIE_cache);
+        }
+        free(GIE_digest_cache);
+    }
+    free(query_result);
+    free(url_str);
+    free(sfh_str);
+    free(buffer);
+    if(new_sfh_file != NULL && fclose(new_sfh_file) != 0)
+    {
+        /* fclose flushes the buffered output, so a failure means lost data. */
+        printf("close new_sfh_file error\n");
+        exit_code = -1;
+    }
+    if(video_id_sets_file != NULL)
+    {
+        fclose(video_id_sets_file);
+    }
+    return exit_code;
+}
\ No newline at end of file