summaryrefslogtreecommitdiff
path: root/src/get_td_mistake_lost
diff options
context:
space:
mode:
Diffstat (limited to 'src/get_td_mistake_lost')
-rw-r--r-- src/get_td_mistake_lost/CMakeLists.txt | 11
-rw-r--r-- src/get_td_mistake_lost/file_digest.conf | 6
-rw-r--r-- src/get_td_mistake_lost/file_digest.py | 104
-rw-r--r-- src/get_td_mistake_lost/get_TD_SFH.c | 162
-rw-r--r-- src/get_td_mistake_lost/get_lost_rate.c | 210
-rw-r--r-- src/get_td_mistake_lost/get_mistake_level.c | 366
-rw-r--r-- src/get_td_mistake_lost/get_td_mistake_lost.sh | 5
-rw-r--r-- src/get_td_mistake_lost/gram_index_engine.c | 1354
-rw-r--r-- src/get_td_mistake_lost/new_TD.conf | 3
-rw-r--r-- src/get_td_mistake_lost/new_TD.py | 34
10 files changed, 2255 insertions, 0 deletions
diff --git a/src/get_td_mistake_lost/CMakeLists.txt b/src/get_td_mistake_lost/CMakeLists.txt
new file mode 100644
index 0000000..87f4b6b
--- /dev/null
+++ b/src/get_td_mistake_lost/CMakeLists.txt
@@ -0,0 +1,11 @@
+PROJECT (CALCULATE)
+SET (SRC_LIST get_lost_rate.c)
+SET(CMAKE_BUILD_TYPE "Debug")
+# Every source in this target is C, so the per-configuration flags have to be
+# set in the CMAKE_C_FLAGS_* variables; the CXX variants below only affect
+# C++ sources and are kept in case any are added later.
+SET(CMAKE_C_FLAGS_DEBUG "$ENV{CFLAGS} -O0 -Wall -g -ggdb")
+SET(CMAKE_C_FLAGS_RELEASE "$ENV{CFLAGS} -O3 -Wall")
+SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g -ggdb")
+SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
+MESSAGE(STATUS "This is BINARY dir" ${CALCULATE_BINARY_DIR})
+MESSAGE(STATUS "This is SOURCE dir" ${CALCULATE_SOURCE_DIR})
+#INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../include/)
+#LINK_DIRECTORIES(${PROJECT_SOURCE_DIR}/../../lib/)
+ADD_EXECUTABLE(get_lost_rate ${SRC_LIST} gram_index_engine.c)
+TARGET_LINK_LIBRARIES(get_lost_rate maatframe libMESA_htable.so pthread m)
diff --git a/src/get_td_mistake_lost/file_digest.conf b/src/get_td_mistake_lost/file_digest.conf
new file mode 100644
index 0000000..6d1c06b
--- /dev/null
+++ b/src/get_td_mistake_lost/file_digest.conf
@@ -0,0 +1,6 @@
+[file_digest]
+ripe_files_address = ../../data/ripe_data/td_data_20171207/all_av_digest
+raw_file_address = ../../data/td_data_20171207/td_data/all_av_digest
+[new_td]
+ripe_files_address = ../../data/ripe_data/td_data_20171207/new_TD.txt
+raw_file_address = ../../data/ripe_data/td_data_20171207/all_av_digest
diff --git a/src/get_td_mistake_lost/file_digest.py b/src/get_td_mistake_lost/file_digest.py
new file mode 100644
index 0000000..62786ef
--- /dev/null
+++ b/src/get_td_mistake_lost/file_digest.py
@@ -0,0 +1,104 @@
+#-*-coding:utf-8-*-
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
+class data_line(object):
+ """docstring for ClassName"""
+ def __init__(self):
+ super(ClassName, self).__init__()
+
+ @staticmethod
+ def if_error(data_line_str):
+ data_line_val = re.split(r';',data_line_str)
+ hashed_len = sfh_fingerprint.get_hashed_len(data_line_val[19])
+ if(term['data_num'](data_line_val) and \
+ term['not_null'](data_line_val[0]) and \
+ term['not_null'](data_line_val[1]) and \
+ term['not_null'](data_line_val[2]) and \
+ term['ysp_len'](data_line_val[3]) and \
+ term['not_null'](data_line_val[4]) and \
+ term['not_null'](data_line_val[5]) and \
+ term['td_len'](data_line_val[6]) and \
+ term['td_len'](data_line_val[7]) and \
+ term['td_len'](data_line_val[8]) and \
+ term['td_len'](data_line_val[9]) and \
+ term['td_len'](data_line_val[10]) and \
+ term['td_len'](data_line_val[11]) and \
+ term['td_len'](data_line_val[12]) and \
+ term['td_len'](data_line_val[13]) and \
+ term['td_len'](data_line_val[14]) and \
+ term['td_len'](data_line_val[15]) and \
+ term['td_len'](data_line_val[16]) and \
+ term['td_len'](data_line_val[17]) and \
+ term['not_null'](data_line_val[18]) and \
+ term['sfh_len'](data_line_val[19]) and \
+ term['not_null'](data_line_val[20]) and \
+ hashed_len/float(data_line_val[3])>0.999):
+ return data_line_val
+ else:
+ return -1
+
+class TD_fingerprint(object):
+ def __init__():
+ self.td = td
+ self.td_string = td_string
+ @staticmethod
+ def td_generate(td_string):
+ td_val = hashlib.md5(td_string,encode('utf-8')).hexdigest()
+
+class sfh_fingerprint(object):
+
+ def __init__(self,sfh):
+ self.sfh = sfh
+
+ @staticmethod
+ def get_hashed_len(sfh):
+ p = r"\[+\d+?:+\d+?\]"
+ pattern = re.compile(p)
+ hashed_len_set = pattern.findall(sfh)
+ if (term['not_null'](hashed_len_set)):
+ hashed_len = 0
+ for x in xrange(0,len(hashed_len_set)):
+ hashed_len_num = re.split(r"\[|\]|:",hashed_len_set[x])
+ hashed_len = hashed_len + int(hashed_len_num[2]) - int(hashed_len_num[1])
+ return hashed_len/len(hashed_len_set)
+ else :
+ return -1
+
+term = {'td_len':(lambda x : len(x)==32),
+ 'data_num':(lambda x : len(x)==21),
+ 'url':(lambda x : x.find['NUll']),
+ 'sfh_len':(lambda x : len(x)>20 and len(x)<(10*1024-100)),
+ 'not_null':(lambda x : len(x)!=0),
+ 'ysp_len':(lambda x : int(x)!=0),
+ 'line_len':(lambda x: len(x)>20 and len(x)<(10*1024-100))}
+
+c_func="./"
+ripe_files=[]
+config = ConfigParser.RawConfigParser()
+config.read("file_digest.conf")
+raw_file_address=config.get("file_digest","raw_file_address")
+ripe_files_address=config.get("file_digest","ripe_files_address")
+print ("%s %s" %(raw_file_address,ripe_files_address))
+# num = [0,0,0,0,0,0,0]
+# breakpoints = [int(i) for i in config.get("output","breakpoints").split(",")]
+# i=0
+# for i in xrange(0,ripe_file_num):
+# outfile=open(ripe_files_address+str(i)+'.txt','w')
+# ripe_files.append(outfile)
+
+i=0
+with open(raw_file_address,'r') as infile:
+ with open(ripe_files_address,'w')as outfile:
+ for line in infile:
+ i+=1
+ if(i%10000==0):
+ print i
+ line_return = data_line.if_error(line)
+ if(line_return != -1):
+ outfile.write(str(line)) \ No newline at end of file
diff --git a/src/get_td_mistake_lost/get_TD_SFH.c b/src/get_td_mistake_lost/get_TD_SFH.c
new file mode 100644
index 0000000..2ed3ecd
--- /dev/null
+++ b/src/get_td_mistake_lost/get_TD_SFH.c
@@ -0,0 +1,162 @@
+/*
+gcc -g get_TD_SFH.c -o get_TD_SFH -lmaatframe -lMESA_htable -I../include
+./get_TD_SFH   (takes no arguments; the input and output paths are hard-coded in main)
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define BUFFER_LEN (15*1024)
+#define SFH_LEN (10*1024)
+#define TD_LEN 33
+#define THREAD_SAFE 0
+#define SLOT_SIZE (1024*1024*16)
+#define TD_STR_LEN (10*1024)
+#define TIME_STR_LEN 128
+
+typedef struct sfh_link /* one input record; records sharing a TD value are chained through next */
+{
+ // char *time_str;
+ char *sfh_str; /* heap copy (strdup) of the record's SFH string */
+ char *td_ori; /* heap copy (strdup) of the record's first field (td_str) */
+ // char *md5_32k;
+ int similiar; /* initialised to 0 and not otherwise written in this file */
+ int all_similiar; /* running sum of GIE_sfh_similiarity() scores against the other records of the same TD */
+ // long hash_len;
+ struct sfh_link *next; /* next record with the same TD, NULL at the tail */
+}sfh_link;
+
+typedef struct sfh /* hash-table value stored per TD key */
+{
+ int all_num; /* number of records seen for this TD */
+ int all_similiar; /* accumulated similarity score of the current representative SFH */
+ char *sfh_str; /* representative SFH: replaced by a record's SFH whenever that record's all_similiar exceeds this entry's */
+ // long hash_len;
+ sfh_link *sfh_link_items; /* head of the list of all records for this TD */
+}sfh;
+
+/*
+ * Callback handed to MESA_htable_iterate(): writes "key;td_ori;sfh" for one
+ * table entry to the FILE passed through arg, using the td_ori of the first
+ * list item. No newline is added after the SFH string.
+ */
+void print_td_sfh(const uchar *key,uint size,void *data,void *arg)
+{
+    sfh *entry=(sfh*)data;
+    FILE *out=(FILE*)arg;
+    const char *first_td_ori=entry->sfh_link_items->td_ori;
+
+    fprintf(out,"%s;%s;%s",key,first_td_ori,entry->sfh_str);
+}
+
+/* Allocate a list node holding private copies of one record's SFH and TD source string. */
+static sfh_link *new_sfh_link(const char *sfh_str,const char *td_str,int all_similiar)
+{
+    sfh_link *item=(sfh_link*)calloc(1,sizeof(sfh_link));
+    if(item==NULL)
+    {
+        return NULL;
+    }
+    item->sfh_str=strdup(sfh_str);
+    item->td_ori=strdup(td_str);
+    item->all_similiar=all_similiar;
+    /* similiar and next keep the 0/NULL values calloc gave them. */
+    if(item->sfh_str==NULL||item->td_ori==NULL)
+    {
+        free(item->sfh_str);
+        free(item->td_ori);
+        free(item);
+        return NULL;
+    }
+    return item;
+}
+
+/*
+ * Reads "td_str;td;sfh" records from new_TD.txt, groups them by td in a hash
+ * table, keeps for every td the SFH with the highest accumulated similarity
+ * to the other records of that td, and writes one "td;td_ori;sfh" entry per
+ * td to the output file.
+ * Returns 0 on success and -1 when a file cannot be opened or memory runs out.
+ */
+int main()
+{
+    const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt";
+    const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/TD_SFH_3";
+    FILE *raw_file=NULL;
+    FILE *ripe_file=NULL;
+    MESA_htable_handle htable=NULL;
+    char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL;
+    char line_fmt[64];
+    int i=0,thread_safe=THREAD_SAFE,ret=0,temp_similiar=0,temp_all_similiar=0;
+    int status=-1;
+    unsigned int slot_size=SLOT_SIZE;
+    sfh *temp_sfh=NULL;
+    sfh_link *temp_sfh_link=NULL;
+
+    /* The input is opened first so that a missing input does not truncate an existing output. */
+    raw_file=fopen(raw_file_dir,"r");
+    if(raw_file==NULL)
+    {
+        printf("open all_av_digest error\n");
+        return -1;
+    }
+    ripe_file=fopen(ripe_file_dir,"w+");
+    if(ripe_file==NULL)
+    {
+        printf("open all_av_digest_mistake_level error");
+        fclose(raw_file);
+        return -1;
+    }
+    buffer=(char*)calloc(BUFFER_LEN,sizeof(char));
+    sfh_str=(char*)calloc(SFH_LEN,sizeof(char));
+    td=(char*)calloc(TD_LEN,sizeof(char));
+    td_str=(char*)calloc(TD_STR_LEN,sizeof(char));
+    if(buffer==NULL||sfh_str==NULL||td==NULL||td_str==NULL)
+    {
+        fprintf(stderr,"out of memory\n");
+        goto cleanup;
+    }
+    /* Field widths keep sscanf from writing past the destination buffers. */
+    snprintf(line_fmt,sizeof(line_fmt),"%%%d[^;];%%%d[^;];%%%d[^;]",TD_STR_LEN-1,TD_LEN-1,SFH_LEN-1);
+
+    htable=MESA_htable_born();
+    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
+    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int));
+    MESA_htable_mature(htable);
+
+    /* Looping on fgets() rather than feof() avoids handling the last record twice. */
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        i++;
+        if(i%100000==0)
+        {
+            printf("%d\n",i);
+        }
+        /* The whole TD_LEN buffer is the hash key, so stale bytes must be cleared. */
+        memset(td,0,TD_LEN);
+        ret=sscanf(buffer,line_fmt,td_str,td,sfh_str);
+        if(ret!=3)
+        {
+            fprintf(stderr,"skip malformed line %d\n",i);
+            continue;
+        }
+        temp_sfh=MESA_htable_search(htable,td,TD_LEN);
+        if(temp_sfh==NULL)
+        {
+            /* First record for this TD: it becomes the representative SFH. */
+            temp_sfh=(sfh*)calloc(1,sizeof(sfh));
+            if(temp_sfh==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            temp_sfh->all_num=1;
+            temp_sfh->all_similiar=0;
+            temp_sfh->sfh_str=strdup(sfh_str);
+            temp_sfh->sfh_link_items=new_sfh_link(sfh_str,td_str,0);
+            if(temp_sfh->sfh_str==NULL||temp_sfh->sfh_link_items==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_sfh);
+            assert(ret>0);
+        }
+        else
+        {
+            temp_similiar=GIE_sfh_similiarity(temp_sfh->sfh_str,(int)strlen(temp_sfh->sfh_str),sfh_str,(int)strlen(sfh_str));
+            temp_sfh->all_similiar+=temp_similiar;
+            temp_all_similiar=0;
+            temp_sfh_link=temp_sfh->sfh_link_items;
+            /* Score the new record against every stored one; the loop ends on the tail node. */
+            for(;;)
+            {
+                temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str));
+                temp_sfh_link->all_similiar+=temp_similiar;
+                temp_all_similiar+=temp_similiar;
+                if(temp_sfh_link->all_similiar>temp_sfh->all_similiar)
+                {
+                    /* This stored record is now the best representative for the TD. */
+                    free(temp_sfh->sfh_str);
+                    temp_sfh->sfh_str=strdup(temp_sfh_link->sfh_str);
+                    temp_sfh->all_similiar=temp_sfh_link->all_similiar;
+                }
+                if(temp_sfh_link->next==NULL)
+                {
+                    break;
+                }
+                temp_sfh_link=temp_sfh_link->next;
+            }
+            temp_sfh_link->next=new_sfh_link(sfh_str,td_str,temp_all_similiar);
+            if(temp_sfh_link->next==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            temp_sfh->all_num+=1;
+        }
+    }
+    MESA_htable_iterate(htable,print_td_sfh,ripe_file);
+    status=0;
+
+cleanup:
+    fclose(raw_file);
+    free(buffer);
+    free(sfh_str);
+    free(td);
+    free(td_str);
+    if(htable!=NULL)
+    {
+        MESA_htable_destroy(htable,NULL);
+    }
+    fclose(ripe_file);
+    return status;
+}
\ No newline at end of file
diff --git a/src/get_td_mistake_lost/get_lost_rate.c b/src/get_td_mistake_lost/get_lost_rate.c
new file mode 100644
index 0000000..d983a00
--- /dev/null
+++ b/src/get_td_mistake_lost/get_lost_rate.c
@@ -0,0 +1,210 @@
+/*
+gcc -g get_lost_rate.c -o get_lost_rate -lmaatframe -I../include
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <assert.h>
+#include <ctype.h>
+#define BUFFER_LEN (10*1024)
+#define CACHE_SIZE 2000000
+#define SFH_LEN (10*1024)
+#define TD_LEN 33
+#define RESULT_NUM 10000
+#define TIME_STR_LEN 128
+#define TD_STR_LEN (10*1024)
+
+typedef struct cache /* batch of digests waiting to be passed to GIE_update() */
+{
+ GIE_digest_t ** GIE_cache; /* array of cache_size digest pointers */
+ long cache_size; /* capacity of GIE_cache (set to CACHE_SIZE in main) */
+ long len; /* number of slots currently filled */
+}cache;
+
+typedef struct GIE_tag /* payload attached to each indexed digest and read back from query results */
+{
+ char *td; /* heap copy of the record's td field */
+ char *td_str; /* heap copy of the record's td_str field */
+ char *sfh_str; /* heap copy of the record's SFH string */
+}GIE_tag;
+
+/* Pass every cached digest to GIE_update(), then release the cache's own copies. */
+static void flush_digest_cache(GIE_handle_t *handle,cache *digest_cache)
+{
+    long j=0;
+    GIE_update(handle,digest_cache->GIE_cache,digest_cache->len);
+    for(j=0;j<digest_cache->len;j++)
+    {
+        /* The tag is not freed here: query results hand the same pointer back. */
+        free(digest_cache->GIE_cache[j]->sfh);
+        free(digest_cache->GIE_cache[j]);
+        digest_cache->GIE_cache[j]=NULL;
+    }
+    digest_cache->len=0;
+}
+
+/*
+ * Indexes every "td;td_str;sfh" record of the TD_SFH file, then queries the
+ * index with each "td_str;td;sfh" record of new_TD.txt. A record counts as
+ * lost when the query returns more than one hit and at least one hit carries
+ * a different td; each such hit is written to the output file. Prints
+ * "lost;records" at the end.
+ * Returns 0 on success and -1 on any setup failure.
+ */
+int main()
+{
+    const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt";
+    /* NOTE(review): get_TD_SFH.c writes TD_SFH_3 while this reads TD_SFH_1 - confirm which file is intended. */
+    const char *td_sfh_file_dir="../../data/ripe_data/td_data_20171207/TD_SFH_1";
+    const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/get_lost_ripe_data_1";
+    FILE *td_sfh_file=NULL;
+    FILE *raw_file=NULL;
+    FILE *ripe_file=NULL;
+    char *buffer=NULL,*sfh_str=NULL,*td=NULL,*td_str=NULL;
+    char index_fmt[64];
+    char raw_fmt[64];
+    GIE_create_para_t *query_para=NULL;
+    GIE_handle_t *query_handle=NULL;
+    GIE_result_t *query_result=NULL;
+    cache *GIE_digest_cache=NULL;
+    GIE_digest_t *sfh_td=NULL;
+    GIE_tag *temp_tag=NULL;
+    GIE_tag *hit_tag=NULL;
+    int i=0,w=0,ret=0,lost=0,j=0,resultnum=0,temp_len=0;
+    int status=-1;
+
+    td_sfh_file=fopen(td_sfh_file_dir,"r");
+    if(td_sfh_file==NULL)
+    {
+        printf("open td_sfh_file_dir error\n");
+        goto cleanup;
+    }
+    raw_file=fopen(raw_file_dir,"r");
+    if(raw_file==NULL)
+    {
+        printf("open raw_file_dir error\n");
+        goto cleanup;
+    }
+    ripe_file=fopen(ripe_file_dir,"w+");
+    if(ripe_file==NULL)
+    {
+        printf("open ripe_file_dir error\n");
+        goto cleanup;
+    }
+
+    sfh_str=(char*)calloc(SFH_LEN,sizeof(char));
+    td=(char*)calloc(TD_LEN,sizeof(char));
+    buffer=(char*)calloc(BUFFER_LEN,sizeof(char));
+    td_str=(char*)calloc(TD_STR_LEN,sizeof(char));
+    query_result=(GIE_result_t*)calloc(RESULT_NUM,sizeof(GIE_result_t));
+    GIE_digest_cache=(cache*)calloc(1,sizeof(cache));
+    query_para=(GIE_create_para_t*)calloc(1,sizeof(GIE_create_para_t));
+    if(sfh_str==NULL||td==NULL||buffer==NULL||td_str==NULL||query_result==NULL||GIE_digest_cache==NULL||query_para==NULL)
+    {
+        fprintf(stderr,"out of memory\n");
+        goto cleanup;
+    }
+    GIE_digest_cache->cache_size=CACHE_SIZE;
+    GIE_digest_cache->len=0;
+    GIE_digest_cache->GIE_cache=(GIE_digest_t**)calloc(GIE_digest_cache->cache_size,sizeof(GIE_digest_t*));
+    if(GIE_digest_cache->GIE_cache==NULL)
+    {
+        fprintf(stderr,"out of memory\n");
+        goto cleanup;
+    }
+    /* Field widths keep sscanf from writing past the destination buffers. */
+    snprintf(index_fmt,sizeof(index_fmt),"%%%d[^;];%%%d[^;];%%%d[^;]",TD_LEN-1,TD_STR_LEN-1,SFH_LEN-1);
+    snprintf(raw_fmt,sizeof(raw_fmt),"%%%d[^;];%%%d[^;];%%%d[^;]",TD_STR_LEN-1,TD_LEN-1,SFH_LEN-1);
+
+    query_para->gram_value=7;
+    query_para->position_accuracy=5;
+    query_para->ED_reexamine=1;
+    query_para->format=GIE_INPUT_FORMAT_SFH;
+    query_handle=GIE_create((const GIE_create_para_t *)query_para);
+    if(query_handle==NULL)
+    {
+        printf("create GIE handle error\n");
+        goto cleanup;
+    }
+
+    /* Phase 1: load the TD -> SFH table into the index. */
+    /* Looping on fgets() rather than feof() avoids indexing the last record twice. */
+    while(fgets(buffer,BUFFER_LEN,td_sfh_file)!=NULL)
+    {
+        i++;
+        if(i%100000==0)
+        {
+            printf("%d\n",i);
+        }
+        ret=sscanf(buffer,index_fmt,td,td_str,sfh_str);
+        if(ret!=3)
+        {
+            fprintf(stderr,"skip malformed index line %d\n",i);
+            continue;
+        }
+        sfh_td=(GIE_digest_t*)calloc(1,sizeof(GIE_digest_t));
+        temp_tag=(GIE_tag*)calloc(1,sizeof(GIE_tag));
+        if(sfh_td==NULL||temp_tag==NULL)
+        {
+            free(sfh_td);
+            free(temp_tag);
+            fprintf(stderr,"out of memory\n");
+            goto cleanup;
+        }
+        /* A successful %[^;] conversion stores at least one character, so temp_len >= 1. */
+        temp_len=(int)strlen(sfh_str);
+        sfh_td->id=i;
+        /* NOTE(review): as before, the length is taken before the line terminator is removed - confirm GIE_update expects that. */
+        sfh_td->sfh_length=temp_len;
+        /* Only a real line terminator is dropped; a final line without one keeps its last character. */
+        if(sfh_str[temp_len-1]=='\n')
+        {
+            sfh_str[temp_len-1]='\0';
+        }
+        sfh_td->operation=GIE_INSERT_OPT;
+        sfh_td->cfds_lvl=5;
+        sfh_td->sfh=strdup(sfh_str);
+        temp_tag->td=strdup(td);
+        temp_tag->td_str=strdup(td_str);
+        temp_tag->sfh_str=strdup(sfh_str);
+        sfh_td->tag=(void*)temp_tag;
+        /* Flush a full batch instead of writing past the end of the cache array. */
+        if(GIE_digest_cache->len==GIE_digest_cache->cache_size)
+        {
+            flush_digest_cache(query_handle,GIE_digest_cache);
+        }
+        GIE_digest_cache->GIE_cache[GIE_digest_cache->len]=sfh_td;
+        GIE_digest_cache->len++;
+    }
+    flush_digest_cache(query_handle,GIE_digest_cache);
+
+    /* Phase 2: query the index with every raw record. */
+    i=0;
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        i++;
+        if(i%100000==0)
+        {
+            printf("%d\n",i);
+        }
+        ret=sscanf(buffer,raw_fmt,td_str,td,sfh_str);
+        if(ret!=3)
+        {
+            fprintf(stderr,"skip malformed raw line %d\n",i);
+            continue;
+        }
+        resultnum=GIE_query(query_handle,(const char *)sfh_str,(const long long)strlen(sfh_str),query_result,RESULT_NUM);
+        if(resultnum>1)
+        {
+            w=0;
+            for(j=0;j<resultnum;j++)
+            {
+                hit_tag=(GIE_tag*)(query_result+j)->tag;
+                if(strcmp(hit_tag->td,td)!=0)
+                {
+                    w=1;
+                    fprintf(ripe_file,"%u,%s,%s,%s,%s,%s,%s\n",(query_result+j)->id,hit_tag->td_str,hit_tag->td,hit_tag->sfh_str,td_str,td,sfh_str);
+                }
+            }
+            /* A record is counted once, however many of its hits disagree. */
+            lost+=w;
+        }
+    }
+    printf("%d;%d\n",lost,i);
+    status=0;
+
+cleanup:
+    /* NOTE(review): query_handle is not released; no destroy function is visible from this file. */
+    if(td_sfh_file!=NULL)
+    {
+        fclose(td_sfh_file);
+    }
+    if(raw_file!=NULL)
+    {
+        fclose(raw_file);
+    }
+    if(ripe_file!=NULL)
+    {
+        fclose(ripe_file);
+    }
+    if(GIE_digest_cache!=NULL)
+    {
+        free(GIE_digest_cache->GIE_cache);
+        free(GIE_digest_cache);
+    }
+    free(query_para);
+    free(query_result);
+    free(buffer);
+    free(sfh_str);
+    free(td);
+    free(td_str);
+    return status;
+}
\ No newline at end of file
diff --git a/src/get_td_mistake_lost/get_mistake_level.c b/src/get_td_mistake_lost/get_mistake_level.c
new file mode 100644
index 0000000..5f03974
--- /dev/null
+++ b/src/get_td_mistake_lost/get_mistake_level.c
@@ -0,0 +1,366 @@
+/*
+gcc -g get_mistake_level.c -o get_mistake_level -lMESA_htable -lmaatframe -I../../include
+./get_mistake_level ../data/ripe_data/td_data_20171207/all_av_digest_mistake_level
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "gram_index_engine.h"
+#include <MESA/MESA_htable.h>
+#include <assert.h>
+#include <ctype.h>
+#define THREAD_SAFE 0
+#define SLOT_SIZE (1024*1024*16)
+#define SIMILIAR_RATE 90
+#define TD_STR_LEN (10*1024)
+#define TIME_STR_LEN 128
+#define RAODONG_RATE 0.1
+#define BUFFER_LEN (15*1024)
+#define SFH_LEN (10*1024)
+#define TD_LEN 33
+
+typedef struct sfh_link /* one input record; records sharing a TD value are chained through next */
+{
+ // char *time_str;
+ char *sfh_str; /* heap copy (strdup) of the record's SFH string */
+ char *td_ori; /* heap copy (strdup) of the record's first field (td_str) */
+ // char *md5_32k;
+ int similiar; /* similarity to the TD's representative SFH, filled in by print_mistake_td() */
+ int all_similiar; /* running sum of GIE_sfh_similiarity() scores against the other records of the same TD */
+ // long hash_len;
+ struct sfh_link *next; /* next record with the same TD, NULL at the tail */
+}sfh_link;
+
+typedef struct mistake_sfh /* hash-table value stored per TD key */
+{
+ int mistake_num; /* records whose similarity to sfh_str is below SIMILIAR_RATE (counted in main's second pass) */
+ int all_num; /* number of records seen for this TD */
+ int all_similiar; /* accumulated similarity score of the current representative SFH */
+ char *sfh_str; /* representative SFH chosen for this TD */
+ // long hash_len;
+ sfh_link *sfh_link_items; /* head of the list of all records for this TD */
+}mistake_sfh;
+
+typedef struct temp_parameter /* context passed to print_mistake_td() through MESA_htable_iterate() */
+{
+ int mistake_num; /* sum of mistake_num over all TDs that were reported */
+ FILE *ripe_file; /* report output stream */
+}temp_parameter;
+
+/*
+ * Sums the widths (right - left + 1) of every "[left:right" range found in
+ * sfh and returns half of that sum. The text before the first '[' is skipped.
+ * Returns 0 when a range cannot be parsed or when memory cannot be allocated.
+ */
+long get_hashed_len(const char* sfh)
+{
+    size_t sfh_len=strlen(sfh);
+    char *data=(char*)malloc(sfh_len+1);
+    char *token=NULL,*sub_token=NULL,*saveptr=NULL;
+    long left_offset=0,right_offset=0,hashed_length=0;
+    int first=0;
+    if(data==NULL)
+    {
+        return 0;
+    }
+    /* strtok_r modifies its input, so a private copy is tokenised. */
+    memcpy(data,sfh,sfh_len+1);
+    for (token = data; ; token= NULL)
+    {
+        sub_token= strtok_r(token,"[", &saveptr);
+        if (sub_token == NULL)
+        {
+            break;
+        }
+        if(first==0)//jump over the first sub string.
+        {
+            first=1;
+            continue;
+        }
+        if(sscanf(sub_token,"%ld:%ld",&left_offset,&right_offset)!=2)
+        {
+            /* Leave through the common exit so the copy is always freed. */
+            hashed_length=0;
+            break;
+        }
+        hashed_length+=right_offset-left_offset+1;
+    }
+    free(data);
+    return hashed_length/2;
+}
+
+/*
+ * Callback handed to MESA_htable_iterate(): reports one TD entry when its
+ * share of mistaken records exceeds RAODONG_RATE. The report is a header
+ * line "mistake_num;sfh", one "similiar;sfh;td_ori" line per record, and a
+ * blank line. The entry's mistake count is added to the running total in arg.
+ */
+void print_mistake_td(const uchar *key,uint size,void *data,void *arg)
+{
+    mistake_sfh *entry=(mistake_sfh*)data;
+    temp_parameter *totals=(temp_parameter*)arg;
+    sfh_link *item=NULL;
+    float mistake_rate=(float)entry->mistake_num/(float)entry->all_num;
+
+    if(!(mistake_rate>RAODONG_RATE))
+    {
+        return;
+    }
+    totals->mistake_num+=entry->mistake_num;
+    fprintf(totals->ripe_file,"%d;%s\n",entry->mistake_num,entry->sfh_str);
+    for(item=entry->sfh_link_items;item!=NULL;item=item->next)
+    {
+        /* Similarity of each record to the representative SFH of its TD. */
+        item->similiar=GIE_sfh_similiarity(entry->sfh_str,(int)strlen(entry->sfh_str),item->sfh_str,(int)strlen(item->sfh_str));
+        fprintf(totals->ripe_file,"%d;%s;%s\n",item->similiar,item->sfh_str,item->td_ori);
+    }
+    fprintf(totals->ripe_file,"\n");
+}
+
+/* Allocate a list node holding private copies of one record's SFH and TD source string. */
+static sfh_link *new_sfh_link(const char *sfh_str,const char *td_str,int all_similiar)
+{
+    sfh_link *item=(sfh_link*)calloc(1,sizeof(sfh_link));
+    if(item==NULL)
+    {
+        return NULL;
+    }
+    item->sfh_str=strdup(sfh_str);
+    item->td_ori=strdup(td_str);
+    item->all_similiar=all_similiar;
+    /* similiar and next keep the 0/NULL values calloc gave them. */
+    if(item->sfh_str==NULL||item->td_ori==NULL)
+    {
+        free(item->sfh_str);
+        free(item->td_ori);
+        free(item);
+        return NULL;
+    }
+    return item;
+}
+
+/*
+ * Pass 1 groups the "td_str;td;sfh" records of new_TD.txt by td and picks a
+ * representative SFH per td. Pass 2 re-reads the file and counts, per td,
+ * the records whose similarity to the representative is below SIMILIAR_RATE.
+ * The per-td report is written by print_mistake_td(); "mistakes,records" is
+ * printed at the end. The command-line arguments are not used.
+ * Returns 0 on success and -1 when a file cannot be opened or memory runs out.
+ */
+int main(int argc,char *argv[])
+{
+    const char *raw_file_dir="../../data/ripe_data/td_data_20171207/new_TD.txt";
+    const char *ripe_file_dir="../../data/ripe_data/td_data_20171207/all_av_digest_mistake_level_3";
+    FILE *raw_file=NULL;
+    FILE *ripe_file=NULL;
+    char *sfh_str=NULL,*td=NULL,*buffer=NULL,*td_str=NULL;
+    char line_fmt[64];
+    int i=0,thread_safe=THREAD_SAFE,ret=0,temp_similiar=0,temp_all_similiar=0;
+    int status=-1;
+    unsigned int slot_size=SLOT_SIZE;
+    mistake_sfh *temp_mistake_sfh=NULL;
+    sfh_link *temp_sfh_link=NULL;
+    MESA_htable_handle htable=NULL;
+    temp_parameter parameter;
+
+    (void)argc;
+    (void)argv;
+    /* The input is opened first so that a missing input does not truncate an existing output. */
+    raw_file=fopen(raw_file_dir,"r");
+    if(raw_file==NULL)
+    {
+        printf("open all_av_digest error\n");
+        return -1;
+    }
+    ripe_file=fopen(ripe_file_dir,"w+");
+    if(ripe_file==NULL)
+    {
+        printf("open all_av_digest_mistake_level error");
+        fclose(raw_file);
+        return -1;
+    }
+    parameter.mistake_num=0;
+    parameter.ripe_file=ripe_file;
+    buffer=(char*)calloc(BUFFER_LEN,sizeof(char));
+    sfh_str=(char*)calloc(SFH_LEN,sizeof(char));
+    td=(char*)calloc(TD_LEN,sizeof(char));
+    td_str=(char*)calloc(TD_STR_LEN,sizeof(char));
+    if(buffer==NULL||sfh_str==NULL||td==NULL||td_str==NULL)
+    {
+        fprintf(stderr,"out of memory\n");
+        goto cleanup;
+    }
+    /* Field widths keep sscanf from writing past the destination buffers. */
+    snprintf(line_fmt,sizeof(line_fmt),"%%%d[^;];%%%d[^;];%%%d[^;]",TD_STR_LEN-1,TD_LEN-1,SFH_LEN-1);
+
+    htable=MESA_htable_born();
+    MESA_htable_set_opt(htable,MHO_SCREEN_PRINT_CTRL,&thread_safe,sizeof(unsigned int));
+    MESA_htable_set_opt(htable,MHO_HASH_SLOT_SIZE,&slot_size,sizeof(unsigned int));
+    MESA_htable_mature(htable);
+
+    /* Pass 1: group records by TD and choose the representative SFH. */
+    /* Looping on fgets() rather than feof() avoids handling the last record twice. */
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        i++;
+        if(i%100000==0)
+        {
+            printf("%d\n",i);
+        }
+        /* The whole TD_LEN buffer is the hash key, so stale bytes must be cleared. */
+        memset(td,0,TD_LEN);
+        ret=sscanf(buffer,line_fmt,td_str,td,sfh_str);
+        if(ret!=3)
+        {
+            fprintf(stderr,"skip malformed line %d\n",i);
+            continue;
+        }
+        temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN);
+        if(temp_mistake_sfh==NULL)
+        {
+            /* First record for this TD: it becomes the representative SFH. */
+            temp_mistake_sfh=(mistake_sfh*)calloc(1,sizeof(mistake_sfh));
+            if(temp_mistake_sfh==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            temp_mistake_sfh->mistake_num=0;
+            temp_mistake_sfh->all_num=1;
+            temp_mistake_sfh->all_similiar=0;
+            temp_mistake_sfh->sfh_str=strdup(sfh_str);
+            temp_mistake_sfh->sfh_link_items=new_sfh_link(sfh_str,td_str,0);
+            if(temp_mistake_sfh->sfh_str==NULL||temp_mistake_sfh->sfh_link_items==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            ret=MESA_htable_add(htable,td,TD_LEN,(void *)temp_mistake_sfh);
+            assert(ret>0);
+        }
+        else
+        {
+            temp_similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),sfh_str,(int)strlen(sfh_str));
+            temp_mistake_sfh->all_similiar+=temp_similiar;
+            temp_all_similiar=0;
+            temp_sfh_link=temp_mistake_sfh->sfh_link_items;
+            /* Score the new record against every stored one; the loop ends on the tail node. */
+            for(;;)
+            {
+                temp_similiar=GIE_sfh_similiarity(temp_sfh_link->sfh_str,(int)strlen(temp_sfh_link->sfh_str),sfh_str,(int)strlen(sfh_str));
+                temp_sfh_link->all_similiar+=temp_similiar;
+                temp_all_similiar+=temp_similiar;
+                if(temp_sfh_link->all_similiar>temp_mistake_sfh->all_similiar)
+                {
+                    /* This stored record is now the best representative for the TD. */
+                    free(temp_mistake_sfh->sfh_str);
+                    temp_mistake_sfh->sfh_str=strdup(temp_sfh_link->sfh_str);
+                    temp_mistake_sfh->all_similiar=temp_sfh_link->all_similiar;
+                }
+                if(temp_sfh_link->next==NULL)
+                {
+                    break;
+                }
+                temp_sfh_link=temp_sfh_link->next;
+            }
+            temp_sfh_link->next=new_sfh_link(sfh_str,td_str,temp_all_similiar);
+            if(temp_sfh_link->next==NULL)
+            {
+                fprintf(stderr,"out of memory\n");
+                goto cleanup;
+            }
+            temp_mistake_sfh->all_num+=1;
+        }
+    }
+
+    /* Pass 2: count the records that differ too much from their TD's representative. */
+    rewind(raw_file);
+    i=0;
+    while(fgets(buffer,BUFFER_LEN,raw_file)!=NULL)
+    {
+        i++;
+        if(i%10000==0)
+        {
+            printf("%d\n",i);
+        }
+        memset(td,0,TD_LEN);
+        ret=sscanf(buffer,line_fmt,td_str,td,sfh_str);
+        if(ret!=3)
+        {
+            /* The same line was already reported and skipped in pass 1. */
+            continue;
+        }
+        temp_mistake_sfh=MESA_htable_search(htable,td,TD_LEN);
+        if(temp_mistake_sfh==NULL)
+        {
+            fprintf(stderr,"line %d: TD missing from table\n",i);
+            continue;
+        }
+        temp_similiar=GIE_sfh_similiarity(temp_mistake_sfh->sfh_str,(int)strlen(temp_mistake_sfh->sfh_str),sfh_str,(int)strlen(sfh_str));
+        if(temp_similiar<SIMILIAR_RATE)
+        {
+            temp_mistake_sfh->mistake_num+=1;
+        }
+    }
+    MESA_htable_iterate(htable,print_mistake_td,(void*)&parameter);
+    printf("%d,%d\n",parameter.mistake_num,i);
+    status=0;
+
+cleanup:
+    fclose(raw_file);
+    free(buffer);
+    free(sfh_str);
+    free(td);
+    free(td_str);
+    if(htable!=NULL)
+    {
+        MESA_htable_destroy(htable,NULL);
+    }
+    fclose(ripe_file);
+    return status;
+}
\ No newline at end of file
diff --git a/src/get_td_mistake_lost/get_td_mistake_lost.sh b/src/get_td_mistake_lost/get_td_mistake_lost.sh
new file mode 100644
index 0000000..7c851b8
--- /dev/null
+++ b/src/get_td_mistake_lost/get_td_mistake_lost.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+python new_TD.py
+./get_mistake_level
+./get_TD_SFH
+./get_lost_rate
diff --git a/src/get_td_mistake_lost/gram_index_engine.c b/src/get_td_mistake_lost/gram_index_engine.c
new file mode 100644
index 0000000..0f503db
--- /dev/null
+++ b/src/get_td_mistake_lost/gram_index_engine.c
@@ -0,0 +1,1354 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<math.h>
+#include<assert.h>
+#include<MESA/MESA_htable.h>
+#include<unistd.h>
+
+#include "gram_index_engine.h"
+#include "queue.h"
+
+ /* Slot count requested for every hash table created in this file.
+  * Parenthesised so the macro expands safely inside larger expressions. */
+ #define HTABLE_SIZE (1024 * 1024)
+ /* Initial capacity of a linklist_node position array (GIE_insert_indextable). */
+ #define GRAM_CNT_MAX 2
+ #define GRAM_MAX 128
+ /* Chunks subtracted from the starting chunk index in GIE_gram_with_position. */
+ #define TOLERENCE_SIZE 0
+ #define UNION_INIT_SIZE 1000
+ /* Smallest block size; used as the base multiplier in calc_fh_blocksize. */
+ #define BLOCKSIZE_MIN 3
+ #define MEM_OCCUPY 1
+ /* When an index entry holds at least this many nodes, GIE_insert_indextable
+  * prunes it through indextable_delete_with_threshold. */
+ #define CNT_MAX 10
+ /* Pruning only removes nodes whose digest still has more grams than this. */
+ #define GRAM_CNT_THRESHOLD 10
+ #define QUERY_LEN_ACCURACY 0.1
+ /* Number of index hash tables; a key's table is (sum of key bytes) % HTABLE_NUM. */
+ #define HTABLE_NUM 8
+ //#define GIE_INPUT_FORMAT_SFH 1
+ //#define GIE_INPUT_FORMAT_PLAIN 0
+ #define MAX_LENGTH 10000
+ /* Upper bound handed to strnlen() when measuring an index key. */
+ #define KEY_MAX_LENGTH 10
+ /* Per-operation costs used by edit_distn(). */
+ #define EDIT_DISTN_INSERT_COST 1
+ #define EDIT_DISTN_REMOVE_COST 1
+ #define EDIT_DISTN_REPLACE_COST 2
+ /* NOTE: evaluates each argument twice; do not pass expressions with side effects. */
+ #define MIN(x,y) ((x)<(y)?(x):(y))
+
+ /*
+  * Wrap-around ordering test for two unsigned offsets.
+  * Returns 1 when the unsigned difference off1 - off2, reinterpreted as a
+  * signed int, is negative (off1 is considered to precede off2), else 0.
+  */
+ int before(unsigned int off1, unsigned int off2)
+ {
+     const signed int wrapped_delta = (signed int)(off1 - off2);
+
+     return wrapped_delta < 0;
+ }
+ /* after(a, b) is before() with its two operands swapped. */
+ #define after(off2,off1) before(off1,off2)
+
+ /* Private state behind the opaque GIE_handle_t returned by GIE_create(). */
+ typedef struct
+ {
+ /* Gram (key) length in bytes, copied from para->gram_value. */
+ unsigned int user_gram_value;
+ /* Position tolerance applied in GIE_part_query, copied from para->position_accuracy. */
+ unsigned int user_position_accuracy;
+ /* Copied from para->ED_reexamine; not read anywhere in this part of the file. */
+ short ED_reexamine;
+ /* Copied from para->format; compared against GIE_INPUT_FORMAT_SFH / _PLAIN. */
+ short input_format;
+ /* Table of struct id_table_data, keyed by the digest id. */
+ MESA_htable_handle id_table;
+ /* Tables of struct index_table_data, keyed by gram; the table for a gram is
+  * selected by (sum of its bytes) % HTABLE_NUM. */
+ MESA_htable_handle index_table[HTABLE_NUM];
+ /* Running byte count, increased in GIE_create and GIE_update. */
+ unsigned long long mem_occupy;
+ /* Set to 0 in GIE_create; every other use is commented out. */
+ unsigned long long hash_cnt;
+ }GIE_handle_inner_t;
+
+
+ /* One digest's occurrences of a gram; element of an index entry's list. */
+ struct linklist_node
+ {
+ /* Heap array of recorded positions; `index` entries are in use. */
+ short * position;
+ /* Digest record in the id table that this node refers to (not owned). */
+ struct id_table_data * basicinfo;
+ /* Allocated capacity of `position`, in elements. */
+ short size;
+ /* Number of used elements in `position`. */
+ short index;
+ /* Block size of the digest part the gram was taken from. */
+ unsigned long long blocksize;
+ TAILQ_ENTRY(linklist_node) listentry;
+ };
+
+
+ /* Value stored in an index table for one gram. */
+ struct index_table_data
+ {
+ /* Heap-allocated list head of linklist_node entries; freed by indextable_free. */
+ struct TQ * listhead;
+ /* Node counter: incremented when a node is linked, decremented when unlinked. */
+ int cnt;
+ };
+
+
+ /* Value stored in the id table for one digest. */
+ struct id_table_data
+ {
+ /* Digest id; also used as the id-table key. */
+ unsigned int id;
+ /* Length of `sfh` as supplied by the caller (digest->sfh_length). */
+ short sfh_length;
+ /* Gram count reported by the key-grabbing helpers; decremented when
+  * indextable_delete_with_threshold prunes one of this digest's nodes. */
+ short gram_cnt;
+ /* Block size parsed from the head of `sfh` (0 for plain input). */
+ unsigned long long blocksize;
+ /* Heap copy of the digest string; released in idtable_free. */
+ char * sfh;
+ /* Caller-supplied pointer; idtable_free clears it without freeing it. */
+ void * tag;
+ /* Copied from digest->cfds_lvl. */
+ char cfds_lvl;
+ };
+
+
+ /* Context passed through MESA_htable_iterate while copying a table. */
+ struct htable_handle
+ {
+ /* Into copy_htable: the table to copy from. Into the iterate callbacks:
+  * the new table that receives the copies. */
+ MESA_htable_handle runtime_table;
+ /* Optional lookup table; the index copy uses it to resolve digest ids to
+  * records of the copied id table. NULL when copying the id table itself. */
+ MESA_htable_handle para;
+ };
+
+ /* One pending gram produced by grab_key_set, queued for GIE_update. */
+ struct key_list_node
+ {
+ /* Heap copy of the gram, NUL-terminated; freed in free_key_set. */
+ char * key;
+ /* Index of the originating digest in the digests[] array given to GIE_update. */
+ int digest_id;
+ /* Offset of the gram inside the scanned string. */
+ int pos;
+ /* Block size of the digest part being scanned. */
+ unsigned long long blocksize;
+ TAILQ_ENTRY(key_list_node) keylistentry;
+ };
+
+
+ /* File-wide statistics updated by indextable_free_cnt: number of index
+  * entries released and the sum of their node counters. */
+ unsigned long long hash_cnt;
+ unsigned long long cnt_sum;
+
+ /* List head types: TQ holds linklist_node, KL holds key_list_node. */
+ TAILQ_HEAD(TQ, linklist_node);
+ TAILQ_HEAD(KL, key_list_node);
+
+ /* Forward declarations for helpers defined later in this file. */
+ /* Release callbacks for id-table and index-table values. */
+ void idtable_free(void * data);
+ void indextable_free(void * data);
+ int key_compare(const uchar * key1, uint size1, const uchar * key2, uint size2);
+ int GIE_insert_indextable(MESA_htable_handle handle, struct id_table_data * info, char * key, unsigned int index,unsigned long long blocksize);
+
+ int GIE_delete_from_indextable_by_key(MESA_htable_handle handle, char * key, unsigned int id);
+ int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t * digest);
+ int GIE_cmp(const void * a, const void * b);
+ /* NOTE(review): these are declared `inline` without `static` or `extern`;
+  * whether an external definition is emitted depends on the C dialect used
+  * to build the file - confirm against the build flags. */
+ inline unsigned int get_real_length(const char * string, unsigned int length);
+ void print_item_iterate(const uchar * key, unsigned int size, void * data, void * user);
+ inline unsigned long long calc_fh_blocksize(unsigned long long orilen);
+ inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, unsigned int str_len);
+
+ /* Table duplication helpers used by GIE_update. */
+ MESA_htable_handle copy_htable(void * htable_para,void (* func)(const uchar * key, uint size, void * data, void *user),void (*free_fuc)(void * data));
+ void copy_idtable_item_iterate(const uchar * key, uint size, void * data, void * user);
+ void copy_indextable_item_iterate(const uchar * key, uint size, void * data, void * user);
+
+ /*
+  * Allocate an engine handle and create its hash tables.
+  *
+  * para: creation parameters; gram_value, position_accuracy, format and
+  *       ED_reexamine are copied into the handle.
+  * Returns the new handle, or NULL when para is NULL or the handle cannot be
+  * allocated (the original dereferenced both unconditionally).
+  *
+  * NOTE(review): the results of MESA_htable_create are stored unchecked -
+  * confirm how that API reports failure before relying on the tables.
+  */
+ GIE_handle_t * GIE_create(const GIE_create_para_t * para)
+ {
+     int i = 0;
+     GIE_handle_inner_t * handle = NULL;
+     MESA_htable_create_args_t idtable_args,indextable_args[HTABLE_NUM];
+
+     if(para == NULL)
+     {
+         return NULL;
+     }
+     handle = (GIE_handle_inner_t *)calloc(1, sizeof(GIE_handle_inner_t));
+     if(handle == NULL)
+     {
+         return NULL;
+     }
+     handle->mem_occupy = 0;
+     handle->mem_occupy += sizeof(GIE_handle_inner_t);
+
+     handle->user_gram_value = para->gram_value;
+     handle->user_position_accuracy = para->position_accuracy;
+     handle->input_format = para->format;
+     handle->ED_reexamine = para->ED_reexamine;
+     handle->hash_cnt = 0;
+
+     /* Id table: library-default key comparison and hashing. */
+     memset(&idtable_args, 0, sizeof(idtable_args));
+     idtable_args.thread_safe = 0;
+     idtable_args.hash_slot_size = HTABLE_SIZE;
+     idtable_args.max_elem_num = 0;
+     idtable_args.expire_time = 0;
+     idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
+     idtable_args.key_comp = NULL;
+     idtable_args.key2index = NULL;
+     idtable_args.data_free = idtable_free;
+     idtable_args.data_expire_with_condition = NULL;
+     idtable_args.recursive = 0;
+     handle->id_table = MESA_htable_create(&idtable_args, sizeof(idtable_args));
+
+     /* Index tables: one per bucket, using key_compare for key equality. */
+     for(i = 0;i < HTABLE_NUM;i++)
+     {
+         memset(&indextable_args[i], 0, sizeof(indextable_args[i]));
+         indextable_args[i].thread_safe = 0;
+         indextable_args[i].hash_slot_size = HTABLE_SIZE;
+         indextable_args[i].max_elem_num = 0;
+         indextable_args[i].expire_time = 0;
+         indextable_args[i].eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
+         indextable_args[i].key_comp = key_compare;
+         indextable_args[i].key2index = NULL;
+         indextable_args[i].data_free = indextable_free;
+         indextable_args[i].data_expire_with_condition = NULL;
+         indextable_args[i].recursive = 0;
+         handle->index_table[i] = MESA_htable_create(&indextable_args[i], sizeof(indextable_args[i]));
+     }
+
+     return (GIE_handle_t *)(handle);
+ }
+
+ /*
+  * Key comparison callback for the index tables.
+  *
+  * Compares exactly the bytes that belong to each key. The previous version
+  * read sizeof(long) bytes from both keys whatever their size (over-reading
+  * short grams, ignoring the tail of long ones, and assuming alignment) and
+  * truncated the long difference to int.
+  *
+  * Returns 0 when both keys have the same size and content, otherwise a
+  * negative or positive value.
+  * NOTE(review): assumes the table treats 0 as "equal" - confirm against the
+  * MESA_htable key_comp contract.
+  */
+ int key_compare(const uchar * key1, uint size1, const uchar * key2, uint size2)
+ {
+     if(size1 != size2)
+     {
+         return (size1 < size2) ? -1 : 1;
+     }
+     if(size1 == 0)
+     {
+         return 0;
+     }
+     return memcmp(key1, key2, size1);
+ }
+
+
+ /*
+  * Release callback for id-table values (struct id_table_data).
+  * Frees the owned digest string and the record itself. The tag pointer is
+  * only cleared, never freed.
+  */
+ void idtable_free(void * data)
+ {
+     struct id_table_data * entry = (struct id_table_data *)data;
+
+     entry->tag = NULL;
+     free(entry->sfh);
+     entry->sfh = NULL;
+     free(entry);
+ }
+
+ /*
+  * Prune one index entry: unlink and free every node whose digest still has
+  * more than GRAM_CNT_THRESHOLD grams, decrementing that digest's gram_cnt
+  * and the entry's node counter for each removed node. When the list becomes
+  * empty the entry itself is deleted from the table under `key`.
+  *
+  * htable_handle: table that holds the entry.
+  * tmp:           the entry stored under `key`.
+  * key:           NUL-terminated gram; at most KEY_MAX_LENGTH bytes are used.
+  *
+  * NOTE(review): the parameter is declared as a pointer to a handle, but the
+  * callers in this file pass a MESA_htable_handle and it is forwarded as-is
+  * to MESA_htable_del - confirm the intended type.
+  */
+ void indextable_delete_with_threshold(MESA_htable_handle * htable_handle, struct index_table_data * tmp, char * key)
+ {
+ int key_length = strnlen(key,KEY_MAX_LENGTH);
+ struct linklist_node * tmp_node = TAILQ_FIRST(tmp->listhead);
+ while(tmp_node != NULL)
+ {
+ /* Fetch the successor first: tmp_node may be unlinked and freed below. */
+ struct linklist_node * linklist_tmp = TAILQ_NEXT(tmp_node,listentry);
+ if(tmp_node->basicinfo->gram_cnt <= GRAM_CNT_THRESHOLD)
+ {
+ /* Digest has few grams left: keep this node. */
+ tmp_node = linklist_tmp;
+ continue;
+ }
+ TAILQ_REMOVE(tmp->listhead, tmp_node, listentry);
+ tmp_node->basicinfo->gram_cnt--;
+ tmp->cnt--;
+ if(TAILQ_EMPTY(tmp->listhead) == 1)
+ {
+ //_handle->hash_cnt--;
+ //_handle->mem_occupy -= (sizeof(struct index_table_data) + sizeof(struct TQ));
+ /* The list is empty only after the last node was unlinked, so
+  * linklist_tmp is NULL here and `tmp` is not touched again.
+  * NOTE(review): presumably MESA_htable_del releases `tmp` through
+  * indextable_free - confirm. */
+ if(MESA_htable_del(htable_handle, (const uchar *)(key), key_length, indextable_free) < 0)
+ {
+ printf("indextable backtrack delete error!\n");
+ assert(0);
+ /* NOTE(review): when asserts are compiled out, this return skips
+  * the frees below for the already unlinked tmp_node. */
+ return;
+ }
+ }
+ //_handle->mem_occupy -= (sizeof(struct linklist_node) + sizeof(short)*(tmp_node->size));
+ free(tmp_node->position);
+ tmp_node->position = NULL;
+ free(tmp_node);
+ tmp_node = NULL;
+ tmp_node = linklist_tmp;
+
+ }
+ return;
+ }
+
+
+ /*
+  * Release callback for index-table values (struct index_table_data).
+  * Unlinks and frees every node (including its position array), then frees
+  * the list head and the entry itself.
+  */
+ void indextable_free(void * data)
+ {
+     struct index_table_data * entry = (struct index_table_data *)data;
+     struct linklist_node * node = NULL;
+
+     /* Pop from the front until the list is empty. */
+     while((node = TAILQ_FIRST(entry->listhead)) != NULL)
+     {
+         TAILQ_REMOVE(entry->listhead, node, listentry);
+         entry->cnt--;
+         free(node->position);
+         free(node);
+     }
+     free(entry->listhead);
+     entry->listhead = NULL;
+     free(entry);
+ }
+
+
+ /*
+  * Same release work as indextable_free, but first records statistics in the
+  * file-wide counters: hash_cnt counts released entries and cnt_sum
+  * accumulates their node counters.
+  *
+  * The release itself is delegated to indextable_free so the two callbacks
+  * cannot drift apart (the body used to be a verbatim copy).
+  */
+ void indextable_free_cnt(void * data)
+ {
+     struct index_table_data * tmp = (struct index_table_data *)data;
+
+     hash_cnt++;
+     cnt_sum += tmp->cnt;
+     indextable_free(data);
+ }
+
+ /* Iterate callback for the id table: prints the id of one stored digest.
+  * key, size and user are not used. */
+ void print_item_iterate_idtable(const uchar * key, uint size, void * data, void * user)
+ {
+     const struct id_table_data * entry = (const struct id_table_data *)data;
+
+     printf("id:%u\n", entry->id);
+ }
+
+
+
+ /*
+  * Iterate callback for an index table: prints the key, the entry's node
+  * counter, and for every node the digest id and its recorded positions.
+  *
+  * The key is printed with an explicit length because it arrives as a
+  * (pointer, size) pair; printing it with a bare "%s" could read past the
+  * key when it is not NUL-terminated. `user` is not used.
+  */
+ void print_item_iterate(const uchar * key, uint size, void * data, void * user)
+ {
+     struct index_table_data * index_data = (struct index_table_data *)data;
+     struct linklist_node * tmp_node = NULL;
+     int i = 0;
+
+     printf("%.*s %d\n", (int)size, (const char *)key, index_data->cnt);
+     TAILQ_FOREACH(tmp_node, index_data->listhead, listentry)
+     {
+         printf("id = %u\n",tmp_node->basicinfo->id);
+         printf("position is :\n");
+         for(i = 0;i < tmp_node->index;i++)
+         {
+             printf("%d ",tmp_node->position[i]);
+         }
+         printf("\n");
+     }
+     printf("\n");
+ }
+
+ /*
+  * Weighted edit distance between s1[0..s1len) and s2[0..s2len), using the
+  * EDIT_DISTN_INSERT_COST / _REMOVE_COST / _REPLACE_COST weights and two
+  * rolling rows of s2len + 1 cells.
+  *
+  * Negative lengths are treated as 0; previously they were converted to huge
+  * unsigned loop bounds. If the rows cannot be allocated the function
+  * returns the worst-case distance (s1len inserts plus s2len removals), so
+  * callers that turn the distance into a similarity see "nothing in common"
+  * instead of crashing on a NULL row.
+  */
+ int edit_distn(const char *s1, int s1len, const char *s2, int s2len)
+ {
+     int *prev_row = NULL;
+     int *cur_row = NULL;
+     int *swap_row = NULL;
+     int i1 = 0, i2 = 0;
+     int ret = 0;
+
+     if(s1len < 0)
+     {
+         s1len = 0;
+     }
+     if(s2len < 0)
+     {
+         s2len = 0;
+     }
+     prev_row = (int *)malloc(((size_t)s2len + 1) * sizeof(int));
+     cur_row = (int *)malloc(((size_t)s2len + 1) * sizeof(int));
+     if(prev_row == NULL || cur_row == NULL)
+     {
+         free(prev_row);
+         free(cur_row);
+         return s1len * EDIT_DISTN_INSERT_COST + s2len * EDIT_DISTN_REMOVE_COST;
+     }
+
+     /* Row 0: distance from the empty prefix of s1 to each prefix of s2. */
+     for(i2 = 0; i2 <= s2len; i2++)
+     {
+         prev_row[i2] = i2 * EDIT_DISTN_REMOVE_COST;
+     }
+     for(i1 = 0; i1 < s1len; i1++)
+     {
+         cur_row[0] = (i1 + 1) * EDIT_DISTN_INSERT_COST;
+         for(i2 = 0; i2 < s2len; i2++)
+         {
+             int cost_a = prev_row[i2+1] + EDIT_DISTN_INSERT_COST;
+             int cost_d = cur_row[i2] + EDIT_DISTN_REMOVE_COST;
+             int cost_r = prev_row[i2] + (s1[i1] == s2[i2] ? 0 : EDIT_DISTN_REPLACE_COST);
+             cur_row[i2+1] = MIN(MIN(cost_a, cost_d), cost_r);
+         }
+         /* The row just filled becomes the previous row of the next pass. */
+         swap_row = prev_row;
+         prev_row = cur_row;
+         cur_row = swap_row;
+     }
+     ret = prev_row[s2len];
+     free(prev_row);
+     free(cur_row);
+     return ret;
+ }
+
+
+ /*
+  * Destroy an engine handle: every index table (values released through
+  * indextable_free_cnt, which also updates the file-wide counters), then the
+  * id table (values released through idtable_free), then the handle itself.
+  * A NULL handle is ignored; previously it was dereferenced.
+  */
+ void GIE_destory(GIE_handle_t * handle)
+ {
+     GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle);
+     int i = 0;
+
+     if(_handle == NULL)
+     {
+         return;
+     }
+     /* Index tables go first: their nodes point into id-table records. */
+     for(i = 0;i < HTABLE_NUM;i++)
+     {
+         MESA_htable_destroy(_handle->index_table[i], indextable_free_cnt);
+     }
+     MESA_htable_destroy(_handle->id_table, idtable_free);
+     free(_handle);
+     _handle = NULL;
+ }
+
+
+ /*
+  * Slice a string into overlapping grams and queue one key_list_node per gram.
+  *
+  * str_begin / str_length: text to slice; scanning also stops at the first
+  *                         NUL inside that range.
+  * i:               index of the originating digest, stored in each node.
+  * gram_value:      gram length in bytes.
+  * gram_cnt:        out: number of grams queued (written only on success).
+  * to_process_list: HTABLE_NUM list heads; a gram goes to the list selected
+  *                  by (sum of its bytes) % HTABLE_NUM.
+  * blocksize:       stored in each node.
+  *
+  * Returns 1 when grams were queued, 0 when the text is shorter than one
+  * gram, the arguments are unusable, or memory runs out (then nothing is
+  * queued).
+  *
+  * Fixes over the previous version:
+  *  - the length is re-checked after strnlen(); a NUL inside the range used
+  *    to make the unsigned gram count wrap and the loop read far out of bounds;
+  *  - bytes are summed as unsigned char, so bytes >= 0x80 can no longer give
+  *    a negative list index (the bucket of every other key is unchanged,
+  *    because 256 is a multiple of HTABLE_NUM);
+  *  - allocations are checked, and nodes are built before any is queued.
+  */
+ int grab_key_set(char * str_begin,short str_length,int i,unsigned int gram_value,short * gram_cnt,struct KL** to_process_list, unsigned long long blocksize)
+ {
+     size_t usable_len = 0;
+     size_t key_total = 0;
+     size_t k = 0;
+     unsigned int j = 0;
+     unsigned int byte_sum = 0;
+     struct key_list_node ** pending = NULL;
+     struct key_list_node * node = NULL;
+
+     if(str_begin == NULL || str_length <= 0 || gram_value == 0)
+     {
+         return 0;
+     }
+     usable_len = strnlen(str_begin, (size_t)str_length);
+     if(usable_len < gram_value)
+     {
+         return 0;
+     }
+     key_total = usable_len - gram_value + 1;
+
+     /* Phase 1: build every node so a failure leaves the queues untouched. */
+     pending = (struct key_list_node **)calloc(key_total, sizeof(struct key_list_node *));
+     if(pending == NULL)
+     {
+         return 0;
+     }
+     for(k = 0; k < key_total; k++)
+     {
+         node = (struct key_list_node *)calloc(1,sizeof(struct key_list_node));
+         if(node != NULL)
+         {
+             node->key = (char *)calloc(gram_value+1,sizeof(char));
+         }
+         if(node == NULL || node->key == NULL)
+         {
+             free(node);
+             while(k > 0)
+             {
+                 k--;
+                 free(pending[k]->key);
+                 free(pending[k]);
+             }
+             free(pending);
+             return 0;
+         }
+         memcpy(node->key, str_begin + k, gram_value);
+         node->digest_id = i;
+         node->pos = (int)k;
+         node->blocksize = blocksize;
+         pending[k] = node;
+     }
+
+     /* Phase 2: hand each node to the list of its bucket, in scan order. */
+     for(k = 0; k < key_total; k++)
+     {
+         byte_sum = 0;
+         for(j = 0; j < gram_value; j++)
+         {
+             byte_sum += (unsigned char)pending[k]->key[j];
+         }
+         TAILQ_INSERT_TAIL(to_process_list[byte_sum % HTABLE_NUM], pending[k], keylistentry);
+     }
+     free(pending);
+     *gram_cnt = (short)key_total;
+     return 1;
+ }
+ /*
+  * Queue the grams of an SFH digest. Up to two parts are scanned; for each
+  * part the block size is read from the text before ':', the hash text runs
+  * from after ':' up to the first '[' (or NUL), and the next part begins
+  * after the following '#'.
+  *
+  * Returns 0 when the first part is shorter than one gram, otherwise 1. The
+  * result of grab_key_set is not inspected, and *gram_cnt ends up holding
+  * the count of the last part that was scanned.
+  */
+ int sfh_grab_key_set(char *sfh,short sfh_length,int i,unsigned int gram_value,short * gram_cnt,struct KL** to_process_list)
+ {
+     char * part = sfh;
+     char * delim = NULL;
+     unsigned long long part_blocksize = 0;
+     unsigned int hash_len = 0;
+     int part_no = 0;
+
+     for(part_no = 0; part_no < 2; part_no++)
+     {
+         part_blocksize = get_blocksize_from_head(part, sfh_length);
+
+         /* Step past the ':' that ends the block-size head (or stop at NUL). */
+         delim = strchr(part, ':');
+         part = (delim != NULL) ? delim + 1 : part + strlen(part);
+
+         hash_len = get_real_length(part, sfh_length);
+         if(hash_len < gram_value)
+         {
+             /* Too short: a failure for the first part, ignored for the second. */
+             return (part_no == 0) ? 0 : 1;
+         }
+         grab_key_set(part, hash_len, i, gram_value, gram_cnt, to_process_list, part_blocksize);
+
+         /* Step past the '#' that separates the two parts (or stop at NUL). */
+         delim = strchr(part, '#');
+         part = (delim != NULL) ? delim + 1 : part + strlen(part);
+     }
+     return 1;
+ }
+
+ /*
+  * Release `size` pending-key lists: every node and its key copy, then each
+  * heap-allocated list head; the array slots are reset to NULL.
+  */
+ void free_key_set(struct KL ** to_process_list,int size)
+ {
+     int slot = 0;
+     struct key_list_node * node = NULL;
+
+     for(slot = 0; slot < size; slot++)
+     {
+         /* Pop from the front until this list is empty. */
+         while((node = TAILQ_FIRST(to_process_list[slot])) != NULL)
+         {
+             TAILQ_REMOVE(to_process_list[slot], node, keylistentry);
+             free(node->key);
+             free(node);
+         }
+         free(to_process_list[slot]);
+         to_process_list[slot] = NULL;
+     }
+ }
+
+ /*
+  * Apply a batch of insert / delete operations.
+  *
+  * The update works on copies: the id table is duplicated and modified,
+  * then each index table is duplicated (resolving node owners against the
+  * new id table) and modified with the queued grams. The copies are swapped
+  * into the handle and the previous tables are destroyed after a short
+  * sleep.
+  *
+  * handle:  engine handle from GIE_create.
+  * digests: array of `size` operations; an insert must carry a non-NULL tag
+  *          (asserted), and the tag pointer is stored, not copied.
+  * Returns the number of operations counted as successful.
+  *
+  * Fixes over the previous version:
+  *  - a queued insert gram whose digest is no longer in the id table (for
+  *    example inserted and then deleted within the same batch) is skipped;
+  *    it used to be passed on with a NULL record and dereferenced;
+  *  - grab_ret / gram_cnt are reset for every insert, so an unrecognised
+  *    input format no longer reuses the values of the previous digest.
+  */
+ int GIE_update(GIE_handle_t * handle,GIE_digest_t * * digests,int size)
+ {
+     GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle);
+     struct id_table_data * info = NULL;
+     int success_cnt = 0;
+     int m = 0, i = 0, grab_ret = 0;
+     short gram_cnt = 0;
+     unsigned int input_fh_len = 0;
+     unsigned int gram_value = _handle->user_gram_value;
+     struct KL* to_process_list[HTABLE_NUM];
+
+     MESA_htable_handle htable_index_copy;
+     MESA_htable_handle htable_id_copy;
+     MESA_htable_handle htable_tmp_index=NULL,htable_tmp_id=NULL;
+     struct htable_handle * htable_copied_id_para = (struct htable_handle *)calloc(1,sizeof(struct htable_handle));
+     struct htable_handle * htable_copied_index_para = (struct htable_handle *)calloc(1,sizeof(struct htable_handle));
+     MESA_htable_handle garbage_htable[HTABLE_NUM];
+
+     /* Step 1: private copy of the id table. */
+     htable_copied_id_para->runtime_table = _handle->id_table;
+     htable_copied_id_para->para = NULL;
+     htable_id_copy = copy_htable((void *)htable_copied_id_para, copy_idtable_item_iterate,idtable_free);
+
+     for(m = 0;m < HTABLE_NUM;m++)
+     {
+         to_process_list[m]=(struct KL*)calloc(1,sizeof(struct KL));
+         TAILQ_INIT(to_process_list[m]);
+     }
+
+     /* Step 2: update the id copy and queue the grams of each operation. */
+     for(i = 0; i < size; i++)
+     {
+         switch(digests[i]->operation)
+         {
+             case GIE_INSERT_OPT:
+             {
+                 assert(digests[i]->tag!=NULL);
+                 grab_ret = 0;
+                 gram_cnt = 0;
+                 if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
+                 {
+                     grab_ret = sfh_grab_key_set(digests[i]->sfh,digests[i]->sfh_length,i,gram_value,&gram_cnt,to_process_list);
+                 }
+                 else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
+                 {
+                     grab_ret = grab_key_set(digests[i]->sfh,digests[i]->sfh_length,i,gram_value,&gram_cnt,to_process_list,0);
+                 }
+                 if(grab_ret == 0)
+                 {
+                     continue;
+                 }
+                 else
+                 {
+                     info = (struct id_table_data *)calloc(1,sizeof(struct id_table_data));
+                     input_fh_len = digests[i]->sfh_length;
+                     info->sfh = (char *)calloc(input_fh_len + 1,sizeof(char));
+                     memcpy(info->sfh, digests[i]->sfh, input_fh_len);
+                     _handle->mem_occupy += sizeof(struct id_table_data) + sizeof(char)*(input_fh_len+1);
+                     info->sfh_length = digests[i]->sfh_length;
+                     info->gram_cnt = gram_cnt;
+
+                     /* The tag is borrowed from the caller, not duplicated. */
+                     info->tag = digests[i]->tag;
+
+                     info->id = digests[i]->id;
+                     info->cfds_lvl = digests[i]->cfds_lvl;
+                     if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
+                     {
+                         info->blocksize = get_blocksize_from_head(digests[i]->sfh, digests[i]->sfh_length);
+                     }
+                     else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
+                     {
+                         info->blocksize = 0;
+                     }
+
+                     if(MESA_htable_add(htable_id_copy, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), (const void *)info) < 0)
+                     {
+                         _handle->mem_occupy -= (sizeof(struct id_table_data) + sizeof(char)*(input_fh_len+1));
+                         free(info->sfh);
+                         info->sfh = NULL;
+                         free(info);
+                         info = NULL;
+                         continue;
+                     }
+                 }
+                 success_cnt ++;
+                 break;
+             }
+
+             case GIE_DELETE_OPT:
+             {
+                 struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(htable_id_copy, \
+                     (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id));
+                 if(ret!= NULL)
+                 {
+                     /* Queue the stored digest's grams (the keys are copied)
+                      * before the record is deleted below. */
+                     if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
+                     {
+                         success_cnt += sfh_grab_key_set(ret->sfh,ret->sfh_length,i,gram_value,&gram_cnt,to_process_list);
+                     }
+                     else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
+                     {
+                         success_cnt += grab_key_set(ret->sfh,ret->sfh_length,i,gram_value,&gram_cnt,to_process_list,0);
+                     }
+                 }
+                 else
+                 {
+                     break;
+                 }
+                 if(MESA_htable_del(htable_id_copy, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free) < 0)
+                 {
+                     printf("delete id failed!");
+                     assert(0);
+                 }
+                 break;
+             }
+
+             default:
+                 break;
+         }
+     }
+
+     unsigned int digest_id = 0;
+     struct id_table_data * tmp_info= NULL;
+
+     /* Step 3: copy each index table, apply its queued grams, swap it in. */
+     for(i = 0;i < HTABLE_NUM;i++)
+     {
+         htable_copied_index_para->runtime_table = _handle->index_table[i];
+         htable_copied_index_para->para = htable_id_copy;
+         htable_index_copy = copy_htable((void *)htable_copied_index_para,copy_indextable_item_iterate,indextable_free);
+         struct key_list_node * tmp_node;
+         TAILQ_FOREACH(tmp_node, to_process_list[i], keylistentry)
+         {
+             digest_id = tmp_node->digest_id;
+             if(digests[digest_id]->operation == GIE_INSERT_OPT)
+             {
+                 tmp_info =(struct id_table_data *)MESA_htable_search(htable_id_copy, (const uchar *)(&(digests[digest_id])->id), \
+                     sizeof((digests[digest_id])->id));
+                 if(tmp_info == NULL)
+                 {
+                     printf("id %u not insert\n",digests[digest_id]->id);
+                     /* No record to attach the gram to: skip it. */
+                     continue;
+                 }
+                 if(GIE_insert_indextable(htable_index_copy, tmp_info, tmp_node->key, tmp_node->pos,tmp_node->blocksize) < 0)
+                 {
+                     printf("insert %d indextable failed!\n",digests[digest_id]->id);
+                     continue;
+                 }
+             }
+             else if(digests[digest_id]->operation == GIE_DELETE_OPT)
+             {
+                 if(GIE_delete_from_indextable_by_key(htable_index_copy, tmp_node->key, (digests[digest_id])->id) < 0)
+                 {
+                     printf("delete %d indextable failed!\n",digests[digest_id]->id);
+                     continue;
+                 }
+             }
+         }
+         htable_tmp_index= _handle->index_table[i];
+         _handle->index_table[i] = htable_index_copy;
+         garbage_htable[i]=htable_tmp_index;
+     }
+
+     /* Step 4: swap in the id copy, then retire the previous tables.
+      * NOTE(review): the short sleep looks like a grace period for readers
+      * still using the old tables - confirm the threading model. */
+     htable_tmp_id = _handle->id_table;
+     _handle->id_table = htable_id_copy;
+     usleep(200);
+     MESA_htable_destroy(htable_tmp_id, idtable_free);
+     for(i=0;i<HTABLE_NUM;i++)
+     {
+         MESA_htable_destroy(garbage_htable[i], indextable_free_cnt);
+     }
+     free_key_set(to_process_list,HTABLE_NUM);
+     free(htable_copied_id_para);
+     htable_copied_id_para = NULL;
+     free(htable_copied_index_para);
+     htable_copied_index_para = NULL;
+     return success_cnt;
+ }
+
+
+ /*
+  * Create a new hash table and fill it by iterating an existing one.
+  *
+  * htable_para: struct htable_handle; runtime_table is the table to copy
+  *              from, para is forwarded unchanged to the callback context.
+  * func:        iterate callback that adds a copy of each item to the
+  *              context's runtime_table (which is the new table).
+  * free_fuc:    data_free callback installed on the new table.
+  * Returns the new table. An iterate failure is only reported on stdout.
+  */
+ MESA_htable_handle copy_htable(void * htable_para,void (* func)(const uchar * key, uint size, void * data, void *user),void (*free_fuc)(void * data))
+ {
+     struct htable_handle * source_ctx = (struct htable_handle *)htable_para;
+     struct htable_handle * iterate_ctx = NULL;
+     MESA_htable_handle duplicate;
+     MESA_htable_create_args_t args;
+
+     memset(&args, 0, sizeof(args));
+     args.hash_slot_size = HTABLE_SIZE;
+     args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
+     args.data_free = free_fuc;
+     args.thread_safe = 0;
+     args.max_elem_num = 0;
+     args.expire_time = 0;
+     args.recursive = 0;
+     args.key_comp = NULL;
+     args.key2index = NULL;
+     args.data_expire_with_condition = NULL;
+     duplicate = MESA_htable_create(&args, sizeof(args));
+
+     /* Context handed to the callback: destination table plus lookup table. */
+     iterate_ctx = (struct htable_handle *)calloc(1,sizeof(struct htable_handle));
+     iterate_ctx->runtime_table = duplicate;
+     iterate_ctx->para = source_ctx->para;
+
+     if(MESA_htable_iterate(source_ctx->runtime_table, func, iterate_ctx) == -1)
+     {
+         printf("iterate error!\n");
+     }
+     free(iterate_ctx);
+     return duplicate;
+ }
+
+ /*
+  * Iterate callback that copies one index entry into the table under
+  * construction (user->runtime_table).
+  *
+  * Each node is duplicated and re-pointed at the record with the same id in
+  * user->para (the new id table). Nodes whose id is not found there are
+  * dropped.
+  *
+  * Fixes over the previous version:
+  *  - the copy's counter is the number of nodes actually copied; it used to
+  *    be taken from the source even when nodes were dropped;
+  *  - an entry left with no nodes is not added at all, matching
+  *    GIE_delete_from_indextable_by_key, which deletes entries that become
+  *    empty;
+  *  - the copy is released when allocation or MESA_htable_add fails.
+  */
+ void copy_indextable_item_iterate(const uchar * key, uint size, void * data, void * user)
+ {
+     struct index_table_data * index_data = (struct index_table_data *)data;
+     struct htable_handle * htable_copied_para = (struct htable_handle *)user;
+     struct index_table_data * index_data_copy = NULL;
+     struct TQ * head = NULL;
+     struct linklist_node * tmp_node = NULL;
+     struct linklist_node * node_data = NULL;
+     struct id_table_data * owner = NULL;
+     int i = 0;
+
+     index_data_copy = (struct index_table_data *)calloc(1, sizeof(struct index_table_data));
+     head = (struct TQ *)calloc(1, sizeof(struct TQ));
+     if(index_data_copy == NULL || head == NULL)
+     {
+         free(index_data_copy);
+         free(head);
+         return;
+     }
+     TAILQ_INIT(head);
+     index_data_copy->listhead = head;
+     index_data_copy->cnt = 0;
+
+     TAILQ_FOREACH(tmp_node, index_data->listhead, listentry)
+     {
+         /* Resolve the owner first so dropped nodes cost no allocation. */
+         owner = (struct id_table_data *)MESA_htable_search(htable_copied_para->para, (const uchar *)(&(tmp_node->basicinfo->id)), sizeof(tmp_node->basicinfo->id));
+         if(owner == NULL)
+         {
+             continue;
+         }
+         node_data = (struct linklist_node *)calloc(1,sizeof(struct linklist_node));
+         if(node_data == NULL)
+         {
+             continue;
+         }
+         node_data->size = tmp_node->size;
+         node_data->position = (short *)calloc(node_data->size, sizeof(short));
+         if(node_data->position == NULL)
+         {
+             free(node_data);
+             continue;
+         }
+         for(i = 0;i < tmp_node->index;i++)
+         {
+             node_data->position[i] = tmp_node->position[i];
+         }
+         node_data->basicinfo = owner;
+         node_data->index = tmp_node->index;
+         node_data->blocksize = tmp_node->blocksize;
+         TAILQ_INSERT_TAIL(head, node_data, listentry);
+         index_data_copy->cnt++;
+     }
+
+     if(index_data_copy->cnt == 0)
+     {
+         indextable_free(index_data_copy);
+         return;
+     }
+     if(MESA_htable_add(htable_copied_para->runtime_table, key, size, (const void *)index_data_copy) < 0)
+     {
+         indextable_free(index_data_copy);
+     }
+ }
+ //TODO: Use the original value instead of making a duplicate, to be faster.
+ /*
+  * Iterate callback that duplicates one id-table record into the table
+  * under construction (user->runtime_table), keyed by the record's id.
+  * The tag pointer is shared with the source record; the digest string is
+  * deep-copied.
+  *
+  * Fixes over the previous version:
+  *  - the string copy gets sfh_length + 1 zeroed bytes, so it stays
+  *    NUL-terminated like the buffer GIE_update allocates; the old copy had
+  *    no terminator although the scanners in this file stop only at '\0';
+  *  - the copy is released when allocation or MESA_htable_add fails.
+  */
+ void copy_idtable_item_iterate(const uchar * key, uint size, void * data, void * user)
+ {
+     struct id_table_data * id_data = (struct id_table_data *)data;
+     struct htable_handle * htable_para = (struct htable_handle *)user;
+     struct id_table_data * id_data_copy = NULL;
+     size_t sfh_bytes = 0;
+
+     assert(id_data->tag!=NULL);
+     id_data_copy = (struct id_table_data *)calloc(1, sizeof(struct id_table_data));
+     if(id_data_copy == NULL)
+     {
+         return;
+     }
+     memcpy(id_data_copy,id_data,sizeof(struct id_table_data));
+
+     sfh_bytes = (id_data->sfh_length > 0) ? (size_t)id_data->sfh_length : 0;
+     id_data_copy->sfh = (char *)calloc(sfh_bytes + 1,sizeof(char));
+     if(id_data_copy->sfh == NULL)
+     {
+         free(id_data_copy);
+         return;
+     }
+     memcpy(id_data_copy->sfh,id_data->sfh,sfh_bytes);
+
+     if(MESA_htable_add(htable_para->runtime_table, (const uchar *)(&(id_data_copy->id)), sizeof(id_data_copy->id), (const void *)id_data_copy) < 0)
+     {
+         idtable_free(id_data_copy);
+     }
+ }
+
+
+
+
+ /*
+  * Record one occurrence of gram `key` at position `index` for digest `info`.
+  *
+  * If the key already has an entry, its node list is kept ordered by digest
+  * id: the new node goes before the first node with a larger id, or the
+  * position is appended to an existing node with the same id and block
+  * size, or the node is appended at the tail. An entry that reaches CNT_MAX
+  * nodes is pruned through indextable_delete_with_threshold. If the key has
+  * no entry, one is created and added to the table.
+  *
+  * Returns 0 on success, -1 on failure.
+  *
+  * Fixes over the previous version:
+  *  - a NULL info or key is rejected instead of being dereferenced;
+  *  - allocation results are checked; realloc no longer overwrites the only
+  *    pointer to the position array, and the capacity (a short) is not
+  *    doubled past the range of short;
+  *  - the new entry is released when MESA_htable_add fails.
+  */
+ int GIE_insert_indextable(MESA_htable_handle htable_copy, struct id_table_data * info, char * key, unsigned int index, unsigned long long blocksize)
+ {
+     int key_length = 0;
+     struct linklist_node * node_data = NULL;
+     struct index_table_data * ret = NULL;
+
+     if(info == NULL || key == NULL)
+     {
+         return -1;
+     }
+     key_length = strnlen(key,KEY_MAX_LENGTH);
+
+     node_data = (struct linklist_node *)calloc(1,sizeof(struct linklist_node));
+     if(node_data == NULL)
+     {
+         return -1;
+     }
+     node_data->size = GRAM_CNT_MAX;
+     node_data->position = (short *)calloc(node_data->size, sizeof(short));
+     if(node_data->position == NULL)
+     {
+         free(node_data);
+         return -1;
+     }
+     node_data->basicinfo = info;
+     node_data->index = 0;
+     node_data->position[(node_data->index)++] = index;
+     node_data->blocksize = blocksize;
+
+     ret = (struct index_table_data *)(MESA_htable_search(htable_copy, \
+         (const uchar *)(key), key_length));
+
+     if(ret != NULL)
+     {
+         struct linklist_node * tmp = NULL;
+         TAILQ_FOREACH(tmp, ret->listhead, listentry)
+         {
+             if(tmp->basicinfo->id > info->id)
+             {
+                 TAILQ_INSERT_BEFORE(tmp, node_data, listentry);
+                 ret->cnt ++;
+                 if(ret->cnt >= CNT_MAX)
+                 {
+                     indextable_delete_with_threshold(htable_copy,ret,key);
+                 }
+                 return 0;
+             }
+             if(tmp->basicinfo->id == info->id && tmp->blocksize == blocksize)
+             {
+                 int status = 0;
+                 if(tmp->index >= tmp->size)
+                 {
+                     /* 32767 is the largest value a short is guaranteed to hold. */
+                     int grown_size = (int)tmp->size * 2;
+                     short * grown = NULL;
+                     if(grown_size > 0 && grown_size <= 32767)
+                     {
+                         grown = (short *)realloc(tmp->position, (size_t)grown_size * sizeof(short));
+                     }
+                     if(grown == NULL)
+                     {
+                         status = -1;
+                     }
+                     else
+                     {
+                         tmp->position = grown;
+                         tmp->size = (short)grown_size;
+                     }
+                 }
+                 if(status == 0)
+                 {
+                     tmp->position[(tmp->index)++] = index;
+                 }
+                 /* The position went into the existing node; drop the new one. */
+                 free(node_data->position);
+                 node_data->position = NULL;
+                 free(node_data);
+                 node_data = NULL;
+                 return status;
+             }
+         }
+         TAILQ_INSERT_TAIL(ret->listhead, node_data, listentry);
+         ret->cnt ++;
+         if(ret->cnt >= CNT_MAX)
+         {
+             indextable_delete_with_threshold(htable_copy,ret,key);
+         }
+     }
+     else
+     {
+         struct index_table_data * index_data = (struct index_table_data *)calloc(1, sizeof(struct index_table_data));
+         struct TQ * head = (struct TQ *)calloc(1, sizeof(struct TQ));
+         if(index_data == NULL || head == NULL)
+         {
+             free(index_data);
+             free(head);
+             free(node_data->position);
+             free(node_data);
+             return -1;
+         }
+
+         index_data->listhead = head;
+         index_data->cnt = 0;
+
+         TAILQ_INIT(head);
+         TAILQ_INSERT_TAIL(head, node_data, listentry);
+         index_data->cnt++;
+         if(MESA_htable_add(htable_copy, (const uchar *)(key), key_length, (const void *)index_data) < 0)
+         {
+             printf("add index_table failed!\n");
+             assert(0);
+             /* Reached only when asserts are compiled out: do not leak. */
+             indextable_free(index_data);
+             return -1;
+         }
+     }
+     return 0;
+ }
+
+
+
+ /*
+  * Remove the index nodes of one digest directly from the live tables.
+  *
+  * The digest is looked up in the id table, and every gram of the hash text
+  * between the first ':' and the first '[' is removed from the index table
+  * of its bucket. The only call site in this file is commented out.
+  *
+  * Returns -1 when the id is unknown, otherwise 1.
+  *
+  * Fix over the previous version: the engine handle itself was passed to
+  * GIE_delete_from_indextable_by_key where an index table handle is
+  * expected. The table is now selected the same way the grams are filed:
+  * (sum of key bytes) % HTABLE_NUM.
+  *
+  * NOTE(review): the id-table record is not removed here and only the first
+  * part of an SFH digest is walked - confirm whether that is intended.
+  */
+ int GIE_delete(GIE_handle_inner_t * _handle, GIE_digest_t * digest)
+ {
+     int success_cnt = 0;
+     struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(_handle->id_table, \
+         (const uchar *)(&(digest->id)), sizeof(digest->id));
+     if(ret == NULL)
+     {
+         printf("del %d doesn't exist!\n",digest->id);
+         return -1;
+     }
+     else
+     {
+         unsigned int gram_value = _handle->user_gram_value;
+         char key[gram_value+1];
+         char * tmp_gram = ret->sfh;
+         unsigned int real_length = 0;
+         unsigned int gram_cnt = 0;
+         unsigned int byte_sum = 0;
+         unsigned int k = 0, j = 0;
+
+         /* Skip the block-size head, up to and including ':'. */
+         while(*tmp_gram != '\0')
+         {
+             if(*tmp_gram == ':')
+             {
+                 tmp_gram++;
+                 break;
+             }
+             tmp_gram++;
+         }
+         real_length = get_real_length(tmp_gram, ret->sfh_length);
+         if(gram_value > 0 && real_length >= gram_value)
+         {
+             gram_cnt = real_length - gram_value + 1;
+         }
+         for(k = 0; k < gram_cnt; k++)
+         {
+             memset(key, '\0', gram_value+1);
+             memcpy(key, tmp_gram++, gram_value);
+             byte_sum = 0;
+             for(j = 0; j < gram_value; j++)
+             {
+                 byte_sum += (unsigned char)key[j];
+             }
+             if(GIE_delete_from_indextable_by_key(_handle->index_table[byte_sum % HTABLE_NUM], key, digest->id) < 0)
+             {
+                 printf("delete %d indextable failed!\n",digest->id);
+                 continue;
+             }
+         }
+         success_cnt++;
+     }
+
+     return success_cnt;
+ }
+
+
+
+ /*
+  * Remove every node that belongs to digest `id` from the entry stored
+  * under `key`, and delete the entry itself once its list is empty.
+  *
+  * Returns 0 on success (including "key not present"), -1 when the table
+  * refuses to delete the emptied entry.
+  *
+  * Fix over the previous version: after the first removal the loop cursor
+  * was set to NULL and never advanced, so the walk stopped and any further
+  * node of the same id (for example one with another block size) was left
+  * behind. The walk now continues with the saved successor and stops
+  * explicitly once the entry has been deleted.
+  */
+ int GIE_delete_from_indextable_by_key(MESA_htable_handle htable, char * key, unsigned int id)
+ {
+     int key_length = strnlen(key,KEY_MAX_LENGTH);
+     struct linklist_node * tmp = NULL;
+     struct linklist_node * linklist_tmp = NULL;
+     struct index_table_data * ret = (struct index_table_data *)(MESA_htable_search(htable, \
+         (const uchar *)(key), key_length));
+     if(ret == NULL)
+     {
+         return 0;
+     }
+
+     tmp = TAILQ_FIRST(ret->listhead);
+     while(tmp != NULL)
+     {
+         /* Save the successor: tmp may be freed below. */
+         linklist_tmp = TAILQ_NEXT(tmp, listentry);
+         if(tmp->basicinfo->id != id)
+         {
+             tmp = linklist_tmp;
+             continue;
+         }
+         TAILQ_REMOVE(ret->listhead, tmp, listentry);
+         ret->cnt--;
+         free(tmp->position);
+         tmp->position = NULL;
+         free(tmp);
+         tmp = NULL;
+         if(TAILQ_EMPTY(ret->listhead))
+         {
+             /* NOTE(review): presumably the table releases `ret` through
+              * indextable_free here, so it must not be touched afterwards. */
+             if(MESA_htable_del(htable, (const uchar *)(key), key_length, indextable_free) < 0)
+             {
+                 printf("indextable backtrack delete error!\n");
+                 assert(0);
+                 return -1;
+             }
+             return 0;
+         }
+         tmp = linklist_tmp;
+     }
+     return 0;
+ }
+
+
+
+
+ /*
+  * Three-way comparison of two unsigned ints using the wrap-around ordering
+  * of before() / after(); the signature matches a qsort-style comparator.
+  * Returns -1 when *a precedes *b, 1 when it follows, 0 otherwise.
+  */
+ int GIE_cmp(const void * a, const void * b)
+ {
+     const unsigned int lhs = *(unsigned int *)a;
+     const unsigned int rhs = *(unsigned int *)b;
+
+     if(before(lhs, rhs))
+     {
+         return -1;
+     }
+     return after(lhs, rhs) ? 1 : 0;
+ }
+
+
+ /*
+  * Count the characters of `string` that precede the first '[' or the
+  * terminating NUL, whichever comes first.
+  * The `length` argument is accepted but not consulted.
+  */
+ inline unsigned int get_real_length(const char * string, unsigned int length)
+ {
+     unsigned int span = 0;
+
+     for(span = 0; string[span] != '\0' && string[span] != '['; span++)
+     {
+         /* nothing to do: the loop header does the scanning */
+     }
+     return span;
+ }
+
+
+ /*
+  * Look up one contiguous piece of a query in the index tables.
+  *
+  * The piece is cut into consecutive, non-overlapping grams. For the gram
+  * with chunk number i (starting at index_begin) every node of its index
+  * entry is examined; the digest id is appended to *id_union when the
+  * digest's block size equals `blocksize` and one of the node's positions
+  * lies within position_accuracy of i * gram_value. The scan stops at the
+  * first gram that has no index entry.
+  *
+  * id_union / union_index / union_size: growable result array, number of
+  *   used slots, and capacity; the array may be reallocated.
+  * Returns the number of grams the piece was cut into (0 when it is shorter
+  * than one gram).
+  *
+  * Fixes over the previous version:
+  *  - key bytes are summed as unsigned char, so bytes >= 0x80 cannot give a
+  *    negative table index (other keys keep their bucket, since 256 is a
+  *    multiple of HTABLE_NUM);
+  *  - the result of realloc is checked and the old array is kept on
+  *    failure; a capacity of 0 now grows instead of staying 0;
+  *  - a gram length of 0 or a negative piece length returns 0 instead of
+  *    dividing by zero or wrapping.
+  */
+ inline int GIE_part_query(GIE_handle_inner_t * _handle, const char * query_string, int index_begin, int part_query_len,unsigned int ** id_union, unsigned int * union_index, unsigned int * union_size, unsigned long long blocksize)
+ {
+     unsigned int gram_value = _handle->user_gram_value;
+     unsigned int position_accuracy = _handle->user_position_accuracy;
+     unsigned int real_length = 0;
+     unsigned int chunk_count_max = 0;
+     unsigned int byte_sum = 0;
+     unsigned int tmp_min = 0;
+     unsigned int k = 0;
+     int i = 0, j = 0;
+     struct index_table_data * ret = NULL;
+     struct linklist_node * tmp_node_t = NULL;
+
+     if(gram_value == 0 || part_query_len <= 0)
+     {
+         return 0;
+     }
+     real_length = (unsigned int)part_query_len;
+     if(real_length < gram_value)
+     {
+         return 0;
+     }
+     chunk_count_max = real_length/gram_value;
+
+     char key[gram_value+1];
+
+     for(i = index_begin; i < chunk_count_max + index_begin; i++)
+     {
+         byte_sum = 0;
+         memset(key,'\0',gram_value+1);
+         memcpy(key, query_string, gram_value);
+         for(k = 0; k < gram_value; k++)
+         {
+             byte_sum += (unsigned char)key[k];
+         }
+         ret = (struct index_table_data *) MESA_htable_search(_handle->index_table[byte_sum % HTABLE_NUM], \
+             (const uchar *)(key), strnlen(key,gram_value));
+         query_string = query_string + gram_value;
+
+         if(ret ==NULL)
+         {
+             break;
+         }
+
+         tmp_node_t = NULL;
+         TAILQ_FOREACH(tmp_node_t, ret->listhead, listentry)
+         {
+             /* Lower bound of the accepted position window, clamped at 0. */
+             tmp_min = 0;
+             if(i*gram_value >= position_accuracy)
+             {
+                 tmp_min = i*gram_value - position_accuracy;
+             }
+             for(j = 0; j < tmp_node_t->index; j++)
+             {
+                 if((blocksize == tmp_node_t->basicinfo->blocksize) && (tmp_node_t->position[j] >= tmp_min) && (tmp_node_t->position[j] <= i*gram_value + position_accuracy))
+                 {
+                     if((*union_index) >= (*union_size))
+                     {
+                         unsigned int grown_size = (*union_size != 0) ? (*union_size) * 2 : UNION_INIT_SIZE;
+                         unsigned int * grown = (unsigned int *)realloc(*id_union, (size_t)grown_size * sizeof(unsigned int));
+                         if(grown == NULL)
+                         {
+                             /* Out of memory: keep what was collected so far. */
+                             return chunk_count_max;
+                         }
+                         *id_union = grown;
+                         *union_size = grown_size;
+                     }
+                     (*id_union)[(*union_index)] = tmp_node_t->basicinfo->id;
+                     (*union_index)++;
+                     /* One hit per node per gram is enough. */
+                     break;
+                 }
+             }
+         }
+     }
+     return chunk_count_max;
+ }
+
+ /*
+  * Walk one part of a fuzzy-hash query and look up each of its pieces.
+  *
+  * After the first ':' the string is read as a sequence of
+  * "<text>[<left>:<right>]" pieces. For each piece the starting chunk index
+  * is left / blocksize (minus TOLERENCE_SIZE, not below 0) and the text is
+  * handed to GIE_part_query; its return value is added to *chunk_cnt.
+  * Parsing stops at the first piece without a well-formed "[l:r]" suffix.
+  *
+  * Returns the summed length of the pieces that were looked up, or 0 when
+  * the string has no ':'.
+  *
+  * Fixes over the previous version:
+  *  - a block size of 0 returns 0 instead of dividing by zero;
+  *  - a negative left offset ends parsing instead of being converted to a
+  *    huge unsigned chunk index.
+  */
+ inline int GIE_gram_with_position(GIE_handle_inner_t * _handle, unsigned long long query_blocksize, const char * fuzzy_string, unsigned int ** id_union,
+     unsigned int * union_index,unsigned int * union_size, unsigned int * chunk_cnt)
+ {
+     const char * tmpstr = fuzzy_string;
+     const char * query_string_begin;
+     unsigned long long blocksize = query_blocksize;
+     unsigned long long first_chunk = 0;
+     int part_query_len = 0;
+     int query_actual_len = 0;
+     char *p = NULL;
+
+     if(blocksize == 0)
+     {
+         return 0;
+     }
+     while(*tmpstr != ':'&& *tmpstr != '\0')
+     {
+         tmpstr ++;
+     }
+     if(*tmpstr != ':')
+     {
+         return 0;
+     }
+     tmpstr ++;
+     query_string_begin = tmpstr;
+
+     while((*query_string_begin) != '\0')
+     {
+         int left = 0;
+         int right = 0;
+         int index_begin = 0;
+
+         p=strchr(query_string_begin,'[');
+         if(p == NULL)
+         {
+             break;
+         }
+         part_query_len = p-query_string_begin;
+         if(sscanf(p,"[%d:%d]",&left,&right) != 2 || left < 0)
+         {
+             break;
+         }
+         p=strchr(p,']');
+         if(p == NULL)
+         {
+             break;
+         }
+         first_chunk = (unsigned long long)left/blocksize;
+         index_begin = (first_chunk > TOLERENCE_SIZE) ? (int)(first_chunk - TOLERENCE_SIZE) : 0;
+         (*chunk_cnt) += GIE_part_query(_handle,query_string_begin,index_begin, part_query_len,
+             id_union, union_index, union_size, blocksize);
+         query_actual_len += part_query_len;
+         query_string_begin = p+1;
+     }
+     return query_actual_len;
+ }
+
+ /*
+  * Block size for an input of `orilen` bytes: BLOCKSIZE_MIN times the
+  * largest power of two that does not exceed orilen / (64 * BLOCKSIZE_MIN),
+  * using integer division. Returns 0 when that quotient is 0.
+  *
+  * Computed with integer shifts. The previous floor(log(x) / log(2))
+  * formulation depended on floating-point rounding at exact powers of two
+  * and on log(0) evaluating to -infinity for small inputs.
+  */
+ inline unsigned long long calc_fh_blocksize(unsigned long long orilen)
+ {
+     unsigned long long units = orilen / (64ULL * BLOCKSIZE_MIN);
+     unsigned long long blocksize = BLOCKSIZE_MIN;
+
+     if(units == 0)
+     {
+         return 0;
+     }
+     /* Double the block size once for every halving of the quotient. */
+     while(units > 1)
+     {
+         units >>= 1;
+         blocksize <<= 1;
+     }
+     return blocksize;
+ }
+
+ /*
+  * Parse the decimal block size at the head of a fuzzy-hash string.
+  *
+  * Copies characters up to the first ':' or NUL, at most str_len of them,
+  * and converts the copy to a number. Returns 0 when no number is found.
+  *
+  * Fixes over the previous version:
+  *  - the copy always keeps room for the terminating NUL; a head of 100 or
+  *    more characters used to fill the buffer completely and the conversion
+  *    read past it;
+  *  - str_len is tested before the first byte is read;
+  *  - strtoull replaces atoi, so block sizes larger than INT_MAX are
+  *    converted correctly.
+  */
+ inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, unsigned int str_len)
+ {
+     char blk[100];
+     size_t used = 0;
+
+     memset(blk,'\0',sizeof(blk));
+     while(used < sizeof(blk) - 1 && str_len != 0
+           && fuzzy_string[used] != '\0' && fuzzy_string[used] != ':')
+     {
+         blk[used] = fuzzy_string[used];
+         used++;
+         str_len--;
+     }
+     return strtoull(blk, NULL, 10);
+ }
+ /*
+  * Similarity of two plain strings on a 0..100 scale:
+  * 100 - (edit distance * 100) / (len1 + len2).
+  *
+  * Negative lengths are treated as 0. Two empty strings are reported as
+  * identical (100); the formula previously divided by zero for that input.
+  */
+ int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2)
+ {
+     int edit_distance = 0;
+
+     if(len1 < 0)
+     {
+         len1 = 0;
+     }
+     if(len2 < 0)
+     {
+         len2 = 0;
+     }
+     if(len1 == 0 && len2 == 0)
+     {
+         return 100;
+     }
+     edit_distance = edit_distn(str1, len1, str2, len2);
+     return 100 - (edit_distance*100)/(len1 + len2);
+ }
+
+ /*
+  * Similarity (0..100) between a query digest (sfh1) and an indexed digest
+  * (sfh2).
+  *
+  * Each digest is read as up to two parts, "<blocksize>:<hash text>",
+  * separated by '#'. For the first pair of parts with equal block size, the
+  * query part's "<text>[l:r]" pieces are spliced together (text only) and
+  * compared with the indexed part's text up to its first '[':
+  * 100 - (edit distance * 100) / (indexed length + spliced length).
+  * Returns 0 when no pair of parts shares a block size.
+  *
+  * Fixes over the previous version:
+  *  - NULL inputs, a non-positive len1 and a failed allocation return 0
+  *    instead of writing through an unusable buffer;
+  *  - the splice buffer has room for a terminating NUL, and a piece that
+  *    would not fit in len1 bytes ends the splice instead of overflowing;
+  *  - a pair with no text on either side returns 0 (nothing to compare)
+  *    instead of dividing by zero.
+  */
+ int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2)
+ {
+     int j = 0, t = 0;
+     unsigned long long query_blocksize = 0, index_blocksize = 0;
+     unsigned int query_real_length = 0, index_real_length = 0;
+     const char *query_gram_begin = sfh1;
+     const char *index_gram_begin = sfh2;
+     char *splice_str = NULL;
+     char *spli_str_begin = NULL;
+     int edit_distance = 0;
+     int total_len = 0;
+     int ret = 0;
+     char *p = NULL;
+     int splice_len = 0;
+
+     if(sfh1 == NULL || sfh2 == NULL || len1 <= 0)
+     {
+         return 0;
+     }
+     splice_str = (char *)calloc((size_t)len1 + 1, sizeof(char));
+     if(splice_str == NULL)
+     {
+         return 0;
+     }
+     spli_str_begin = splice_str;
+
+     for(j = 0; j < 2; j++)
+     {
+         /* Indexed part: block size, then the text after ':'. */
+         index_blocksize = get_blocksize_from_head(index_gram_begin, len2);
+         while((*index_gram_begin) != '\0')
+         {
+             if((*index_gram_begin) == ':')
+             {
+                 index_gram_begin++;
+                 break;
+             }
+             index_gram_begin++;
+         }
+         index_real_length = get_real_length(index_gram_begin, len2);
+
+         query_gram_begin = sfh1;
+         for(t = 0; t < 2; t++)
+         {
+             /* Query part: block size, then the text after ':'. */
+             query_blocksize = get_blocksize_from_head(query_gram_begin, len1);
+             while((*query_gram_begin) != '\0')
+             {
+                 if((*query_gram_begin) == ':')
+                 {
+                     query_gram_begin++;
+                     break;
+                 }
+                 query_gram_begin++;
+             }
+             if(query_blocksize == index_blocksize)
+             {
+                 /* Splice the text of every "<text>[l:r]" piece together. */
+                 while((*query_gram_begin) != '#' && (*query_gram_begin) != '\0')
+                 {
+                     p=strchr(query_gram_begin,'[');
+                     if(p == NULL)
+                     {
+                         break;
+                     }
+                     query_real_length = p-query_gram_begin;
+                     p=strchr(p,']');
+                     if(p == NULL)
+                     {
+                         break;
+                     }
+                     if((size_t)(spli_str_begin - splice_str) + query_real_length > (size_t)len1)
+                     {
+                         break;
+                     }
+                     memcpy(spli_str_begin,query_gram_begin,query_real_length);
+                     spli_str_begin += query_real_length;
+                     query_gram_begin = p+1;
+                 }
+                 splice_len = strnlen(splice_str,len1);
+                 total_len = (int)index_real_length + splice_len;
+                 if(total_len <= 0)
+                 {
+                     free(splice_str);
+                     return 0;
+                 }
+                 edit_distance = edit_distn(index_gram_begin, index_real_length, splice_str, splice_len);
+                 ret = 100-(edit_distance*100)/total_len;
+                 free(splice_str);
+                 return ret;
+             }
+             /* Block sizes differ: move to the query part after '#'. */
+             while(*query_gram_begin != '\0')
+             {
+                 if(*query_gram_begin == '#')
+                 {
+                     query_gram_begin++;
+                     break;
+                 }
+                 query_gram_begin++;
+             }
+         }
+         /* Move to the indexed part after '#'. */
+         while(*index_gram_begin != '\0')
+         {
+             if(*index_gram_begin == '#')
+             {
+                 index_gram_begin++;
+                 break;
+             }
+             index_gram_begin++;
+         }
+     }
+     free(splice_str);
+     return 0;
+ }
+
+
+
+
+/*
+ * Query the index for entries similar to `data`.
+ *
+ * Candidate IDs are gathered (per SFH part, or over the whole plain string),
+ * sorted, and grouped into runs of equal IDs. Each run is scored either from
+ * its hit count or, when ED_reexamine is 1, by the edit-distance based
+ * similarity functions. Entries whose score reaches their own cfds_lvl are
+ * written to `results`.
+ *
+ * handle:      index handle.
+ * data:        query string (SFH or plain text, depending on the handle's input_format).
+ * data_len:    length of `data`.
+ * results:     output array with room for `result_size` entries.
+ * result_size: capacity of `results`.
+ * Returns the number of entries written to `results`; 0 when nothing matches,
+ * when an SFH part has no parsable block size, when an argument is unusable,
+ * or when memory cannot be allocated.
+ */
+int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size)
+{
+    GIE_handle_inner_t * _handle = (GIE_handle_inner_t *) handle;
+    int j = 0;
+    unsigned int idx = 0;
+    unsigned int union_index = 0;
+    unsigned int gram_value = 0;
+    unsigned int query_actual_len = 0;
+    unsigned int union_size = UNION_INIT_SIZE;
+    unsigned int chunk_cnt = 0;
+    const char *fuzzy_string_begin = data;
+    unsigned int * id_union = NULL;
+    unsigned long long query_blocksize = 0;
+    unsigned int fuzzy_string_len = (unsigned int)data_len;
+    unsigned int current_id = 0;
+    unsigned int count = 0;
+    struct id_table_data * ret_tmp = NULL;
+    short conf = 0;
+    int ret_size = 0;
+
+    /* Without a handle, a query, or any room for results there is nothing to do. */
+    if(_handle == NULL || data == NULL || results == NULL || result_size <= 0)
+    {
+        return 0;
+    }
+    gram_value = _handle->user_gram_value;
+
+    id_union = (unsigned int *)calloc(union_size, sizeof(unsigned int));
+    if(id_union == NULL)
+    {
+        return 0;
+    }
+
+    if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
+    {
+        for(j = 0;j < 2;j++)
+        {
+            query_blocksize = get_blocksize_from_head(fuzzy_string_begin, fuzzy_string_len);
+            if(query_blocksize == 0)
+            {
+                /* Release the candidate array before bailing out. */
+                free(id_union);
+                id_union = NULL;
+                return 0;
+            }
+            query_actual_len += GIE_gram_with_position(_handle, query_blocksize, fuzzy_string_begin, &id_union, &union_index, &union_size, &chunk_cnt);
+            /* Step to the part that follows the next '#', if any. */
+            while(*fuzzy_string_begin != '#' && *fuzzy_string_begin != '\0')
+            {
+                fuzzy_string_begin++;
+            }
+            if(*fuzzy_string_begin == '#')
+            {
+                fuzzy_string_begin++;
+            }
+        }
+    }
+    else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
+    {
+        query_actual_len = fuzzy_string_len;
+        chunk_cnt = GIE_part_query(_handle, fuzzy_string_begin, 0, query_actual_len, &id_union, &union_index, &union_size, 0);
+    }
+
+    if(union_index == 0)
+    {
+        free(id_union);
+        id_union = NULL;
+        return 0;
+    }
+
+    /* Sorting makes equal IDs adjacent so each run can be counted in one pass. */
+    qsort(id_union, union_index, sizeof(id_union[0]), GIE_cmp);
+
+    current_id = id_union[0];
+    /* The extra iteration at idx == union_index flushes the final run. */
+    for(idx = 0; idx <= union_index; idx++)
+    {
+        if(idx != union_index && id_union[idx] == current_id)
+        {
+            count++;
+            continue;
+        }
+
+        ret_tmp = (struct id_table_data *) MESA_htable_search(_handle->id_table, \
+                (const uchar *)(&(current_id)), sizeof(current_id));
+
+        if(ret_tmp == NULL)
+        {
+            break;
+        }
+
+        if(ret_tmp->gram_cnt == 0||chunk_cnt == 0)
+        {
+            conf = 0;
+        }
+        else
+        {
+            conf = (count*(query_actual_len-gram_value+1)*10)/(chunk_cnt*(ret_tmp->gram_cnt));
+        }
+
+        /* Optional re-examination replaces the hit-count score with an edit-distance score. */
+        if(_handle->ED_reexamine == 1)
+        {
+            if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
+            {
+                conf = GIE_sfh_similiarity(data, fuzzy_string_len, ret_tmp->sfh, ret_tmp->sfh_length);
+            }
+            else
+            {
+                conf = GIE_string_similiarity(data, fuzzy_string_len, ret_tmp->sfh, ret_tmp->sfh_length);
+            }
+        }
+
+        if(conf >= ret_tmp->cfds_lvl)
+        {
+            results[ret_size].cfds_lvl = conf;
+            results[ret_size].id = current_id;
+            /* The tag pointer is shared with the index entry, not copied. */
+            results[ret_size].tag = ret_tmp->tag;
+            ret_size++;
+        }
+
+        if(ret_size == result_size)
+        {
+            break;
+        }
+
+        /* On the flush iteration there is no further element to read. */
+        if(idx == union_index)
+        {
+            break;
+        }
+
+        current_id = id_union[idx];
+        count = 1;
+    }
+
+    free(id_union);
+    id_union = NULL;
+    return ret_size;
+}
+
+
+/*
+ * Report a status value of the index.
+ *
+ * handle: index handle.
+ * type:   which value to report; only MEM_OCCUPY is recognised.
+ * Returns the handle's mem_occupy field for MEM_OCCUPY, and 0 for any other type.
+ */
+unsigned long long GIE_status(GIE_handle_t * handle, int type)
+{
+    const GIE_handle_inner_t * inner = (const GIE_handle_inner_t *)handle;
+
+    if(type == MEM_OCCUPY)
+    {
+        return inner->mem_occupy;
+    }
+
+    /* Unknown status types report 0. */
+    return 0;
+}
+
diff --git a/src/get_td_mistake_lost/new_TD.conf b/src/get_td_mistake_lost/new_TD.conf
new file mode 100644
index 0000000..be9301e
--- /dev/null
+++ b/src/get_td_mistake_lost/new_TD.conf
@@ -0,0 +1,3 @@
+[file]
+ripe_files_address = ../data/ripe_data/td_data_20171207/new_TD.txt
+raw_file_address = ../data/ripe_data/td_data_20171207/all_av_digest
diff --git a/src/get_td_mistake_lost/new_TD.py b/src/get_td_mistake_lost/new_TD.py
new file mode 100644
index 0000000..5b7269f
--- /dev/null
+++ b/src/get_td_mistake_lost/new_TD.py
@@ -0,0 +1,34 @@
+#-*-coding:utf-8-*-
+import re
+import random
+import ConfigParser
+import bisect
+import commands
+import os
+import hashlib
+
+config = ConfigParser.RawConfigParser()
+config.read("file_digest.conf")
+raw_file_address=config.get("new_td","raw_file_address")
+ripe_files_address=config.get("new_td","ripe_files_address")
+print ("%s %s" %(raw_file_address,ripe_files_address))
+
+def get_md5_value(td_string):
+ my_md5 = hashlib.md5()
+ my_md5.update(td_string)
+ my_md5_string=str(my_md5.hexdigest())
+ return my_md5_string
+
+i=0
+with open(raw_file_address,'r') as infile:
+ with open(ripe_files_address,'w')as outfile:
+ for line in infile:
+ i+=1
+ if(i%100000==0):
+ print i;
+ data_line_val = re.split(r';',line)
+ data_set = re.split(r"URL:|ServerIP:|MediaType:|MediaLen:|Etag:|LastModify:",data_line_val[4])
+ td_string=str("url"+data_set[1]+"MediaType:"+data_set[3]+"MediaLen:"+data_set[4] \
+ +"Etag:"+data_set[5]+"LastModify:"+data_set[6]+"td_data_md5_32k:"+data_line_val[16])
+ new_td=get_md5_value(td_string)
+ outfile.write(td_string+";"+new_td+";"+data_line_val[19]+"\n") \ No newline at end of file