From b711b50d356ffc09569d4f11ba2a0cae41045510 Mon Sep 17 00:00:00 2001 From: chenzizhan Date: Tue, 9 Jul 2024 15:13:14 +0800 Subject: spread sketch wip --- src/tags/spread_sketch.c | 339 +++++++++++++++++++++++++++++++++++++++++++++++ src/tags/spread_sketch.h | 40 ++++++ 2 files changed, 379 insertions(+) create mode 100644 src/tags/spread_sketch.c create mode 100644 src/tags/spread_sketch.h diff --git a/src/tags/spread_sketch.c b/src/tags/spread_sketch.c new file mode 100644 index 0000000..1a44d6a --- /dev/null +++ b/src/tags/spread_sketch.c @@ -0,0 +1,339 @@ +#include +#include +#include +#include +#include +#include + +#include "xxhash/xxhash.h" +#include "uthash.h" + +#include "spread_sketch.h" +#include "exdata.h" + +/* +方案1,smart ptr。 +省内存,更符合对cell manager 类结构的期待。 +额外增加一个额外的管理smart ptr的结构体。不过,总体来说修改量不算大,比如merge和add操作基本保持原样,仅仅是对key 的malloc 和free 做修改。 +对dummy 特殊情况的支持更容易。 + +根据测试结果,如果是每次reset bucket 的时候都重置,误差都仅仅是微微增加,如果是每次所有sketch中的key 都失去索引才删除,这个影响只会更小。可以用。 +*/ + +/* +方案2,把exdata 放入bucket 里,每个bucket一份。 + +可以保留spread sketch 的风味,不会引入新的误差。 +根据实验情况,大概会多占用一半的内存,因为根据测试的经验,保存的key 总量是bucket 总数的2/3左右。哈希表本身的HH handle 也占内存,这部分反而可以节约回去。 +会让cell 的操作变得麻烦,无法借用老的cell manager 流程,get exdata 会得到一个exdata 的数组(可能多个),而非单独的一个,要对多个cell综合起来当一个cell 看。。修改量非常小,但是确实会影响代码本身的整洁度。 + +*/ +struct smart_ptr { // todo:entry + int ref_count; + void *exdata; + + char *key; + size_t key_len; + UT_hash_handle hh; +}; + +struct smart_ptr_table { + struct smart_ptr *table; +}; + +struct bucket { + struct smart_ptr *content; + uint32_t level; +}; + +struct spread_sketch { + int depth; + int width; + + exdata_new_cb new_fn; + exdata_free_cb free_fn; + exdata_merge_cb merge_fn; + exdata_reset_cb reset_fn; + exdata_copy_cb copy_fn; + + struct bucket *buckets; + struct group_table *groups; +}; + +static void *default_new_fn(void *arg) { + return NULL; +} +static void default_free_fn(void *exdata) { + return; +} +static void default_merge_fn(void *dest, void *src) { + return; +} +static void default_reset_fn(void *exdata) { + return; +} +static void *default_copy_fn(void *exdata) { + return exdata; +} +static inline bool key_equal(const char *key1, size_t key1_len, const char *key2, size_t key2_len) { + if (key1_len != key2_len) { + return false; + } + return memcmp(key1, key2, key1_len) == 0; +} +static inline char *key_dup(const char *key, size_t key_len) { + char *ret = malloc(key_len); + memcpy(ret, key, key_len); + return ret; +} + +const struct smart_ptr *smart_prt_table_get(struct smart_ptr_table *table, const char *key, size_t key_len, void *arg) { + struct smart_ptr *ret; + HASH_FIND(hh, table, key, key_len, ret); + + if (ret != NULL) { + ret->ref_count++; + } else { + ret = malloc(sizeof(struct smart_ptr)); + ret->ref_count = 1; + ret->key = key_dup(key, key_len); + ret->key_len = key_len; + if (arg == NULL) { + ret->exdata = NULL; + } else { + ret->exdata = table->new_fn(arg); + } + HASH_ADD_KEYPTR(hh, table->table, ret->key, ret->key_len, ret); + } + + return ret; +} + +int smart_prt_table_release(struct smart_ptr_table *table, const char *key, size_t key_len) { + struct smart_ptr *ret; + HASH_FIND(hh, table->table, key, key_len, ret); + if (ret == NULL) { + return -1; + } + + ret->ref_count--; + if (ret->ref_count == 0) { + HASH_DEL(table->table, ret); + table->free_fn(ret->exdata); + free(ret->key); + free(ret); + } + + return 0; +} + +void smart_prt_table_free(struct smart_ptr_table *table) { + struct smart_ptr *current, *tmp; + HASH_ITER(hh, table->table, current, tmp) { + HASH_DEL(table->table, current); + table->free_fn(current->exdata); + free(current->key); + free(current); + } + free(table); +} + +struct smart_ptr_table *smart_prt_table_new() { + struct smart_ptr_table *table = malloc(sizeof(struct smart_ptr_table)); + table->table = NULL; + + table->new_fn = default_new_fn; + table->free_fn = default_free_fn; + table->merge_fn = default_merge_fn; + table->reset_fn = default_reset_fn; + table->copy_fn = default_copy_fn; + return table; +} + +void get_parameter_recommendation(int max_super_spreader_number, int *depth_out, int *width_out) +{ + int logk = max_super_spreader_number >= 3200 ? 4 : 3; // lg3200 = 3.51,round up to 4 + *depth_out = logk; + + int w; + if (max_super_spreader_number <= 100) { + w = max_super_spreader_number * 3 /2; // * 1.5, when the number is small, we need more width + } else { + w = max_super_spreader_number + 50; // + 50: 100*1.5-100 = 50, make w=f(k) continuous + } + if (w < 40) { + w = 40; + } + *width_out = w; +} + +struct spread_sketch *spread_sketch_new(int expected_query_num) { + struct spread_sketch *pthis = malloc(sizeof(struct spread_sketch)); + get_parameter_recommendation(expected_query_num, &pthis->depth, &pthis->width); + + pthis->buckets = calloc(pthis->depth * pthis->width, sizeof(struct bucket)); + pthis->table = smart_prt_table_new(); + + return pthis; +} + +// return 0 if not added, return 1 if added +int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t hash_identifier, void *arg) {// todo: entry, item + // uint64_t hash_identifier = XXH3_64bits_withSeed(identifier, identifier_length, 171); + uint32_t level = (uint32_t)__builtin_clzll(hash_identifier) + 1; + + // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf + // A technique from the hashing literature is to use two hash functions h1(x) and h2(x) to simulate additional hash functions of the form gi(x) = h1(x) + ih2(x) + // Assuming that the 128-bit xxhash function is perfect, we can view it as a combination of two 64-bit hash functions. + uint64_t hash_x_tmp = XXH3_64bits_withSeed(key, key_length, 171); + uint32_t hash_x1 = (uint32_t) (hash_x_tmp >> 32); + uint32_t hash_x2 = (uint32_t) hash_x_tmp; + + bool in_sketch = false; + for (int i = 0; i < ss->depth; i++) { + uint32_t hash_x = hash_x1 + i * hash_x2; + int bucket_idx = (hash_x % ss->width) + i * ss->width; + struct bucket *bucket = &ss->buckets[bucket_idx]; + + if (bucket->content != NULL && key_equal(bucket->content->key, bucket->content->key_len, key, key_length)) { + if (bucket->level < level) { + bucket->level = level; + } + in_sketch = true; + } else { + uint32_t true_level = bucket->content == NULL ? 0: bucket->level; + + if (true_level < level) { + struct smart_ptr *content_old = bucket->content; + smart_prt_table_release(ss->table, content_old->key, content_old->key_len, ss->free_fn); + struct smart_ptr *content_new = smart_prt_table_get(ss->table, key, key_length, arg); + bucket->content = content_new; + bucket->level = level; + + in_sketch = true; + } + } + } + + return in_sketch ? 1 : 0; +} + +void spread_sketch_free(struct spread_sketch *ss) { + smart_prt_table_free(ss->table, ss->free_fn); + free(ss->buckets); + free(ss); +} + +void spread_sketch_merge(struct spread_sketch *dst, const struct spread_sketch *src) +{ + assert(dst->depth == src->depth && dst->width == src->width); + + for (int i = 0; i < dst->depth * dst->width; i++) { + const struct bucket *bucket_src = &src->buckets[i]; + struct bucket *bucket_dst = &dst->buckets[i]; + + if (bucket_src->content == NULL) { + continue; + } + if (bucket_dst->content == NULL) { + bucket_dst->content = smart_prt_table_get(dst->table, bucket_src->content->key, bucket_src->content->key_len, NULL); + bucket_dst->level = bucket_src->level; + continue; + } + + if (key_equal(bucket_src->content->key, bucket_src->content->key_len, bucket_dst->content->key, bucket_dst->content->key_len)) { + if (bucket_src->level > bucket_dst->level) { + bucket_dst->level = bucket_src->level; + } + } else { + if (bucket_src->level > bucket_dst->level) { + smart_prt_table_release(dst->table, bucket_dst->content->key, bucket_dst->content->key_len); + bucket_dst->content = smart_prt_table_get(dst->table, bucket_src->content->key, bucket_src->content->key_len, NULL); + bucket_dst->level = bucket_src->level; + } + } + } + + struct smart_ptr *content_dest, *content_src, *tmp; + HASH_ITER(hh, dst->table->table, content_dest, tmp) { + HASH_FIND(hh, src->table->table, content_dest->key, content_dest->key_len, content_src); + if (content_src == NULL) { + continue; + } + + if (content_dest->exdata == NULL) { + content_dest->exdata = dst->table->copy_fn(content_src->exdata); + } else { + dst->table->merge_fn(content_dest->exdata, content_src->exdata); + } + } +} + +void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len) { + struct smart_ptr *content; + HASH_FIND(hh, ss->table->table, key, key_len, content); + if (content == NULL) { + return NULL; + } + return content->exdata; +} + +void spread_sketch_reset(struct spread_sketch *ss) { + for (int i = 0; i < ss->depth * ss->width; i++) { + struct bucket *bucket = &ss->buckets[i]; + bucket->level = 0; + } + + struct smart_ptr *content, *tmp; + HASH_ITER(hh, ss->table->table, content, tmp) { + ss->reset_fn(content->exdata); + } +} + +void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn) { + ss->table->new_fn = new_fn; + ss->table->free_fn = free_fn; + ss->table->merge_fn = merge_fn; + ss->table->reset_fn = reset_fn; + ss->table->copy_fn = copy_fn; + return 0; +} + +int spread_sketch_get_count(const struct spread_sketch *ss) { + return HASH_CNT(hh, ss->table->table); +} + +// 这个函数还是会忠实的把内部所有内容都拿出来,但是预期是最多expected_query_num 个,所以在外层要做一个排序。 +// 这个排序在这里面没法做,因为没有具体的hll计数。 +size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t n_exdatas) { + size_t count = 0; + struct smart_ptr *content, *tmp; + HASH_ITER(hh, ss->table->table, content, tmp) { + if (count >= n_exdatas) { + break; + } + exdatas[count] = content->exdata; + count++; + } + return count; +} + +struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src) { + struct spread_sketch *dst = malloc(sizeof(struct spread_sketch)); + memcpy(dst, src, sizeof(struct spread_sketch)); + + dst->buckets = calloc(dst->depth * dst->width, sizeof(struct bucket)); + dst->table = smart_prt_table_new(); + + for (int i = 0; i < dst->depth * dst->width; i++) { + if (src->buckets[i].content == NULL) { + continue; + } + dst->buckets[i].content = smart_prt_table_get(dst->table, src->buckets[i].content->key, src->buckets[i].content->key_len, NULL); + dst->buckets[i].level = src->buckets[i].level; + if (dst->buckets[i].content->exdata == NULL) { + dst->buckets[i].content->exdata = src->copy_fn(src->buckets[i].content->exdata); + } + } + return dst; +} diff --git a/src/tags/spread_sketch.h b/src/tags/spread_sketch.h new file mode 100644 index 0000000..f50cae4 --- /dev/null +++ b/src/tags/spread_sketch.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +#ifdef __cplusplus +extern "C"{ +#endif + +#include "exdata.h" + + +struct spread_sketch; + +// spread sketch alway store values more than expected_query_num,expected_query_num is a hint to set spread sketch parameters properly +struct spread_sketch *spread_sketch_new(int expected_query_num); + +void spread_sketch_free(struct spread_sketch *ss); + +void spread_sketch_reset(struct spread_sketch *ss); + +int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t hash_identifier, void *arg); +void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn); + +void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len); + +// get the number of cells in the heavy keeper +int spread_sketch_get_count(const struct spread_sketch *ss); + +size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t n_exdatas); + +void spread_sketch_merge(struct spread_sketch *dest, const struct spread_sketch *src); + +struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src); + +// for test +// void spread_sketch_one_point_query(const struct spread_sketch *ss, const char *key, size_t key_len, int *level_out, void **exdata_out); + +#ifdef __cplusplus +} +#endif \ No newline at end of file -- cgit v1.2.3