summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorchenzizhan <[email protected]>2024-07-09 15:13:14 +0800
committerchenzizhan <[email protected]>2024-07-09 15:13:14 +0800
commitb711b50d356ffc09569d4f11ba2a0cae41045510 (patch)
treedaef9c48b94121124d6a3d1f70bbaa9a3b494877
parent722993e93a3843a6240a716b2eaead585c103735 (diff)
spread sketch wip
-rw-r--r--src/tags/spread_sketch.c339
-rw-r--r--src/tags/spread_sketch.h40
2 files changed, 379 insertions, 0 deletions
diff --git a/src/tags/spread_sketch.c b/src/tags/spread_sketch.c
new file mode 100644
index 0000000..1a44d6a
--- /dev/null
+++ b/src/tags/spread_sketch.c
@@ -0,0 +1,339 @@
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+#include "xxhash/xxhash.h"
+#include "uthash.h"
+
+#include "spread_sketch.h"
+#include "exdata.h"
+
+/*
+方案1,smart ptr。
+省内存,更符合对cell manager 类结构的期待。
+额外增加一个额外的管理smart ptr的结构体。不过,总体来说修改量不算大,比如merge和add操作基本保持原样,仅仅是对key 的malloc 和free 做修改。
+对dummy 特殊情况的支持更容易。
+
+根据测试结果,如果是每次reset bucket 的时候都重置,误差都仅仅是微微增加,如果是每次所有sketch中的key 都失去索引才删除,这个影响只会更小。可以用。
+*/
+
+/*
+方案2,把exdata 放入bucket 里,每个bucket一份。
+
+可以保留spread sketch 的风味,不会引入新的误差。
+根据实验情况,大概会多占用一半的内存,因为根据测试的经验,保存的key 总量是bucket 总数的2/3左右。哈希表本身的HH handle 也占内存,这部分反而可以节约回去。
+会让cell 的操作变得麻烦,无法借用老的cell manager 流程,get exdata 会得到一个exdata 的数组(可能多个),而非单独的一个,要对多个cell综合起来当一个cell 看。。修改量非常小,但是确实会影响代码本身的整洁度。
+
+*/
+struct smart_ptr { // todo:entry
+ int ref_count;
+ void *exdata;
+
+ char *key;
+ size_t key_len;
+ UT_hash_handle hh;
+};
+
+struct smart_ptr_table {
+ struct smart_ptr *table;
+};
+
+struct bucket {
+ struct smart_ptr *content;
+ uint32_t level;
+};
+
+struct spread_sketch {
+ int depth;
+ int width;
+
+ exdata_new_cb new_fn;
+ exdata_free_cb free_fn;
+ exdata_merge_cb merge_fn;
+ exdata_reset_cb reset_fn;
+ exdata_copy_cb copy_fn;
+
+ struct bucket *buckets;
+ struct group_table *groups;
+};
+
+static void *default_new_fn(void *arg) {
+ return NULL;
+}
+static void default_free_fn(void *exdata) {
+ return;
+}
+static void default_merge_fn(void *dest, void *src) {
+ return;
+}
+static void default_reset_fn(void *exdata) {
+ return;
+}
+static void *default_copy_fn(void *exdata) {
+ return exdata;
+}
+static inline bool key_equal(const char *key1, size_t key1_len, const char *key2, size_t key2_len) {
+ if (key1_len != key2_len) {
+ return false;
+ }
+ return memcmp(key1, key2, key1_len) == 0;
+}
+static inline char *key_dup(const char *key, size_t key_len) {
+ char *ret = malloc(key_len);
+ memcpy(ret, key, key_len);
+ return ret;
+}
+
+const struct smart_ptr *smart_prt_table_get(struct smart_ptr_table *table, const char *key, size_t key_len, void *arg) {
+ struct smart_ptr *ret;
+ HASH_FIND(hh, table, key, key_len, ret);
+
+ if (ret != NULL) {
+ ret->ref_count++;
+ } else {
+ ret = malloc(sizeof(struct smart_ptr));
+ ret->ref_count = 1;
+ ret->key = key_dup(key, key_len);
+ ret->key_len = key_len;
+ if (arg == NULL) {
+ ret->exdata = NULL;
+ } else {
+ ret->exdata = table->new_fn(arg);
+ }
+ HASH_ADD_KEYPTR(hh, table->table, ret->key, ret->key_len, ret);
+ }
+
+ return ret;
+}
+
+int smart_prt_table_release(struct smart_ptr_table *table, const char *key, size_t key_len) {
+ struct smart_ptr *ret;
+ HASH_FIND(hh, table->table, key, key_len, ret);
+ if (ret == NULL) {
+ return -1;
+ }
+
+ ret->ref_count--;
+ if (ret->ref_count == 0) {
+ HASH_DEL(table->table, ret);
+ table->free_fn(ret->exdata);
+ free(ret->key);
+ free(ret);
+ }
+
+ return 0;
+}
+
+void smart_prt_table_free(struct smart_ptr_table *table) {
+ struct smart_ptr *current, *tmp;
+ HASH_ITER(hh, table->table, current, tmp) {
+ HASH_DEL(table->table, current);
+ table->free_fn(current->exdata);
+ free(current->key);
+ free(current);
+ }
+ free(table);
+}
+
+struct smart_ptr_table *smart_prt_table_new() {
+ struct smart_ptr_table *table = malloc(sizeof(struct smart_ptr_table));
+ table->table = NULL;
+
+ table->new_fn = default_new_fn;
+ table->free_fn = default_free_fn;
+ table->merge_fn = default_merge_fn;
+ table->reset_fn = default_reset_fn;
+ table->copy_fn = default_copy_fn;
+ return table;
+}
+
+void get_parameter_recommendation(int max_super_spreader_number, int *depth_out, int *width_out)
+{
+ int logk = max_super_spreader_number >= 3200 ? 4 : 3; // lg3200 = 3.51,round up to 4
+ *depth_out = logk;
+
+ int w;
+ if (max_super_spreader_number <= 100) {
+ w = max_super_spreader_number * 3 /2; // * 1.5, when the number is small, we need more width
+ } else {
+ w = max_super_spreader_number + 50; // + 50: 100*1.5-100 = 50, make w=f(k) continuous
+ }
+ if (w < 40) {
+ w = 40;
+ }
+ *width_out = w;
+}
+
+struct spread_sketch *spread_sketch_new(int expected_query_num) {
+ struct spread_sketch *pthis = malloc(sizeof(struct spread_sketch));
+ get_parameter_recommendation(expected_query_num, &pthis->depth, &pthis->width);
+
+ pthis->buckets = calloc(pthis->depth * pthis->width, sizeof(struct bucket));
+ pthis->table = smart_prt_table_new();
+
+ return pthis;
+}
+
+// return 0 if not added, return 1 if added
+int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t hash_identifier, void *arg) {// todo: entry, item
+ // uint64_t hash_identifier = XXH3_64bits_withSeed(identifier, identifier_length, 171);
+ uint32_t level = (uint32_t)__builtin_clzll(hash_identifier) + 1;
+
+ // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
+ // A technique from the hashing literature is to use two hash functions h1(x) and h2(x) to simulate additional hash functions of the form gi(x) = h1(x) + ih2(x)
+ // Assuming that the 128-bit xxhash function is perfect, we can view it as a combination of two 64-bit hash functions.
+ uint64_t hash_x_tmp = XXH3_64bits_withSeed(key, key_length, 171);
+ uint32_t hash_x1 = (uint32_t) (hash_x_tmp >> 32);
+ uint32_t hash_x2 = (uint32_t) hash_x_tmp;
+
+ bool in_sketch = false;
+ for (int i = 0; i < ss->depth; i++) {
+ uint32_t hash_x = hash_x1 + i * hash_x2;
+ int bucket_idx = (hash_x % ss->width) + i * ss->width;
+ struct bucket *bucket = &ss->buckets[bucket_idx];
+
+ if (bucket->content != NULL && key_equal(bucket->content->key, bucket->content->key_len, key, key_length)) {
+ if (bucket->level < level) {
+ bucket->level = level;
+ }
+ in_sketch = true;
+ } else {
+ uint32_t true_level = bucket->content == NULL ? 0: bucket->level;
+
+ if (true_level < level) {
+ struct smart_ptr *content_old = bucket->content;
+ smart_prt_table_release(ss->table, content_old->key, content_old->key_len, ss->free_fn);
+ struct smart_ptr *content_new = smart_prt_table_get(ss->table, key, key_length, arg);
+ bucket->content = content_new;
+ bucket->level = level;
+
+ in_sketch = true;
+ }
+ }
+ }
+
+ return in_sketch ? 1 : 0;
+}
+
+void spread_sketch_free(struct spread_sketch *ss) {
+ smart_prt_table_free(ss->table, ss->free_fn);
+ free(ss->buckets);
+ free(ss);
+}
+
+void spread_sketch_merge(struct spread_sketch *dst, const struct spread_sketch *src)
+{
+ assert(dst->depth == src->depth && dst->width == src->width);
+
+ for (int i = 0; i < dst->depth * dst->width; i++) {
+ const struct bucket *bucket_src = &src->buckets[i];
+ struct bucket *bucket_dst = &dst->buckets[i];
+
+ if (bucket_src->content == NULL) {
+ continue;
+ }
+ if (bucket_dst->content == NULL) {
+ bucket_dst->content = smart_prt_table_get(dst->table, bucket_src->content->key, bucket_src->content->key_len, NULL);
+ bucket_dst->level = bucket_src->level;
+ continue;
+ }
+
+ if (key_equal(bucket_src->content->key, bucket_src->content->key_len, bucket_dst->content->key, bucket_dst->content->key_len)) {
+ if (bucket_src->level > bucket_dst->level) {
+ bucket_dst->level = bucket_src->level;
+ }
+ } else {
+ if (bucket_src->level > bucket_dst->level) {
+ smart_prt_table_release(dst->table, bucket_dst->content->key, bucket_dst->content->key_len);
+ bucket_dst->content = smart_prt_table_get(dst->table, bucket_src->content->key, bucket_src->content->key_len, NULL);
+ bucket_dst->level = bucket_src->level;
+ }
+ }
+ }
+
+ struct smart_ptr *content_dest, *content_src, *tmp;
+ HASH_ITER(hh, dst->table->table, content_dest, tmp) {
+ HASH_FIND(hh, src->table->table, content_dest->key, content_dest->key_len, content_src);
+ if (content_src == NULL) {
+ continue;
+ }
+
+ if (content_dest->exdata == NULL) {
+ content_dest->exdata = dst->table->copy_fn(content_src->exdata);
+ } else {
+ dst->table->merge_fn(content_dest->exdata, content_src->exdata);
+ }
+ }
+}
+
+void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len) {
+ struct smart_ptr *content;
+ HASH_FIND(hh, ss->table->table, key, key_len, content);
+ if (content == NULL) {
+ return NULL;
+ }
+ return content->exdata;
+}
+
+void spread_sketch_reset(struct spread_sketch *ss) {
+ for (int i = 0; i < ss->depth * ss->width; i++) {
+ struct bucket *bucket = &ss->buckets[i];
+ bucket->level = 0;
+ }
+
+ struct smart_ptr *content, *tmp;
+ HASH_ITER(hh, ss->table->table, content, tmp) {
+ ss->reset_fn(content->exdata);
+ }
+}
+
+void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn) {
+ ss->table->new_fn = new_fn;
+ ss->table->free_fn = free_fn;
+ ss->table->merge_fn = merge_fn;
+ ss->table->reset_fn = reset_fn;
+ ss->table->copy_fn = copy_fn;
+ return 0;
+}
+
+int spread_sketch_get_count(const struct spread_sketch *ss) {
+ return HASH_CNT(hh, ss->table->table);
+}
+
+// 这个函数还是会忠实的把内部所有内容都拿出来,但是预期是最多expected_query_num 个,所以在外层要做一个排序。
+// 这个排序在这里面没法做,因为没有具体的hll计数。
+size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t n_exdatas) {
+ size_t count = 0;
+ struct smart_ptr *content, *tmp;
+ HASH_ITER(hh, ss->table->table, content, tmp) {
+ if (count >= n_exdatas) {
+ break;
+ }
+ exdatas[count] = content->exdata;
+ count++;
+ }
+ return count;
+}
+
+struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src) {
+ struct spread_sketch *dst = malloc(sizeof(struct spread_sketch));
+ memcpy(dst, src, sizeof(struct spread_sketch));
+
+ dst->buckets = calloc(dst->depth * dst->width, sizeof(struct bucket));
+ dst->table = smart_prt_table_new();
+
+ for (int i = 0; i < dst->depth * dst->width; i++) {
+ if (src->buckets[i].content == NULL) {
+ continue;
+ }
+ dst->buckets[i].content = smart_prt_table_get(dst->table, src->buckets[i].content->key, src->buckets[i].content->key_len, NULL);
+ dst->buckets[i].level = src->buckets[i].level;
+ if (dst->buckets[i].content->exdata == NULL) {
+ dst->buckets[i].content->exdata = src->copy_fn(src->buckets[i].content->exdata);
+ }
+ }
+ return dst;
+}
diff --git a/src/tags/spread_sketch.h b/src/tags/spread_sketch.h
new file mode 100644
index 0000000..f50cae4
--- /dev/null
+++ b/src/tags/spread_sketch.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "exdata.h"
+
+
+struct spread_sketch;
+
+// spread sketch alway store values more than expected_query_num,expected_query_num is a hint to set spread sketch parameters properly
+struct spread_sketch *spread_sketch_new(int expected_query_num);
+
+void spread_sketch_free(struct spread_sketch *ss);
+
+void spread_sketch_reset(struct spread_sketch *ss);
+
+int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t hash_identifier, void *arg);
+void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn);
+
+void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len);
+
+// get the number of cells in the heavy keeper
+int spread_sketch_get_count(const struct spread_sketch *ss);
+
+size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t n_exdatas);
+
+void spread_sketch_merge(struct spread_sketch *dest, const struct spread_sketch *src);
+
+struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src);
+
+// for test
+// void spread_sketch_one_point_query(const struct spread_sketch *ss, const char *key, size_t key_len, int *level_out, void **exdata_out);
+
+#ifdef __cplusplus
+}
+#endif \ No newline at end of file