spread sketch wip

author: chenzizhan <[email protected]> 2024-07-09 15:13:14 +0800
committer: chenzizhan <[email protected]> 2024-07-09 15:13:14 +0800
commit: b711b50d356ffc09569d4f11ba2a0cae41045510 (patch)
tree: daef9c48b94121124d6a3d1f70bbaa9a3b494877
parent: 722993e93a3843a6240a716b2eaead585c103735 (diff)
2 files changed, 379 insertions, 0 deletions
diff --git a/src/tags/spread_sketch.c b/src/tags/spread_sketch.c
new file mode 100644
index 0000000..1a44d6a
--- /dev/null
+++ b/src/tags/spread_sketch.c
@@ -0,0 +1,339 @@
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>   
+#include <assert.h>
+
+#include "xxhash/xxhash.h"
+#include "uthash.h"
+
+#include "spread_sketch.h"
+#include "exdata.h"
+
+/*
+方案1，smart ptr。
+省内存，更符合对cell manager 类结构的期待。
+额外增加一个额外的管理smart ptr的结构体。不过，总体来说修改量不算大，比如merge和add操作基本保持原样，仅仅是对key 的malloc 和free 做修改。
+对dummy 特殊情况的支持更容易。
+
+根据测试结果，如果是每次reset bucket 的时候都重置，误差都仅仅是微微增加，如果是每次所有sketch中的key 都失去索引才删除，这个影响只会更小。可以用。
+*/
+
+/*
+方案2，把exdata 放入bucket 里，每个bucket一份。
+
+可以保留spread sketch 的风味，不会引入新的误差。
+根据实验情况，大概会多占用一半的内存，因为根据测试的经验，保存的key 总量是bucket 总数的2/3左右。哈希表本身的HH handle 也占内存，这部分反而可以节约回去。
+会让cell 的操作变得麻烦，无法借用老的cell manager 流程，get exdata 会得到一个exdata 的数组（可能多个），而非单独的一个，要对多个cell综合起来当一个cell 看。。修改量非常小，但是确实会影响代码本身的整洁度。
+
+*/
+struct smart_ptr { // todo:entry
+    int ref_count;
+    void *exdata;
+
+    char *key;
+    size_t key_len;
+    UT_hash_handle hh;
+};
+
+struct smart_ptr_table {
+    struct smart_ptr *table;
+};
+
+struct bucket {
+    struct smart_ptr *content;
+    uint32_t level;
+};
+
+struct spread_sketch {
+    int depth;
+    int width;
+
+    exdata_new_cb new_fn;
+    exdata_free_cb free_fn;
+    exdata_merge_cb merge_fn;
+    exdata_reset_cb reset_fn;
+    exdata_copy_cb copy_fn;
+
+    struct bucket *buckets;
+    struct group_table *groups;
+};
+
+static void *default_new_fn(void *arg) {
+    return NULL;
+}
+static void default_free_fn(void *exdata) {
+    return;
+}
+static void default_merge_fn(void *dest, void *src) {
+    return;
+}
+static void default_reset_fn(void *exdata) {
+    return;
+} 
+static void *default_copy_fn(void *exdata) {
+    return exdata;
+}
+static inline bool key_equal(const char *key1, size_t key1_len, const char *key2, size_t key2_len) {
+    if (key1_len != key2_len) {
+        return false;
+    }
+    return memcmp(key1, key2, key1_len) == 0;
+}
+static inline char *key_dup(const char *key, size_t key_len) {
+    char *ret = malloc(key_len);
+    memcpy(ret, key, key_len);
+    return ret;
+}
+
+const struct smart_ptr *smart_prt_table_get(struct smart_ptr_table *table, const char *key, size_t key_len, void *arg) {
+    struct smart_ptr *ret;
+    HASH_FIND(hh, table, key, key_len, ret);
+
+    if (ret != NULL) {
+        ret->ref_count++;
+    } else {
+        ret = malloc(sizeof(struct smart_ptr));
+        ret->ref_count = 1;
+        ret->key = key_dup(key, key_len);
+        ret->key_len = key_len;
+        if (arg == NULL) {
+            ret->exdata = NULL;
+        } else {
+            ret->exdata = table->new_fn(arg);
+        }
+        HASH_ADD_KEYPTR(hh, table->table, ret->key, ret->key_len, ret);
+    }
+
+    return ret;
+}
+
+int smart_prt_table_release(struct smart_ptr_table *table, const char *key, size_t key_len) {
+    struct smart_ptr *ret;
+    HASH_FIND(hh, table->table, key, key_len, ret);
+    if (ret == NULL) {
+        return -1;
+    }
+
+    ret->ref_count--;
+    if (ret->ref_count == 0) {
+        HASH_DEL(table->table, ret);
+        table->free_fn(ret->exdata);
+        free(ret->key);
+        free(ret);
+    }
+
+    return 0;
+}
+
+void smart_prt_table_free(struct smart_ptr_table *table) {
+    struct smart_ptr *current, *tmp;
+    HASH_ITER(hh, table->table, current, tmp) {
+        HASH_DEL(table->table, current);
+        table->free_fn(current->exdata);
+        free(current->key);
+        free(current);
+    }
+    free(table);
+}
+
+struct smart_ptr_table *smart_prt_table_new() {
+    struct smart_ptr_table *table = malloc(sizeof(struct smart_ptr_table));
+    table->table = NULL;
+
+    table->new_fn = default_new_fn;
+    table->free_fn = default_free_fn;
+    table->merge_fn = default_merge_fn;
+    table->reset_fn = default_reset_fn;
+    table->copy_fn = default_copy_fn;
+    return table;
+}
+
+void get_parameter_recommendation(int max_super_spreader_number, int *depth_out, int *width_out)
+{
+    int logk = max_super_spreader_number >= 3200 ? 4 : 3; // lg3200 = 3.51,round up to 4
+    *depth_out = logk;
+
+    int w;
+    if (max_super_spreader_number <= 100) {
+        w = max_super_spreader_number * 3 /2; // * 1.5, when the number is small, we need more width
+    } else {
+        w = max_super_spreader_number + 50; // + 50: 100*1.5-100 = 50, make w=f(k) continuous
+    }
+    if (w < 40) {
+        w = 40;
+    }
+    *width_out = w;
+}
+
+struct spread_sketch *spread_sketch_new(int expected_query_num) {
+    struct spread_sketch *pthis = malloc(sizeof(struct spread_sketch));
+    get_parameter_recommendation(expected_query_num, &pthis->depth, &pthis->width);
+
+    pthis->buckets = calloc(pthis->depth * pthis->width, sizeof(struct bucket));
+    pthis->table = smart_prt_table_new();
+
+    return pthis;
+}
+
+// return 0 if not added, return 1 if added
+int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t hash_identifier, void *arg) {// todo: entry, item
+    // uint64_t hash_identifier = XXH3_64bits_withSeed(identifier, identifier_length, 171);
+    uint32_t level = (uint32_t)__builtin_clzll(hash_identifier) + 1;
+    
+    // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
+    // A technique from the hashing literature is to use two hash functions h1(x) and h2(x) to simulate additional hash functions of the form gi(x) = h1(x) + ih2(x)
+    // Assuming that the 128-bit xxhash function is perfect, we can view it as a combination of two 64-bit hash functions.
+    uint64_t hash_x_tmp = XXH3_64bits_withSeed(key, key_length, 171);
+    uint32_t hash_x1 = (uint32_t) (hash_x_tmp >> 32);
+    uint32_t hash_x2 = (uint32_t) hash_x_tmp;
+
+    bool in_sketch = false;
+    for (int i = 0; i < ss->depth; i++) {
+        uint32_t hash_x = hash_x1 + i * hash_x2; 
+        int bucket_idx = (hash_x % ss->width) + i * ss->width;
+        struct bucket *bucket = &ss->buckets[bucket_idx];
+
+        if (bucket->content != NULL && key_equal(bucket->content->key, bucket->content->key_len, key, key_length)) {
+            if (bucket->level < level) {
+                bucket->level = level;
+            }
+            in_sketch = true;
+        } else {
+            uint32_t true_level = bucket->content == NULL ? 0: bucket->level;
+
+            if (true_level < level) {
+                struct smart_ptr *content_old = bucket->content;
+                smart_prt_table_release(ss->table, content_old->key, content_old->key_len, ss->free_fn);
+                struct smart_ptr *content_new = smart_prt_table_get(ss->table, key, key_length, arg);
+                bucket->content = content_new;
+                bucket->level = level;
+
+                in_sketch = true;
+            }
+        }
+    }
+
+    return in_sketch ? 1 : 0;
+}
+
+void spread_sketch_free(struct spread_sketch *ss) {
+    smart_prt_table_free(ss->table, ss->free_fn);
+    free(ss->buckets);
+    free(ss);
+}
+
+void spread_sketch_merge(struct spread_sketch *dst, const struct spread_sketch *src)
+{
+    assert(dst->depth == src->depth && dst->width == src->width);
+
+    for (int i = 0; i < dst->depth * dst->width; i++) {
+        const struct bucket *bucket_src = &src->buckets[i];
+        struct bucket *bucket_dst = &dst->buckets[i];
+
+        if (bucket_src->content == NULL) {
+            continue;
+        }
+        if (bucket_dst->content == NULL) {
+            bucket_dst->content = smart_prt_table_get(dst->table, bucket_src->content->key, bucket_src->content->key_len, NULL);
+            bucket_dst->level = bucket_src->level;
+            continue;
+        }
+
+        if (key_equal(bucket_src->content->key, bucket_src->content->key_len, bucket_dst->content->key, bucket_dst->content->key_len)) {
+            if (bucket_src->level > bucket_dst->level) {
+                bucket_dst->level = bucket_src->level;
+            }
+        } else {
+            if (bucket_src->level > bucket_dst->level) {
+                smart_prt_table_release(dst->table, bucket_dst->content->key, bucket_dst->content->key_len);
+                bucket_dst->content = smart_prt_table_get(dst->table, bucket_src->content->key, bucket_src->content->key_len, NULL);
+                bucket_dst->level = bucket_src->level;
+            }
+        }
+    }
+
+    struct smart_ptr *content_dest, *content_src, *tmp;
+    HASH_ITER(hh, dst->table->table, content_dest, tmp) {
+        HASH_FIND(hh, src->table->table, content_dest->key, content_dest->key_len, content_src);
+        if (content_src == NULL) {
+            continue;
+        }
+
+        if (content_dest->exdata == NULL) {
+            content_dest->exdata = dst->table->copy_fn(content_src->exdata);
+        } else {
+            dst->table->merge_fn(content_dest->exdata, content_src->exdata);
+        }
+    }
+}
+
+void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len) {
+    struct smart_ptr *content;
+    HASH_FIND(hh, ss->table->table, key, key_len, content);
+    if (content == NULL) {
+        return NULL;
+    }
+    return content->exdata;
+}
+
+void spread_sketch_reset(struct spread_sketch *ss) {
+    for (int i = 0; i < ss->depth * ss->width; i++) {
+        struct bucket *bucket = &ss->buckets[i];
+        bucket->level = 0;
+    }
+
+    struct smart_ptr *content, *tmp;
+    HASH_ITER(hh, ss->table->table, content, tmp) {
+        ss->reset_fn(content->exdata);
+    }
+}
+
+void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn) {
+    ss->table->new_fn = new_fn;
+    ss->table->free_fn = free_fn;
+    ss->table->merge_fn = merge_fn;
+    ss->table->reset_fn = reset_fn;
+    ss->table->copy_fn = copy_fn;
+    return 0;
+}
+
+int spread_sketch_get_count(const struct spread_sketch *ss) {
+    return HASH_CNT(hh, ss->table->table);
+}
+
+// 这个函数还是会忠实的把内部所有内容都拿出来，但是预期是最多expected_query_num 个，所以在外层要做一个排序。
+// 这个排序在这里面没法做，因为没有具体的hll计数。
+size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t n_exdatas) {
+    size_t count = 0;
+    struct smart_ptr *content, *tmp;
+    HASH_ITER(hh, ss->table->table, content, tmp) {
+        if (count >= n_exdatas) {
+            break;
+        }
+        exdatas[count] = content->exdata;
+        count++;
+    }
+    return count;
+}
+
+struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src) {
+    struct spread_sketch *dst = malloc(sizeof(struct spread_sketch));
+    memcpy(dst, src, sizeof(struct spread_sketch));
+
+    dst->buckets = calloc(dst->depth * dst->width, sizeof(struct bucket));
+    dst->table = smart_prt_table_new();
+
+    for (int i = 0; i < dst->depth * dst->width; i++) {
+        if (src->buckets[i].content == NULL) {
+            continue;
+        }
+        dst->buckets[i].content = smart_prt_table_get(dst->table, src->buckets[i].content->key, src->buckets[i].content->key_len, NULL);
+        dst->buckets[i].level = src->buckets[i].level;
+        if (dst->buckets[i].content->exdata == NULL) {
+            dst->buckets[i].content->exdata = src->copy_fn(src->buckets[i].content->exdata);
+        }
+    }
+    return dst;
+}
diff --git a/src/tags/spread_sketch.h b/src/tags/spread_sketch.h
new file mode 100644
index 0000000..f50cae4
--- /dev/null
+++ b/src/tags/spread_sketch.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "exdata.h"
+
+
+struct spread_sketch;
+
+// spread sketch alway store values more than expected_query_num,expected_query_num is a hint to set spread sketch parameters properly
+struct spread_sketch *spread_sketch_new(int expected_query_num);
+
+void spread_sketch_free(struct spread_sketch *ss);
+
+void spread_sketch_reset(struct spread_sketch *ss);
+
+int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t hash_identifier, void *arg);
+void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn);
+
+void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len);
+
+// get the number of cells in the heavy keeper
+int spread_sketch_get_count(const struct spread_sketch *ss);
+
+size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t n_exdatas);
+
+void spread_sketch_merge(struct spread_sketch *dest, const struct spread_sketch *src);
+
+struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src);
+
+// for test
+// void spread_sketch_one_point_query(const struct spread_sketch *ss, const char *key, size_t key_len, int *level_out, void **exdata_out);
+
+#ifdef __cplusplus
+}
+#endif
+\ No newline at end of file
author	chenzizhan <[email protected]>	2024-07-09 15:13:14 +0800
committer	chenzizhan <[email protected]>	2024-07-09 15:13:14 +0800
commit	b711b50d356ffc09569d4f11ba2a0cae41045510 (patch)
tree	daef9c48b94121124d6a3d1f70bbaa9a3b494877
parent	722993e93a3843a6240a716b2eaead585c103735 (diff)