diff options
| author | chenzizhan <[email protected]> | 2024-07-30 10:50:12 +0800 |
|---|---|---|
| committer | chenzizhan <[email protected]> | 2024-07-30 10:50:12 +0800 |
| commit | 566db155010b97f20dfb13e303ae6a11228baa39 (patch) | |
| tree | 55a2cd29963a13d98f1cb7a94daf6db695a845ae | |
| parent | 240bbbb0e0409a6bca409eb319a0a00960cbc6fb (diff) | |
rename spread sketch key->entry
| -rw-r--r-- | src/cells/spread_sketch.c | 48 | ||||
| -rw-r--r-- | src/cells/spread_sketch.h | 33 | ||||
| -rw-r--r-- | src/cube.c | 4 |
3 files changed, 46 insertions, 39 deletions
diff --git a/src/cells/spread_sketch.c b/src/cells/spread_sketch.c index 85f123b..f95b0f1 100644 --- a/src/cells/spread_sketch.c +++ b/src/cells/spread_sketch.c @@ -201,7 +201,7 @@ struct entry_table *smart_ptr_table_new() { } struct spread_sketch *spread_sketch_new(int depth, int width, unsigned char precision, int time_window_ms, struct timeval now) { - struct spread_sketch *pthis = malloc(sizeof(struct spread_sketch)); + struct spread_sketch *pthis = calloc(1, sizeof(struct spread_sketch)); pthis->depth = depth; pthis->width = width; @@ -240,7 +240,7 @@ void move_registers_forward(struct spread_sketch *ss, const struct timeval *now) } // return 0 if not added, return 1 if added -int spread_sketch_add_hash(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t item_hash, void *arg, struct timeval now) { +int spread_sketch_add_hash(struct spread_sketch *ss, const char *entry, size_t entry_length, uint64_t item_hash, void *arg, struct timeval now) { uint32_t level = (uint32_t)__builtin_clzll(item_hash) + 1; long long now_ms = now.tv_sec * 1000 + now.tv_usec / 1000; @@ -254,7 +254,7 @@ int spread_sketch_add_hash(struct spread_sketch *ss, const char *key, size_t key // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf // A technique from the hashing literature is to use two hash functions h1(x) and h2(x) to simulate additional hash functions of the form gi(x) = h1(x) + ih2(x) // Assuming that the 128-bit xxhash function is perfect, we can view it as a combination of two 64-bit hash functions. - uint64_t hash_x_tmp = XXH3_64bits_withSeed(key, key_length, 171); + uint64_t hash_x_tmp = XXH3_64bits_withSeed(entry, entry_length, 171); uint32_t hash_x1 = (uint32_t) (hash_x_tmp >> 32); uint32_t hash_x2 = (uint32_t) hash_x_tmp; @@ -265,7 +265,7 @@ int spread_sketch_add_hash(struct spread_sketch *ss, const char *key, size_t key int bucket_idx = (hash_x % ss->width) + i * ss->width; struct bucket *bucket = &ss->buckets[bucket_idx]; - if (bucket->content != NULL && key_equal(bucket->content->key, bucket->content->key_len, key, key_length)) { + if (bucket->content != NULL && key_equal(bucket->content->key, bucket->content->key_len, entry, entry_length)) { bucket->content->dying = false; bucket->last_update_ms = now_ms; @@ -289,7 +289,7 @@ int spread_sketch_add_hash(struct spread_sketch *ss, const char *key, size_t key if (content_old != NULL) { smart_ptr_table_release(ss->table, content_old->key, content_old->key_len); } - struct entry *content_new = smart_ptr_table_get(ss->table, key, key_length, arg); + struct entry *content_new = smart_ptr_table_get(ss->table, entry, entry_length, arg); bucket->content = content_new; bucket->last_update_ms = now_ms; @@ -304,13 +304,13 @@ int spread_sketch_add_hash(struct spread_sketch *ss, const char *key, size_t key return in_sketch ? 1 : 0; } -int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, const char* item, size_t item_len, void *arg, struct timeval now) { +int spread_sketch_add(struct spread_sketch *ss, const char *entry, size_t entry_length, const char* item, size_t item_len, void *arg, struct timeval now) { uint64_t hash = XXH3_64bits(item, item_len); - return spread_sketch_add_hash(ss, key, key_length, hash, arg, now); + return spread_sketch_add_hash(ss, entry, entry_length, hash, arg, now); } -double spread_sketch_point_query(const struct spread_sketch *ss, const char *key, size_t key_length) { - uint64_t hash_x_tmp = XXH3_64bits_withSeed(key, key_length, 171); +double spread_sketch_point_query(const struct spread_sketch *ss, const char *entry, size_t entry_length) { + uint64_t hash_x_tmp = XXH3_64bits_withSeed(entry, entry_length, 171); uint32_t hash_x1 = (uint32_t) (hash_x_tmp >> 32); uint32_t hash_x2 = (uint32_t) hash_x_tmp; @@ -344,14 +344,14 @@ struct spread_sketch *duplicate_and_step(const struct spread_sketch *ss, const s return duplicate; } -double spread_sketch_query(const struct spread_sketch *ss, const char *key, size_t key_length, struct timeval now) { +double spread_sketch_query(const struct spread_sketch *ss, const char *entry, size_t entry_length, struct timeval now) { if (ss->time_window_ms!=0 && hll_should_slide(ss->precision, ss->time_window_ms, now, ss->reset_time)) { struct spread_sketch *duplicate = duplicate_and_step(ss, &now); - double ret = spread_sketch_point_query(duplicate, key, key_length); + double ret = spread_sketch_point_query(duplicate, entry, entry_length); spread_sketch_free(duplicate); return ret; } else { - return spread_sketch_point_query(ss, key, key_length); + return spread_sketch_point_query(ss, entry, entry_length); } } @@ -494,12 +494,12 @@ size_t spread_sketch_list(const struct spread_sketch *ss, void **exdatas, size_t return count; } -void spread_sketch_list_keys(const struct spread_sketch *ss, char ***keys, size_t **key_lens, size_t *n_keys) { +void spread_sketch_list_entries(const struct spread_sketch *ss, char ***entries, size_t **entry_lens, size_t *n_entry) { size_t count_max = HASH_COUNT(ss->table->entry); size_t count = 0; - *keys = malloc(count_max * sizeof(char *)); - *key_lens = malloc(count_max * sizeof(size_t)); + *entries = malloc(count_max * sizeof(char *)); + *entry_lens = malloc(count_max * sizeof(size_t)); struct entry *content, *tmp; HASH_ITER(hh, ss->table->entry, content, tmp) { @@ -507,12 +507,12 @@ void spread_sketch_list_keys(const struct spread_sketch *ss, char ***keys, size_ continue; } - (*keys)[count] = content->key; - (*key_lens)[count] = content->key_len; + (*entries)[count] = content->key; + (*entry_lens)[count] = content->key_len; count++; } - *n_keys = count; + *n_entry = count; } double spread_sketch_get_cardinality(const struct spread_sketch *ss, const char *key, size_t key_len) { @@ -524,8 +524,8 @@ double spread_sketch_get_cardinality(const struct spread_sketch *ss, const char return est; } -char *spread_sketch_get_hll_base64_serialization(const struct spread_sketch *ss, const char *key, size_t key_len) { - uint64_t hash_x_tmp = XXH3_64bits_withSeed(key, key_len, 171); +char *spread_sketch_get_hll_base64_serialization(const struct spread_sketch *ss, const char *entry, size_t entry_len) { + uint64_t hash_x_tmp = XXH3_64bits_withSeed(entry, entry_len, 171); uint32_t hash_x1 = (uint32_t) (hash_x_tmp >> 32); uint32_t hash_x2 = (uint32_t) hash_x_tmp; uint32_t *register_ret = NULL; @@ -572,7 +572,7 @@ struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src) { return dst; } -void spread_sketch_get_parameter_recommendation(int expected_super_spreader_number, int *depth_out, int *width_out, unsigned char *precision_out) +void spread_sketch_recommend_parameters(int expected_super_spreader_number, int *depth_out, int *width_out, unsigned char *precision_out) { int logk = expected_super_spreader_number >= 3200 ? 4 : 3; // lg3200 = 3.51,round up to 4 *depth_out = logk; @@ -699,16 +699,18 @@ void spread_sketch_merge_blob(struct spread_sketch *dst, const char *blob, size_ spread_sketch_free(src); } +struct spread_sketch *spread_sketch_replicate(uuid_t uuid, const char *blob, size_t blob_sz) { + return spread_sketch_deserialize(blob, blob_sz); +} + size_t spread_sketch_calculate_memory_usage(const struct spread_sketch *ss) { size_t ret = 0; ret += sizeof(struct spread_sketch); size_t bucket_size = sizeof(struct bucket) + hll_size(ss->precision); - printf("every bucket size: %zu\n", bucket_size); ret += ss->depth * ss->width * bucket_size; - printf("the number of content: %u\n", HASH_COUNT(ss->table->entry)); struct entry *content, *tmp; HASH_ITER(hh, ss->table->entry, content, tmp) { ret += sizeof(struct entry); diff --git a/src/cells/spread_sketch.h b/src/cells/spread_sketch.h index c15ad81..abd6de2 100644 --- a/src/cells/spread_sketch.h +++ b/src/cells/spread_sketch.h @@ -10,6 +10,7 @@ extern "C"{ #include <stdint.h> //uint64_t #include <sys/time.h> // struct timeval #include <stddef.h> +#include <uuid/uuid.h> #define DUMMY_ITEM_HASH (1ULL<<63) // level(left most zeros) = 0 @@ -20,34 +21,38 @@ struct spread_sketch *spread_sketch_new(int depth, int width, unsigned char prec void spread_sketch_free(struct spread_sketch *ss); void spread_sketch_set_exdata_schema(struct spread_sketch *ss, exdata_new_cb new_fn, exdata_free_cb free_fn, exdata_merge_cb merge_fn, exdata_reset_cb reset_fn, exdata_copy_cb copy_fn); -int spread_sketch_add_hash(struct spread_sketch *ss, const char *key, size_t key_length, uint64_t item_hash, void *arg, struct timeval now); -int spread_sketch_add(struct spread_sketch *ss, const char *key, size_t key_length, const char* item, size_t item_len, void *arg, struct timeval now); +int spread_sketch_add_hash(struct spread_sketch *ss, const char *entry, size_t entry_length, uint64_t item_hash, void *arg, struct timeval now); +int spread_sketch_add(struct spread_sketch *ss, const char *entry, size_t entry_length, const char* item, size_t item_len, void *arg, struct timeval now); -// get the number of keys stored in spread sketch +// get the number of entrys stored in spread sketch int spread_sketch_get_count(const struct spread_sketch *ss); -// list all the keys in spread sketch. User should free the arrays, but do not free the elements of strings in the array(because they are references to the internal data structure) -// Example: char **key; size_t *key_len; size_t n_keys; spread_sketch_list_keys(&key, &key_len, &n_keys); free(key); free(key_len); -void spread_sketch_list_keys(const struct spread_sketch *ss, char ***keys, size_t **key_lens, size_t *n_keys); -// query the cardinality(or fanout) of a key in spread sketch. -// Even thought spread sketch algorithm does not requires keys to exist innately, when querying a key that is not present in the spread sketch, `spread_sketch_get_cardinality` will return -1. -double spread_sketch_get_cardinality(const struct spread_sketch *ss, const char *key, size_t key_len); +// list all the entrys in spread sketch. User should free the arrays, but do not free the elements of strings in the array(because they are references to the internal data structure) +// Example: char **entry; size_t *entry_len; size_t n_entrys; spread_sketch_list_entrys(&entry, &entry_len, &n_entrys); free(entry); free(entry_len); +void spread_sketch_list_entries(const struct spread_sketch *ss, char ***entries, size_t **entry_lens, size_t *n_entry); +// query the cardinality(or fanout) of a entry in spread sketch. +// Even thought spread sketch algorithm does not requires entrys to exist innately, when querying a entry that is not present in the spread sketch, `spread_sketch_get_cardinality` will return -1. +double spread_sketch_get_cardinality(const struct spread_sketch *ss, const char *entry, size_t entry_len); // query a hyperloglog 's base64 serialization. The serialization format is [1,precision,register...] and then encoded by base64 -char *spread_sketch_get_hll_base64_serialization(const struct spread_sketch *ss, const char *key, size_t key_len); -void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *key, size_t key_len); +char *spread_sketch_get_hll_base64_serialization(const struct spread_sketch *ss, const char *entry, size_t entry_len); +void *spread_sketch_get0_exdata(const struct spread_sketch *ss, const char *entry, size_t entry_len); // in most cases, it has the same output as `spread_sketch_get_cardinality`, but it will perform more like an ordinary spread sketch query. -// Will always return a value, even if the key is not present in the spread sketch. Must pass a `now` value required by Stagger hll query. -double spread_sketch_query(const struct spread_sketch *ss, const char *key, size_t key_length, struct timeval now); +// Will always return a value, even if the entry is not present in the spread sketch. Must pass a `now` value required by Stagger hll query. +double spread_sketch_query(const struct spread_sketch *ss, const char *entry, size_t entry_length, struct timeval now); void spread_sketch_merge(struct spread_sketch *dest, const struct spread_sketch *src); struct spread_sketch *spread_sketch_copy(const struct spread_sketch *src); void spread_sketch_serialize(const struct spread_sketch *ss, char **blob, size_t *blob_sz); struct spread_sketch *spread_sketch_deserialize(const char *blob, size_t blob_sz); +// exactly the same as `spread_sketch_deserialize`. `uuid` is a dummy parameter.(since spread sketch itself performs very like set CRDT, it does not need uuid to sync replicas) +struct spread_sketch *spread_sketch_replicate(uuid_t uuid, const char *blob, size_t blob_sz); + +struct spread_sketch *spread_sketch_replicate(uuid_t uuid, const char *blob, size_t blob_sz); void spread_sketch_merge_blob(struct spread_sketch *dst, const char *blob, size_t blob_sz); void spread_sketch_reset(struct spread_sketch *ss); void spread_sketch_get_parameter(const struct spread_sketch *ss, int *depth_out, int *width_out, unsigned char *precision_out, int *time_window_ms_out); // spread sketch alway store values more than expected_query_num,expected_query_num is a hint to set spread sketch parameters properly -void spread_sketch_get_parameter_recommendation(int expected_super_spreader_number, int *depth_out, int *width_out, unsigned char *precision_out); +void spread_sketch_recommend_parameters(int expected_super_spreader_number, int *depth_out, int *width_out, unsigned char *precision_out); size_t spread_sketch_calculate_memory_usage(const struct spread_sketch *ss); #ifdef __cplusplus @@ -556,7 +556,7 @@ int cube_set_sampling(struct cube *cube, enum sampling_mode mode, int max_n_cell case SAMPLING_MODE_TOP_CARDINALITY: { int width, depth; unsigned char precision; - spread_sketch_get_parameter_recommendation(max_n_cell, &depth, &width, &precision); + spread_sketch_recommend_parameters(max_n_cell, &depth, &width, &precision); cube->spread_sketch = spread_sketch_new(depth, width, precision, 0, DUMMY_TIME_VAL); spread_sketch_set_exdata_schema(cube->spread_sketch, exdata_new_i, exdata_free_i, exdata_merge_i, exdata_reset_i, exdata_copy_i); break; } @@ -1093,7 +1093,7 @@ void cube_get_cells(const struct cube *cube, struct field_list **cell_dimensions heavy_keeper_list(cube->heavykeeper, (void **)cell_datas, n_cell_tmp); break; case SAMPLING_MODE_TOP_CARDINALITY: { - spread_sketch_list_keys(cube->spread_sketch, &spread_sketch_keys, &spread_sketch_keys_lens, &n_cell_tmp); + spread_sketch_list_entries(cube->spread_sketch, &spread_sketch_keys, &spread_sketch_keys_lens, &n_cell_tmp); for (int i = 0; i < n_cell_tmp; i++) { cell_datas[i] = spread_sketch_get0_exdata(cube->spread_sketch, spread_sketch_keys[i], spread_sketch_keys_lens[i]); } |
