summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorchenzizhan <[email protected]>2024-07-04 17:59:27 +0800
committerchenzizhan <[email protected]>2024-07-04 17:59:27 +0800
commit848b9ce1453b0a551920a1345c0279887a0ab96a (patch)
treeb4fda4ced65167c0bdaa955ab5bed5f638ab8a9a
parentef42511b52486a2b79b28b8c9cd3bb68fec2b940 (diff)
topk hash 1 time less
-rw-r--r--src/tags/heavy_keeper.c23
-rw-r--r--test/test_fuzz_test.cpp4
-rw-r--r--test/test_merge.cpp6
3 files changed, 13 insertions, 20 deletions
diff --git a/src/tags/heavy_keeper.c b/src/tags/heavy_keeper.c
index c363199..76ccefa 100644
--- a/src/tags/heavy_keeper.c
+++ b/src/tags/heavy_keeper.c
@@ -479,8 +479,8 @@ struct Bucket *new_sketch(struct heavy_keeper_options *params) {
}
void params_set_to_default(struct heavy_keeper_options *p, int K) {
- if (K > 1000) { // don't let the sketch too large when K gets insanely large
- K = 1000;
+ if (K > 5000) { // don't let the sketch too large when K gets insanely large
+ K = 5000;
}
double log_ret = log((double)K) / 2.3; // 2.3: log_e(10), log_ret = log_10(K)
@@ -563,16 +563,6 @@ unsigned int cal_hash_val_with_seed(const char *key, size_t key_len, unsigned in
return XXH3_64bits_withSeed(key, key_len, seed);
}
-struct Bucket *find_bucket_by_key(struct heavy_keeper *hk, int array_index, const char *key, size_t key_len) {
- int w = hk->params.max_bucket_num;
- unsigned int Hsh = cal_hash_val_with_seed(key, key_len, array_index + 1) % w; // +1: the let any row do not use FP as hashing key directly.
-
- // Otherwise, when different key has the same FP(in 2^-64 probability), they will also in the same bucket in the row #0 and hash collision will happen more severely.
-
- return &(hk->sketch[array_index * w + Hsh]);
-}
-
-
unsigned my_max(unsigned a, unsigned b) {
if (a > b) {
return a;
@@ -606,7 +596,8 @@ int heavy_keeper_add(struct heavy_keeper *heavy_keeper, const char *key, size_t
unsigned int FP = cal_hash_val_with_seed(key, key_len, FP_HASH_KEY);
for (int j = 0; j < heavy_keeper->params.array_num; j++) {
- struct Bucket *bucket = find_bucket_by_key(heavy_keeper, j, key, key_len);
+ unsigned int Hsh = j == FP_HASH_KEY ? FP : cal_hash_val_with_seed(key, key_len, j);
+ struct Bucket *bucket = &(heavy_keeper->sketch[j * heavy_keeper->params.max_bucket_num + (Hsh% heavy_keeper->params.max_bucket_num)]);
if (bucket->finger_print == FP) {
// If a key is not in the min-heap, then the estimated key size should be no larger than nmin.
@@ -616,8 +607,7 @@ int heavy_keeper_add(struct heavy_keeper *heavy_keeper, const char *key, size_t
// In this case, the sketch won't be updated. This key is expected to be taken into account in another array,
// where its FP is different from the one it should collided with, so that element keys won't be missed.
if (not_in_sorted_set) {
- unsigned tmp_min = sorted_set_get_min_count(summary);
- if (bucket->count > tmp_min) {
+ if (bucket->count > sorted_set_get_min_count(summary)) {
continue;
}
}
@@ -731,7 +721,8 @@ unsigned long long find_maxv_in_sketch(struct heavy_keeper *hk, const char *key,
unsigned long long maxv = 0;
for (int array_id = 0; array_id < hk->params.array_num; array_id++) {
- bucket = find_bucket_by_key(hk, array_id, key, key_len); // hk->sketch is the merge of two. So just check one
+ unsigned int hash = array_id == FP_HASH_KEY ? fp : cal_hash_val_with_seed(key, key_len, array_id);
+ bucket = &(hk->sketch[array_id * hk->params.max_bucket_num + (hash % hk->params.max_bucket_num)]);
if (bucket->finger_print == fp) {
maxv = MAX(maxv, bucket->count);
diff --git a/test/test_fuzz_test.cpp b/test/test_fuzz_test.cpp
index 351fe73..96e5405 100644
--- a/test/test_fuzz_test.cpp
+++ b/test/test_fuzz_test.cpp
@@ -307,7 +307,9 @@ TEST(Fuzz_test, many_instance_random_flow_unregister_calibrate_reset_fork_merge_
test_result.push_back(new Fieldstat_tag_list_wrapper(&tags[j]));
}
- EXPECT_GE(test_cal_topk_accuracy(test_result, count_map[Fieldstat_tag_list_wrapper(shared_tag_out).to_string()]), 0.9);
+ double accuracy = test_cal_topk_accuracy(test_result, count_map[Fieldstat_tag_list_wrapper(shared_tag_out).to_string()]);
+ EXPECT_GE(accuracy, 0.95);
+ // printf("topk accuracy: %lf\n", accuracy);
for (size_t j = 0; j < cell_num; j++) {
delete test_result[j];
diff --git a/test/test_merge.cpp b/test/test_merge.cpp
index f957555..8f88c80 100644
--- a/test/test_merge.cpp
+++ b/test/test_merge.cpp
@@ -447,9 +447,9 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_f
TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add)
{
int K = 100;
- vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(10000, K + 50); // let elephant flows in src and dest different
+ vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(30000, K + 50); // let elephant flows in src and dest different
struct fieldstat *instance_src = test_push_flows(flows_in_src, K);
- vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(10000, K + 50);
+ vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(30000, K + 50);
struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K);
fieldstat_merge(instance_dest, instance_src);
@@ -463,7 +463,7 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_m
flows_in_dest.insert(flows_in_dest.end(), std::make_move_iterator(flows_in_src.begin()), std::make_move_iterator(flows_in_src.end()));
double accuracy = test_cal_accuracy_given_expected_key(flows_in_dest, flows_in_merged);
- EXPECT_GE(accuracy, 0.87); // by heavy keeper benchmark, with K = 100, merging result should be about 0.96, for adding the flows will also cause some inaccuracy, so here we set 0.93
+ EXPECT_GE(accuracy, 0.87);
printf("merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add accuracy is %lf\n", accuracy);
fieldstat_free(instance_src);