topk hash 1 time less

author: chenzizhan <[email protected]> 2024-07-04 17:59:27 +0800
committer: chenzizhan <[email protected]> 2024-07-04 17:59:27 +0800
commit: 848b9ce1453b0a551920a1345c0279887a0ab96a (patch)
tree: b4fda4ced65167c0bdaa955ab5bed5f638ab8a9a
parent: ef42511b52486a2b79b28b8c9cd3bb68fec2b940 (diff)
3 files changed, 13 insertions, 20 deletions
diff --git a/src/tags/heavy_keeper.c b/src/tags/heavy_keeper.c
index c363199..76ccefa 100644
--- a/src/tags/heavy_keeper.c
+++ b/src/tags/heavy_keeper.c
@@ -479,8 +479,8 @@ struct Bucket *new_sketch(struct heavy_keeper_options *params) {
 }
 
 void params_set_to_default(struct heavy_keeper_options *p, int K) {
-    if (K > 1000) { // don't let the sketch too large when K gets insanely large
-        K = 1000;
+    if (K > 5000) { // don't let the sketch too large when K gets insanely large
+        K = 5000;
     }
     double log_ret = log((double)K) / 2.3;  // 2.3: log_e(10), log_ret = log_10(K)
 
@@ -563,16 +563,6 @@ unsigned int cal_hash_val_with_seed(const char *key, size_t key_len, unsigned in
     return XXH3_64bits_withSeed(key, key_len, seed);
 }
 
-struct Bucket *find_bucket_by_key(struct heavy_keeper *hk, int array_index, const char *key, size_t key_len) {
-    int w = hk->params.max_bucket_num;
-    unsigned int Hsh = cal_hash_val_with_seed(key, key_len, array_index + 1) % w; // +1: the let any row do not use FP as hashing key directly.
-
-    // Otherwise, when different key has the same FP(in 2^-64 probability), they will also in the same bucket in the row #0 and hash collision will happen more severely.
-
-    return &(hk->sketch[array_index * w + Hsh]);
-}
-
-
 unsigned my_max(unsigned a, unsigned b) {
     if (a > b) {
         return a;
@@ -606,7 +596,8 @@ int heavy_keeper_add(struct heavy_keeper *heavy_keeper, const char *key, size_t
     unsigned int FP = cal_hash_val_with_seed(key, key_len, FP_HASH_KEY);
 
     for (int j = 0; j < heavy_keeper->params.array_num; j++) {
-        struct Bucket *bucket = find_bucket_by_key(heavy_keeper, j, key, key_len);
+        unsigned int Hsh = j == FP_HASH_KEY ? FP : cal_hash_val_with_seed(key, key_len, j);
+        struct Bucket *bucket = &(heavy_keeper->sketch[j * heavy_keeper->params.max_bucket_num + (Hsh% heavy_keeper->params.max_bucket_num)]);
 
         if (bucket->finger_print == FP) {
             // If a key is not in the min-heap, then the estimated key size should be no larger than nmin.
@@ -616,8 +607,7 @@ int heavy_keeper_add(struct heavy_keeper *heavy_keeper, const char *key, size_t
             // In this case, the sketch won't be updated. This key is expected to be taken into account in another array,
             //  where its FP is different from the one it should collided with, so that element keys won't be missed.
             if (not_in_sorted_set) {
-                unsigned tmp_min = sorted_set_get_min_count(summary);
-                if (bucket->count > tmp_min) {
+                if (bucket->count > sorted_set_get_min_count(summary)) {
                     continue;
                 }
             }
@@ -731,7 +721,8 @@ unsigned long long find_maxv_in_sketch(struct heavy_keeper *hk, const char *key,
 
     unsigned long long maxv = 0;
     for (int array_id = 0; array_id < hk->params.array_num; array_id++) {
-        bucket = find_bucket_by_key(hk, array_id, key, key_len);  // hk->sketch is the merge of two. So just check one
+        unsigned int hash = array_id == FP_HASH_KEY ? fp : cal_hash_val_with_seed(key, key_len, array_id);
+        bucket = &(hk->sketch[array_id * hk->params.max_bucket_num + (hash % hk->params.max_bucket_num)]);
 
         if (bucket->finger_print == fp) {
             maxv = MAX(maxv, bucket->count);
diff --git a/test/test_fuzz_test.cpp b/test/test_fuzz_test.cpp
index 351fe73..96e5405 100644
--- a/test/test_fuzz_test.cpp
+++ b/test/test_fuzz_test.cpp
@@ -307,7 +307,9 @@ TEST(Fuzz_test, many_instance_random_flow_unregister_calibrate_reset_fork_merge_
             test_result.push_back(new Fieldstat_tag_list_wrapper(&tags[j]));
         }
 
-        EXPECT_GE(test_cal_topk_accuracy(test_result, count_map[Fieldstat_tag_list_wrapper(shared_tag_out).to_string()]), 0.9);
+        double accuracy = test_cal_topk_accuracy(test_result, count_map[Fieldstat_tag_list_wrapper(shared_tag_out).to_string()]);
+        EXPECT_GE(accuracy, 0.95);
+        // printf("topk accuracy: %lf\n", accuracy);
 
         for (size_t j = 0; j < cell_num; j++) {
             delete test_result[j];
diff --git a/test/test_merge.cpp b/test/test_merge.cpp
index f957555..8f88c80 100644
--- a/test/test_merge.cpp
+++ b/test/test_merge.cpp
@@ -447,9 +447,9 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_f
 TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add)
 {
     int K = 100;
-    vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(10000, K + 50); // let elephant flows in src and dest different
+    vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(30000, K + 50); // let elephant flows in src and dest different
     struct fieldstat *instance_src = test_push_flows(flows_in_src, K);
-    vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(10000, K + 50);
+    vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(30000, K + 50);
     struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K);
     fieldstat_merge(instance_dest, instance_src);
 
@@ -463,7 +463,7 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_m
     
     flows_in_dest.insert(flows_in_dest.end(), std::make_move_iterator(flows_in_src.begin()), std::make_move_iterator(flows_in_src.end()));
     double accuracy = test_cal_accuracy_given_expected_key(flows_in_dest, flows_in_merged);
-    EXPECT_GE(accuracy, 0.87); // by heavy keeper benchmark, with K = 100, merging result should be about 0.96, for adding the flows will also cause some inaccuracy, so here we set 0.93
+    EXPECT_GE(accuracy, 0.87);
     printf("merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add accuracy is %lf\n", accuracy);
 
     fieldstat_free(instance_src);
author	chenzizhan <[email protected]>	2024-07-04 17:59:27 +0800
committer	chenzizhan <[email protected]>	2024-07-04 17:59:27 +0800
commit	848b9ce1453b0a551920a1345c0279887a0ab96a (patch)
tree	b4fda4ced65167c0bdaa955ab5bed5f638ab8a9a
parent	ef42511b52486a2b79b28b8c9cd3bb68fec2b940 (diff)