diff options
| author | chenzizhan <[email protected]> | 2024-07-04 17:59:27 +0800 |
|---|---|---|
| committer | chenzizhan <[email protected]> | 2024-07-04 17:59:27 +0800 |
| commit | 848b9ce1453b0a551920a1345c0279887a0ab96a (patch) | |
| tree | b4fda4ced65167c0bdaa955ab5bed5f638ab8a9a | |
| parent | ef42511b52486a2b79b28b8c9cd3bb68fec2b940 (diff) | |
topk hash 1 time less
| -rw-r--r-- | src/tags/heavy_keeper.c | 23 | ||||
| -rw-r--r-- | test/test_fuzz_test.cpp | 4 | ||||
| -rw-r--r-- | test/test_merge.cpp | 6 |
3 files changed, 13 insertions, 20 deletions
diff --git a/src/tags/heavy_keeper.c b/src/tags/heavy_keeper.c index c363199..76ccefa 100644 --- a/src/tags/heavy_keeper.c +++ b/src/tags/heavy_keeper.c @@ -479,8 +479,8 @@ struct Bucket *new_sketch(struct heavy_keeper_options *params) { } void params_set_to_default(struct heavy_keeper_options *p, int K) { - if (K > 1000) { // don't let the sketch too large when K gets insanely large - K = 1000; + if (K > 5000) { // don't let the sketch too large when K gets insanely large + K = 5000; } double log_ret = log((double)K) / 2.3; // 2.3: log_e(10), log_ret = log_10(K) @@ -563,16 +563,6 @@ unsigned int cal_hash_val_with_seed(const char *key, size_t key_len, unsigned in return XXH3_64bits_withSeed(key, key_len, seed); } -struct Bucket *find_bucket_by_key(struct heavy_keeper *hk, int array_index, const char *key, size_t key_len) { - int w = hk->params.max_bucket_num; - unsigned int Hsh = cal_hash_val_with_seed(key, key_len, array_index + 1) % w; // +1: the let any row do not use FP as hashing key directly. - - // Otherwise, when different key has the same FP(in 2^-64 probability), they will also in the same bucket in the row #0 and hash collision will happen more severely. - - return &(hk->sketch[array_index * w + Hsh]); -} - - unsigned my_max(unsigned a, unsigned b) { if (a > b) { return a; @@ -606,7 +596,8 @@ int heavy_keeper_add(struct heavy_keeper *heavy_keeper, const char *key, size_t unsigned int FP = cal_hash_val_with_seed(key, key_len, FP_HASH_KEY); for (int j = 0; j < heavy_keeper->params.array_num; j++) { - struct Bucket *bucket = find_bucket_by_key(heavy_keeper, j, key, key_len); + unsigned int Hsh = j == FP_HASH_KEY ? FP : cal_hash_val_with_seed(key, key_len, j); + struct Bucket *bucket = &(heavy_keeper->sketch[j * heavy_keeper->params.max_bucket_num + (Hsh% heavy_keeper->params.max_bucket_num)]); if (bucket->finger_print == FP) { // If a key is not in the min-heap, then the estimated key size should be no larger than nmin. @@ -616,8 +607,7 @@ int heavy_keeper_add(struct heavy_keeper *heavy_keeper, const char *key, size_t // In this case, the sketch won't be updated. This key is expected to be taken into account in another array, // where its FP is different from the one it should collided with, so that element keys won't be missed. if (not_in_sorted_set) { - unsigned tmp_min = sorted_set_get_min_count(summary); - if (bucket->count > tmp_min) { + if (bucket->count > sorted_set_get_min_count(summary)) { continue; } } @@ -731,7 +721,8 @@ unsigned long long find_maxv_in_sketch(struct heavy_keeper *hk, const char *key, unsigned long long maxv = 0; for (int array_id = 0; array_id < hk->params.array_num; array_id++) { - bucket = find_bucket_by_key(hk, array_id, key, key_len); // hk->sketch is the merge of two. So just check one + unsigned int hash = array_id == FP_HASH_KEY ? fp : cal_hash_val_with_seed(key, key_len, array_id); + bucket = &(hk->sketch[array_id * hk->params.max_bucket_num + (hash % hk->params.max_bucket_num)]); if (bucket->finger_print == fp) { maxv = MAX(maxv, bucket->count); diff --git a/test/test_fuzz_test.cpp b/test/test_fuzz_test.cpp index 351fe73..96e5405 100644 --- a/test/test_fuzz_test.cpp +++ b/test/test_fuzz_test.cpp @@ -307,7 +307,9 @@ TEST(Fuzz_test, many_instance_random_flow_unregister_calibrate_reset_fork_merge_ test_result.push_back(new Fieldstat_tag_list_wrapper(&tags[j])); } - EXPECT_GE(test_cal_topk_accuracy(test_result, count_map[Fieldstat_tag_list_wrapper(shared_tag_out).to_string()]), 0.9); + double accuracy = test_cal_topk_accuracy(test_result, count_map[Fieldstat_tag_list_wrapper(shared_tag_out).to_string()]); + EXPECT_GE(accuracy, 0.95); + // printf("topk accuracy: %lf\n", accuracy); for (size_t j = 0; j < cell_num; j++) { delete test_result[j]; diff --git a/test/test_merge.cpp b/test/test_merge.cpp index f957555..8f88c80 100644 --- a/test/test_merge.cpp +++ b/test/test_merge.cpp @@ -447,9 +447,9 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_f TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add) { int K = 100; - vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(10000, K + 50); // let elephant flows in src and dest different + vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(30000, K + 50); // let elephant flows in src and dest different struct fieldstat *instance_src = test_push_flows(flows_in_src, K); - vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(10000, K + 50); + vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(30000, K + 50); struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K); fieldstat_merge(instance_dest, instance_src); @@ -463,7 +463,7 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_m flows_in_dest.insert(flows_in_dest.end(), std::make_move_iterator(flows_in_src.begin()), std::make_move_iterator(flows_in_src.end())); double accuracy = test_cal_accuracy_given_expected_key(flows_in_dest, flows_in_merged); - EXPECT_GE(accuracy, 0.87); // by heavy keeper benchmark, with K = 100, merging result should be about 0.96, for adding the flows will also cause some inaccuracy, so here we set 0.93 + EXPECT_GE(accuracy, 0.87); printf("merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add accuracy is %lf\n", accuracy); fieldstat_free(instance_src); |
