diff options
| author | chenzizhan <[email protected]> | 2024-07-11 16:14:09 +0800 |
|---|---|---|
| committer | chenzizhan <[email protected]> | 2024-07-11 16:14:09 +0800 |
| commit | 5dc3d8a96bb203abc1ee050cd0c884f2ab989dba (patch) | |
| tree | 38f5bc67522843c1cca51a30e413e4f8f9d1834e /test/test_merge.cpp | |
| parent | 677f337e195e3b9b6e416109df8d51c14da2791b (diff) | |
spread sketch merge, reset
Diffstat (limited to 'test/test_merge.cpp')
| -rw-r--r-- | test/test_merge.cpp | 244 |
1 files changed, 234 insertions, 10 deletions
diff --git a/test/test_merge.cpp b/test/test_merge.cpp index e51b90d..7a07baf 100644 --- a/test/test_merge.cpp +++ b/test/test_merge.cpp @@ -2,6 +2,7 @@ #include <gtest/gtest.h> #include <set> #include <unordered_map> +#include <unordered_set> #include "fieldstat.h" #include "utils.hpp" @@ -38,6 +39,13 @@ long long merge_test_fieldstat_counter_get(const struct fieldstat *instance, int return ret; } +double merge_test_fieldstat_hll_get(const struct fieldstat *instance, int cube_id, int metric_id, const struct field_list *tag_list = &TEST_TAG_LIST_STRING) +{ + double ret = 0; + fieldstat_hll_get(instance, cube_id, tag_list, metric_id, &ret); + return ret; +} + TEST(unit_test_merge, test_metric_name_mapping_by_adding_metric_to_dest) { struct fieldstat *instance = fieldstat_new(); @@ -362,7 +370,7 @@ TEST(unit_test_merge, new_too_many_cells_on_one_metric_given_source_cube_reset_a fieldstat_tag_list_arr_free(tag_list, n_cell); } -struct fieldstat *test_push_flows(vector<Fieldstat_tag_list_wrapper *> &flows_in_test, int K, long long count = 1) +struct fieldstat *topk_test_push_flows(vector<Fieldstat_tag_list_wrapper *> &flows_in_test, int K, long long count = 1) { struct fieldstat *instance = fieldstat_new(); int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_TOPK, K); @@ -377,9 +385,9 @@ TEST(unit_test_merge, merge_accuracy_test_with_K_large_enough_topk) { int K = 100; vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(K, K); - struct fieldstat *instance_src = test_push_flows(flows_in_src, K); + struct fieldstat *instance_src = topk_test_push_flows(flows_in_src, K); vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(K, K); - struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K); + struct fieldstat *instance_dest = topk_test_push_flows(flows_in_dest, K); fieldstat_merge(instance_dest, instance_src); struct field_list *tag_list = NULL; @@ -406,17 +414,17 @@ TEST(unit_test_merge, merge_accuracy_test_with_K_large_enough_topk) } } -TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_flows_larger) +TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_flows_larger_topk) { int K = 1000; vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(10000, K); - struct fieldstat *instance_src = test_push_flows(flows_in_src, K, 1000); // 1000 times larger than dest 1 + struct fieldstat *instance_src = topk_test_push_flows(flows_in_src, K, 1000); // 1000 times larger than dest 1 vector<Fieldstat_tag_list_wrapper *> flows_in_dest; for (int i = 0; i < K; i++) { Fieldstat_tag_list_wrapper *tmp = new Fieldstat_tag_list_wrapper("flows in dest", to_string(i).c_str()); flows_in_dest.push_back(tmp); } - struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K, 1); + struct fieldstat *instance_dest = topk_test_push_flows(flows_in_dest, K, 1); fieldstat_merge(instance_dest, instance_src); @@ -444,13 +452,13 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_f } } -TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add) +TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add_topk) { int K = 100; vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(30000, K + 50); // let elephant flows in src and dest different - struct fieldstat *instance_src = test_push_flows(flows_in_src, K); + struct fieldstat *instance_src = topk_test_push_flows(flows_in_src, K); vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(30000, K + 50); - struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K); + struct fieldstat *instance_dest = topk_test_push_flows(flows_in_dest, K); fieldstat_merge(instance_dest, instance_src); struct field_list *tag_list = NULL; @@ -523,11 +531,227 @@ TEST(unit_test_merge, primary_metric_id_different) fieldstat_free(instance_dst); } +TEST(unit_test_merge, new_cube_and_metric_to_empty_spreadsketch) { + struct fieldstat *instance = fieldstat_new(); + fieldstat_create_cube(instance, &TEST_TAG_INT, 1, SAMPLING_MODE_SPREADSKETCH, 10); + fieldstat_register_hll(instance, "metric", 6); + + struct fieldstat *instance_dest = fieldstat_new(); + fieldstat_merge(instance_dest, instance); + + int *cube_id_dest; + int n_cube; + fieldstat_get_cubes(instance_dest, &cube_id_dest, &n_cube); + EXPECT_TRUE(n_cube == 1); + EXPECT_STREQ(fieldstat_get_metric_name(instance_dest, 0), "metric"); + + free(cube_id_dest); + fieldstat_free(instance); + fieldstat_free(instance_dest); +} + +TEST(unit_test_merge, new_cell_on_existing_cube_and_metric_spreadsketch) { + struct fieldstat *instance = fieldstat_new(); + int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, 10); + int metric_id = fieldstat_register_hll(instance, "metric", 6); + struct fieldstat *instance_dest = fieldstat_new(); + fieldstat_merge(instance_dest, instance); + + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "1", 1); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "2", 1); + fieldstat_merge(instance_dest, instance); + + int *cube_id_dest; + int n_cube; + fieldstat_get_cubes(instance_dest, &cube_id_dest, &n_cube); + EXPECT_TRUE(n_cube == 1); + free(cube_id_dest); + EXPECT_STREQ(fieldstat_get_metric_name(instance_dest, 0), "metric"); + long long measure = merge_test_fieldstat_hll_get(instance, cube_id, metric_id); + EXPECT_NEAR(measure, 2, 0.3); + + struct field_list *tag_list = NULL; + size_t n_cell = 0; + fieldstat_cube_get_cells(instance, cube_id, &tag_list, &n_cell); + EXPECT_EQ(n_cell, 1); + EXPECT_EQ(tag_list->n_field, 1); + EXPECT_STREQ(tag_list->field[0].key, TEST_TAG_STRING.key); + + fieldstat_free(instance); + fieldstat_free(instance_dest); + fieldstat_tag_list_arr_free(tag_list, n_cell); +} + +TEST(unit_test_merge, merge_existing_cell_on_existing_cube_and_metric_spreadsketch) { + struct fieldstat *instance = fieldstat_new(); + int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, 10); + int metric_id = fieldstat_register_hll(instance, "metric", 6); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "1", 1); + struct fieldstat *instance_dest = fieldstat_new(); + + fieldstat_merge(instance_dest, instance); + fieldstat_merge(instance_dest, instance); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "2", 1); + fieldstat_merge(instance_dest, instance); + + struct field_list *tag_list = NULL; + size_t n_cell = 0; + fieldstat_cube_get_cells(instance_dest, cube_id, &tag_list, &n_cell); + EXPECT_EQ(n_cell, 1); + double value = merge_test_fieldstat_hll_get(instance_dest, cube_id, metric_id, &tag_list[0]); + EXPECT_NEAR(value, 2, 0.3); + + fieldstat_free(instance); + fieldstat_free(instance_dest); + fieldstat_tag_list_arr_free(tag_list, n_cell); +} + +TEST(unit_test_merge, new_too_many_cells_on_one_metric_given_source_cube_reset_and_get_different_cube_spreadsketch) { + struct fieldstat *instance = fieldstat_new(); + int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, 2); + int metric_id = fieldstat_register_hll(instance, "metric", 6); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "1", 1); + struct fieldstat *instance_dest = fieldstat_new(); + fieldstat_merge(instance_dest, instance); + + fieldstat_reset(instance); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_INT, 1, "21", 2); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_INT, 1, "22", 2); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_DOUBLE, 1, "31", 2); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_DOUBLE, 1, "32", 2); + fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_DOUBLE, 1, "33", 2); + fieldstat_merge(instance_dest, instance); + + struct field_list *tag_list = NULL; + size_t n_cell = 0; + fieldstat_cube_get_cells(instance_dest, 0, &tag_list, &n_cell); + EXPECT_EQ(n_cell, 2); + EXPECT_NEAR(merge_test_fieldstat_hll_get(instance_dest, 0, 0, &tag_list[0]), 3, 0.3); + EXPECT_NEAR(merge_test_fieldstat_hll_get(instance_dest, 0, 0, &tag_list[1]), 2, 0.3); + EXPECT_STREQ(tag_list[0].field[0].key, TEST_TAG_DOUBLE.key); + EXPECT_STREQ(tag_list[1].field[0].key, TEST_TAG_INT.key); + + fieldstat_free(instance); + fieldstat_free(instance_dest); + fieldstat_tag_list_arr_free(tag_list, n_cell); +} + +TEST(unit_test_merge, gen_dest_full_all_src_inserted_given_src_flows_larger_spreadsketch) { + int K = 100; + SpreadSketchZipfGenerator flow_generator(1.0, K); // exactly the number of cells, so there will be almost all(in case of hash collision happen) cells added successfully + struct fieldstat *instance_src = fieldstat_new(); + int cube_id = fieldstat_create_cube(instance_src, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, K); + int metric_id = fieldstat_register_hll(instance_src, "metric", 6); + struct fieldstat *instance_dest = fieldstat_fork(instance_src); + const char dest_key[] = "key of dest"; + const char src_key[] = "key of src"; + + std::unordered_map<std::string, std::unordered_set<std::string>> flow_cnt; + for (int i = 0; i < 500000; i++) { // add more, so the fanout of any flow to src instance is more than dest + Flow flow = flow_generator.next(); + Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(src_key, flow.src_ip.c_str()); + Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str()); + fieldstat_hll_add_field(instance_src, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count()); + + flow_cnt[dimension.to_string()].insert(item.to_string()); + } + + for (int i = 0; i < 1000; i++) { + Flow flow = flow_generator.next(); + Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(dest_key, flow.src_ip.c_str()); + Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str()); + fieldstat_hll_add_field(instance_dest, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count()); + + flow_cnt[dimension.to_string()].insert(item.to_string()); + } + + fieldstat_merge(instance_dest, instance_src); + + struct field_list *tag_list = NULL; + struct field_list *tag_list_src = NULL; + size_t n_cell = 0; + size_t n_cell_src = 0; + std::vector<struct Fieldstat_tag_list_wrapper *> test_result; + fieldstat_cube_get_cells(instance_dest, 0, &tag_list, &n_cell); + fieldstat_cube_get_cells(instance_src, 0, &tag_list_src, &n_cell_src); + for (size_t i = 0; i < n_cell; i++) { + test_result.push_back(new Fieldstat_tag_list_wrapper(&tag_list[i])); + } + std::unordered_map<std::string, int> expected_unique_cnt; + for (auto &kv : flow_cnt) { + expected_unique_cnt[kv.first] = kv.second.size(); + } + + double recall = test_cal_topk_accuracy(test_result, expected_unique_cnt); + EXPECT_NEAR(recall, n_cell_src * 1.0 / n_cell, 0.0001); // the false positive is only generated because some cells in src are left because of hash collision + + fieldstat_free(instance_src); + fieldstat_free(instance_dest); + fieldstat_tag_list_arr_free(tag_list, n_cell); + fieldstat_tag_list_arr_free(tag_list_src, n_cell_src); + for (size_t i = 0; i < test_result.size(); i++) { + delete test_result[i]; + } +} + +TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add_spreadsketch) { + int K = 100; + SpreadSketchZipfGenerator flow_generator(1.0, K); // exactly the number of cells, so there will be almost all(in case of hash collision happen) cells added successfully + struct fieldstat *instance_src = fieldstat_new(); + int cube_id = fieldstat_create_cube(instance_src, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, K); + int metric_id = fieldstat_register_hll(instance_src, "metric", 6); + struct fieldstat *instance_dest = fieldstat_fork(instance_src); + + std::unordered_map<std::string, std::unordered_set<std::string>> flow_cnt; + for (int i = 0; i < 100000; i++) { + Flow flow = flow_generator.next(); + const char *use_key = rand()%2? "src":"common"; + Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(use_key, flow.src_ip.c_str()); + Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str()); + fieldstat_hll_add_field(instance_src, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count()); + + flow_cnt[dimension.to_string()].insert(item.to_string()); + } + for (int i = 0; i < 100000; i++) { + Flow flow = flow_generator.next(); + const char *use_key = rand()%2? "dest":"common"; + Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(use_key, flow.src_ip.c_str()); + Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str()); + fieldstat_hll_add_field(instance_src, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count()); + + flow_cnt[dimension.to_string()].insert(item.to_string()); + } + + fieldstat_merge(instance_dest, instance_src); + + struct field_list *tag_list = NULL; + size_t n_cell = 0; + std::vector<struct Fieldstat_tag_list_wrapper *> test_result; + fieldstat_cube_get_cells(instance_dest, 0, &tag_list, &n_cell); + for (size_t i = 0; i < n_cell; i++) { + test_result.push_back(new Fieldstat_tag_list_wrapper(&tag_list[i])); + } + + std::unordered_map<std::string, int> expected_unique_cnt; + for (auto &kv : flow_cnt) { + expected_unique_cnt[kv.first] = kv.second.size(); + } + double recall = test_cal_topk_accuracy(test_result, expected_unique_cnt); + EXPECT_GT(recall, 0.7); + printf("gen_dest_full_all_src_inserted_given_src_flows_larger_spreadsketch recall is %lf\n", recall); + + fieldstat_free(instance_src); + fieldstat_free(instance_dest); + fieldstat_tag_list_arr_free(tag_list, n_cell); + for (size_t i = 0; i < test_result.size(); i++) { + delete test_result[i]; + } +} int main(int argc, char *argv[]) { testing::InitGoogleTest(&argc, argv); - // testing::GTEST_FLAG(filter) = "unit_test_merge.merge_existing_cell_on_existing_cube_and_metric_topk"; + testing::GTEST_FLAG(filter) = "*spreadsketch"; return RUN_ALL_TESTS(); }
\ No newline at end of file |
