summaryrefslogtreecommitdiff
path: root/test/test_merge.cpp
diff options
context:
space:
mode:
authorchenzizhan <[email protected]>2024-07-11 16:14:09 +0800
committerchenzizhan <[email protected]>2024-07-11 16:14:09 +0800
commit5dc3d8a96bb203abc1ee050cd0c884f2ab989dba (patch)
tree38f5bc67522843c1cca51a30e413e4f8f9d1834e /test/test_merge.cpp
parent677f337e195e3b9b6e416109df8d51c14da2791b (diff)
spread sketch merge, reset
Diffstat (limited to 'test/test_merge.cpp')
-rw-r--r--test/test_merge.cpp244
1 files changed, 234 insertions, 10 deletions
diff --git a/test/test_merge.cpp b/test/test_merge.cpp
index e51b90d..7a07baf 100644
--- a/test/test_merge.cpp
+++ b/test/test_merge.cpp
@@ -2,6 +2,7 @@
#include <gtest/gtest.h>
#include <set>
#include <unordered_map>
+#include <unordered_set>
#include "fieldstat.h"
#include "utils.hpp"
@@ -38,6 +39,13 @@ long long merge_test_fieldstat_counter_get(const struct fieldstat *instance, int
return ret;
}
+double merge_test_fieldstat_hll_get(const struct fieldstat *instance, int cube_id, int metric_id, const struct field_list *tag_list = &TEST_TAG_LIST_STRING)
+{
+ double ret = 0;
+ fieldstat_hll_get(instance, cube_id, tag_list, metric_id, &ret);
+ return ret;
+}
+
TEST(unit_test_merge, test_metric_name_mapping_by_adding_metric_to_dest)
{
struct fieldstat *instance = fieldstat_new();
@@ -362,7 +370,7 @@ TEST(unit_test_merge, new_too_many_cells_on_one_metric_given_source_cube_reset_a
fieldstat_tag_list_arr_free(tag_list, n_cell);
}
-struct fieldstat *test_push_flows(vector<Fieldstat_tag_list_wrapper *> &flows_in_test, int K, long long count = 1)
+struct fieldstat *topk_test_push_flows(vector<Fieldstat_tag_list_wrapper *> &flows_in_test, int K, long long count = 1)
{
struct fieldstat *instance = fieldstat_new();
int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_TOPK, K);
@@ -377,9 +385,9 @@ TEST(unit_test_merge, merge_accuracy_test_with_K_large_enough_topk)
{
int K = 100;
vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(K, K);
- struct fieldstat *instance_src = test_push_flows(flows_in_src, K);
+ struct fieldstat *instance_src = topk_test_push_flows(flows_in_src, K);
vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(K, K);
- struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K);
+ struct fieldstat *instance_dest = topk_test_push_flows(flows_in_dest, K);
fieldstat_merge(instance_dest, instance_src);
struct field_list *tag_list = NULL;
@@ -406,17 +414,17 @@ TEST(unit_test_merge, merge_accuracy_test_with_K_large_enough_topk)
}
}
-TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_flows_larger)
+TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_flows_larger_topk)
{
int K = 1000;
vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(10000, K);
- struct fieldstat *instance_src = test_push_flows(flows_in_src, K, 1000); // 1000 times larger than dest 1
+ struct fieldstat *instance_src = topk_test_push_flows(flows_in_src, K, 1000); // 1000 times larger than dest 1
vector<Fieldstat_tag_list_wrapper *> flows_in_dest;
for (int i = 0; i < K; i++) {
Fieldstat_tag_list_wrapper *tmp = new Fieldstat_tag_list_wrapper("flows in dest", to_string(i).c_str());
flows_in_dest.push_back(tmp);
}
- struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K, 1);
+ struct fieldstat *instance_dest = topk_test_push_flows(flows_in_dest, K, 1);
fieldstat_merge(instance_dest, instance_src);
@@ -444,13 +452,13 @@ TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_all_inserted_given_src_f
}
}
-TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add)
+TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add_topk)
{
int K = 100;
vector<Fieldstat_tag_list_wrapper *> flows_in_src = test_gen_topk_flows(30000, K + 50); // let elephant flows in src and dest different
- struct fieldstat *instance_src = test_push_flows(flows_in_src, K);
+ struct fieldstat *instance_src = topk_test_push_flows(flows_in_src, K);
vector<Fieldstat_tag_list_wrapper *> flows_in_dest = test_gen_topk_flows(30000, K + 50);
- struct fieldstat *instance_dest = test_push_flows(flows_in_dest, K);
+ struct fieldstat *instance_dest = topk_test_push_flows(flows_in_dest, K);
fieldstat_merge(instance_dest, instance_src);
struct field_list *tag_list = NULL;
@@ -523,11 +531,227 @@ TEST(unit_test_merge, primary_metric_id_different)
fieldstat_free(instance_dst);
}
+TEST(unit_test_merge, new_cube_and_metric_to_empty_spreadsketch) {
+ struct fieldstat *instance = fieldstat_new();
+ fieldstat_create_cube(instance, &TEST_TAG_INT, 1, SAMPLING_MODE_SPREADSKETCH, 10);
+ fieldstat_register_hll(instance, "metric", 6);
+
+ struct fieldstat *instance_dest = fieldstat_new();
+ fieldstat_merge(instance_dest, instance);
+
+ int *cube_id_dest;
+ int n_cube;
+ fieldstat_get_cubes(instance_dest, &cube_id_dest, &n_cube);
+ EXPECT_TRUE(n_cube == 1);
+ EXPECT_STREQ(fieldstat_get_metric_name(instance_dest, 0), "metric");
+
+ free(cube_id_dest);
+ fieldstat_free(instance);
+ fieldstat_free(instance_dest);
+}
+
+TEST(unit_test_merge, new_cell_on_existing_cube_and_metric_spreadsketch) {
+ struct fieldstat *instance = fieldstat_new();
+ int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, 10);
+ int metric_id = fieldstat_register_hll(instance, "metric", 6);
+ struct fieldstat *instance_dest = fieldstat_new();
+ fieldstat_merge(instance_dest, instance);
+
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "1", 1);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "2", 1);
+ fieldstat_merge(instance_dest, instance);
+
+ int *cube_id_dest;
+ int n_cube;
+ fieldstat_get_cubes(instance_dest, &cube_id_dest, &n_cube);
+ EXPECT_TRUE(n_cube == 1);
+ free(cube_id_dest);
+ EXPECT_STREQ(fieldstat_get_metric_name(instance_dest, 0), "metric");
+ long long measure = merge_test_fieldstat_hll_get(instance, cube_id, metric_id);
+ EXPECT_NEAR(measure, 2, 0.3);
+
+ struct field_list *tag_list = NULL;
+ size_t n_cell = 0;
+ fieldstat_cube_get_cells(instance, cube_id, &tag_list, &n_cell);
+ EXPECT_EQ(n_cell, 1);
+ EXPECT_EQ(tag_list->n_field, 1);
+ EXPECT_STREQ(tag_list->field[0].key, TEST_TAG_STRING.key);
+
+ fieldstat_free(instance);
+ fieldstat_free(instance_dest);
+ fieldstat_tag_list_arr_free(tag_list, n_cell);
+}
+
+TEST(unit_test_merge, merge_existing_cell_on_existing_cube_and_metric_spreadsketch) {
+ struct fieldstat *instance = fieldstat_new();
+ int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, 10);
+ int metric_id = fieldstat_register_hll(instance, "metric", 6);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "1", 1);
+ struct fieldstat *instance_dest = fieldstat_new();
+
+ fieldstat_merge(instance_dest, instance);
+ fieldstat_merge(instance_dest, instance);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "2", 1);
+ fieldstat_merge(instance_dest, instance);
+
+ struct field_list *tag_list = NULL;
+ size_t n_cell = 0;
+ fieldstat_cube_get_cells(instance_dest, cube_id, &tag_list, &n_cell);
+ EXPECT_EQ(n_cell, 1);
+ double value = merge_test_fieldstat_hll_get(instance_dest, cube_id, metric_id, &tag_list[0]);
+ EXPECT_NEAR(value, 2, 0.3);
+
+ fieldstat_free(instance);
+ fieldstat_free(instance_dest);
+ fieldstat_tag_list_arr_free(tag_list, n_cell);
+}
+
+TEST(unit_test_merge, new_too_many_cells_on_one_metric_given_source_cube_reset_and_get_different_cube_spreadsketch) {
+ struct fieldstat *instance = fieldstat_new();
+ int cube_id = fieldstat_create_cube(instance, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, 2);
+ int metric_id = fieldstat_register_hll(instance, "metric", 6);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_STRING, 1, "1", 1);
+ struct fieldstat *instance_dest = fieldstat_new();
+ fieldstat_merge(instance_dest, instance);
+
+ fieldstat_reset(instance);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_INT, 1, "21", 2);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_INT, 1, "22", 2);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_DOUBLE, 1, "31", 2);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_DOUBLE, 1, "32", 2);
+ fieldstat_hll_add(instance, cube_id, metric_id, &TEST_TAG_DOUBLE, 1, "33", 2);
+ fieldstat_merge(instance_dest, instance);
+
+ struct field_list *tag_list = NULL;
+ size_t n_cell = 0;
+ fieldstat_cube_get_cells(instance_dest, 0, &tag_list, &n_cell);
+ EXPECT_EQ(n_cell, 2);
+ EXPECT_NEAR(merge_test_fieldstat_hll_get(instance_dest, 0, 0, &tag_list[0]), 3, 0.3);
+ EXPECT_NEAR(merge_test_fieldstat_hll_get(instance_dest, 0, 0, &tag_list[1]), 2, 0.3);
+ EXPECT_STREQ(tag_list[0].field[0].key, TEST_TAG_DOUBLE.key);
+ EXPECT_STREQ(tag_list[1].field[0].key, TEST_TAG_INT.key);
+
+ fieldstat_free(instance);
+ fieldstat_free(instance_dest);
+ fieldstat_tag_list_arr_free(tag_list, n_cell);
+}
+
+TEST(unit_test_merge, gen_dest_full_all_src_inserted_given_src_flows_larger_spreadsketch) {
+ int K = 100;
+ SpreadSketchZipfGenerator flow_generator(1.0, K); // exactly the number of cells, so there will be almost all(in case of hash collision happen) cells added successfully
+ struct fieldstat *instance_src = fieldstat_new();
+ int cube_id = fieldstat_create_cube(instance_src, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, K);
+ int metric_id = fieldstat_register_hll(instance_src, "metric", 6);
+ struct fieldstat *instance_dest = fieldstat_fork(instance_src);
+ const char dest_key[] = "key of dest";
+ const char src_key[] = "key of src";
+
+ std::unordered_map<std::string, std::unordered_set<std::string>> flow_cnt;
+ for (int i = 0; i < 500000; i++) { // add more, so the fanout of any flow to src instance is more than dest
+ Flow flow = flow_generator.next();
+ Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(src_key, flow.src_ip.c_str());
+ Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str());
+ fieldstat_hll_add_field(instance_src, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count());
+
+ flow_cnt[dimension.to_string()].insert(item.to_string());
+ }
+
+ for (int i = 0; i < 1000; i++) {
+ Flow flow = flow_generator.next();
+ Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(dest_key, flow.src_ip.c_str());
+ Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str());
+ fieldstat_hll_add_field(instance_dest, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count());
+
+ flow_cnt[dimension.to_string()].insert(item.to_string());
+ }
+
+ fieldstat_merge(instance_dest, instance_src);
+
+ struct field_list *tag_list = NULL;
+ struct field_list *tag_list_src = NULL;
+ size_t n_cell = 0;
+ size_t n_cell_src = 0;
+ std::vector<struct Fieldstat_tag_list_wrapper *> test_result;
+ fieldstat_cube_get_cells(instance_dest, 0, &tag_list, &n_cell);
+ fieldstat_cube_get_cells(instance_src, 0, &tag_list_src, &n_cell_src);
+ for (size_t i = 0; i < n_cell; i++) {
+ test_result.push_back(new Fieldstat_tag_list_wrapper(&tag_list[i]));
+ }
+ std::unordered_map<std::string, int> expected_unique_cnt;
+ for (auto &kv : flow_cnt) {
+ expected_unique_cnt[kv.first] = kv.second.size();
+ }
+
+ double recall = test_cal_topk_accuracy(test_result, expected_unique_cnt);
+ EXPECT_NEAR(recall, n_cell_src * 1.0 / n_cell, 0.0001); // the false positive is only generated because some cells in src are left because of hash collision
+
+ fieldstat_free(instance_src);
+ fieldstat_free(instance_dest);
+ fieldstat_tag_list_arr_free(tag_list, n_cell);
+ fieldstat_tag_list_arr_free(tag_list_src, n_cell_src);
+ for (size_t i = 0; i < test_result.size(); i++) {
+ delete test_result[i];
+ }
+}
+
+TEST(unit_test_merge, merge_accuracy_test_gen_dest_full_some_inserted_and_some_merged_and_some_fail_to_add_spreadsketch) {
+ int K = 100;
+ SpreadSketchZipfGenerator flow_generator(1.0, K); // exactly the number of cells, so there will be almost all(in case of hash collision happen) cells added successfully
+ struct fieldstat *instance_src = fieldstat_new();
+ int cube_id = fieldstat_create_cube(instance_src, &TEST_SHARED_TAG, 1, SAMPLING_MODE_SPREADSKETCH, K);
+ int metric_id = fieldstat_register_hll(instance_src, "metric", 6);
+ struct fieldstat *instance_dest = fieldstat_fork(instance_src);
+
+ std::unordered_map<std::string, std::unordered_set<std::string>> flow_cnt;
+ for (int i = 0; i < 100000; i++) {
+ Flow flow = flow_generator.next();
+ const char *use_key = rand()%2? "src":"common";
+ Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(use_key, flow.src_ip.c_str());
+ Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str());
+ fieldstat_hll_add_field(instance_src, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count());
+
+ flow_cnt[dimension.to_string()].insert(item.to_string());
+ }
+ for (int i = 0; i < 100000; i++) {
+ Flow flow = flow_generator.next();
+ const char *use_key = rand()%2? "dest":"common";
+ Fieldstat_tag_list_wrapper dimension = Fieldstat_tag_list_wrapper(use_key, flow.src_ip.c_str());
+ Fieldstat_tag_list_wrapper item = Fieldstat_tag_list_wrapper("dummy", flow.dst_ip.c_str());
+ fieldstat_hll_add_field(instance_src, cube_id, metric_id, dimension.get_tag(), dimension.get_tag_count(), item.get_tag(), item.get_tag_count());
+
+ flow_cnt[dimension.to_string()].insert(item.to_string());
+ }
+
+ fieldstat_merge(instance_dest, instance_src);
+
+ struct field_list *tag_list = NULL;
+ size_t n_cell = 0;
+ std::vector<struct Fieldstat_tag_list_wrapper *> test_result;
+ fieldstat_cube_get_cells(instance_dest, 0, &tag_list, &n_cell);
+ for (size_t i = 0; i < n_cell; i++) {
+ test_result.push_back(new Fieldstat_tag_list_wrapper(&tag_list[i]));
+ }
+
+ std::unordered_map<std::string, int> expected_unique_cnt;
+ for (auto &kv : flow_cnt) {
+ expected_unique_cnt[kv.first] = kv.second.size();
+ }
+ double recall = test_cal_topk_accuracy(test_result, expected_unique_cnt);
+ EXPECT_GT(recall, 0.7);
+ printf("gen_dest_full_all_src_inserted_given_src_flows_larger_spreadsketch recall is %lf\n", recall);
+
+ fieldstat_free(instance_src);
+ fieldstat_free(instance_dest);
+ fieldstat_tag_list_arr_free(tag_list, n_cell);
+ for (size_t i = 0; i < test_result.size(); i++) {
+ delete test_result[i];
+ }
+}
int main(int argc, char *argv[])
{
testing::InitGoogleTest(&argc, argv);
- // testing::GTEST_FLAG(filter) = "unit_test_merge.merge_existing_cell_on_existing_cube_and_metric_topk";
+ testing::GTEST_FLAG(filter) = "*spreadsketch";
return RUN_ALL_TESTS();
} \ No newline at end of file