spread sketch merge, reset

author: chenzizhan <[email protected]> 2024-07-11 16:14:09 +0800
committer: chenzizhan <[email protected]> 2024-07-11 16:14:09 +0800
commit: 5dc3d8a96bb203abc1ee050cd0c884f2ab989dba (patch)
tree: 38f5bc67522843c1cca51a30e413e4f8f9d1834e /test/test_metric_hll.cpp
parent: 677f337e195e3b9b6e416109df8d51c14da2791b (diff)
1 files changed, 68 insertions, 0 deletions
diff --git a/test/test_metric_hll.cpp b/test/test_metric_hll.cpp
index 8e96266..7c38de2 100644
--- a/test/test_metric_hll.cpp
+++ b/test/test_metric_hll.cpp
@@ -1,4 +1,10 @@
 #include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <math.h>
+
 #include "fieldstat.h"
 #include "utils.hpp"
 
@@ -216,9 +222,71 @@ TEST(metric_test_hll, add_with_wrong_metric_id_expecting_fail)
     fieldstat_free(instance);
 }
 
+// Accuracy check for SAMPLING_MODE_SPREADSKETCH: adds n_flows (src ip, dst ip) pairs from SpreadSketchZipfGenerator to a cube
+// created with K, then expects recall of the returned cells >= 0.8 and mean relative error of their HLL values <= 0.2.
+TEST(metric_test_hll, spread_sketch_add_and_test_accuracy)
+{
+    struct fieldstat *instance = fieldstat_new();
+    int K = 10;
+    fieldstat_create_cube(instance, &TEST_TAG_INT_collided, 1, SAMPLING_MODE_SPREADSKETCH, K);
+    fieldstat_register_hll(instance, "testss", 6);
+
+    int n_flows = 100000;
+    // Ground truth: for every dimension string, the exact set of distinct counted strings that were added.
+    std::unordered_map<std::string, std::unordered_set<std::string>> flow_cnt;
+    SpreadSketchZipfGenerator generator(1.0, K * 10); // give much bigger distribution, so that we can test the accuracy
+    for (int i = 0; i < n_flows; i++) {
+        Flow f = generator.next();
+        Fieldstat_tag_list_wrapper dimension("src ip", f.src_ip.c_str());
+        Fieldstat_tag_list_wrapper counted("dst ip", f.dst_ip.c_str());
+        fieldstat_hll_add_field(instance, 0, 0, dimension.get_tag(), dimension.get_tag_count(), counted.get_tag(), counted.get_tag_count());
+        flow_cnt[dimension.to_string()].insert(counted.to_string());
+    }
+
+    // recall: compare the cells returned by the cube against the exact per-dimension distinct counts
+    std::unordered_map<std::string, int> expected_unique_cnt;
+    std::vector<struct Fieldstat_tag_list_wrapper *> test_result;
+    for (auto &kv : flow_cnt) {
+        expected_unique_cnt[kv.first] = kv.second.size();
+    }
+
+    struct field_list *tag_list = NULL;
+    size_t n_cell = 0;
+    fieldstat_cube_get_cells(instance, 0, &tag_list, &n_cell);
+    EXPECT_EQ(n_cell, static_cast<size_t>(K)); // cast avoids a signed/unsigned comparison (n_cell is size_t, K is int)
+    for (size_t i = 0; i < n_cell; i++) {
+        Fieldstat_tag_list_wrapper tmp = Fieldstat_tag_list_wrapper(&tag_list[i]);
+        test_result.push_back(new Fieldstat_tag_list_wrapper(tmp)); // heap copies; deleted at the end of this test
+    }
+    double recall = test_cal_topk_accuracy(test_result, expected_unique_cnt);
+    printf("spread_sketch_add_and_test_accuracy recall: %f\n", recall);
+    EXPECT_GE(recall, 0.8);
+
+    // MRE: mean of |true - estimated| / true over the returned cells
+    double mre = 0;
+    for (size_t i = 0; i < n_cell; i++) {
+        Fieldstat_tag_list_wrapper tmp = Fieldstat_tag_list_wrapper(&tag_list[i]);
+        // find() instead of operator[]: an unknown cell must not silently insert a 0 and then be divided by
+        auto truth = expected_unique_cnt.find(tmp.to_string());
+        double value_true = (truth == expected_unique_cnt.end()) ? 0.0 : truth->second;
+        EXPECT_GT(value_true, 0.0) << "no ground truth for returned cell " << tmp.to_string();
+        double value_est = 0; // initialised so a call that writes nothing cannot leave an indeterminate value
+        fieldstat_hll_get(instance, 0, &tag_list[i], 0, &value_est);
+        mre += (value_true > 0) ? fabs(value_true - value_est) / value_true : 1.0; // unknown cell counts as 100% error
+    }
+    mre = (n_cell > 0) ? mre / n_cell : 1.0; // no cells returned: report the worst case instead of 0/0 (NaN)
+    printf("spread_sketch_add_and_test_accuracy Mean ratio e: %f\n", mre);
+    EXPECT_LE(mre, 0.2);
+    fieldstat_tag_list_arr_free(tag_list, n_cell);
+    fieldstat_free(instance);
+    for (auto &ptr : test_result) {
+        delete ptr;
+    }
+}
 
 int main(int argc, char *argv[]) 
 {
 	testing::InitGoogleTest(&argc, argv);
+    // Debugging aid: uncomment to run only the spread sketch accuracy test: testing::GTEST_FLAG(filter) = "metric_test_hll.spread_sketch_add_and_test_accuracy";
 	return RUN_ALL_TESTS();
 }
author	chenzizhan <[email protected]>	2024-07-11 16:14:09 +0800
committer	chenzizhan <[email protected]>	2024-07-11 16:14:09 +0800
commit	5dc3d8a96bb203abc1ee050cd0c884f2ab989dba (patch)
tree	38f5bc67522843c1cca51a30e413e4f8f9d1834e /test/test_metric_hll.cpp
parent	677f337e195e3b9b6e416109df8d51c14da2791b (diff)