STHLL: The linear counting for small cardinalities should not ignore 1/8 of lowest registers.

author: 郑超 <[email protected]> 2024-03-21 11:55:26 +0000
committer: 郑超 <[email protected]> 2024-03-21 11:55:26 +0000
commit: 7eebfad2cf995da5ebe5fee908a3c911c1ce730a (patch)
tree: a5444c31a1279797c7d236b5e756e8f907b1c87b
parent: 6b49d228c93be4335aa95cd70a16c3517496554d (diff)
5 files changed, 64 insertions, 21 deletions
diff --git a/CRDT/cm_sketch.h b/CRDT/cm_sketch.h
index c106c58..38d77c3 100644
--- a/CRDT/cm_sketch.h
+++ b/CRDT/cm_sketch.h
@@ -4,6 +4,7 @@
 * Author: [email protected]
 * Count-Min Sketch is a probabilistic data-structure that takes sub linear space to store the probable count, 
 * or frequency, of occurrences of elements added into the data-structure.
+* Most of the code is from https://github.com/barrust/count-min-sketch.
 */
 
 #pragma once
diff --git a/CRDT/probabilistic_crdt_gtest.cpp b/CRDT/probabilistic_crdt_gtest.cpp
index 9c8bf14..8fd4de0 100644
--- a/CRDT/probabilistic_crdt_gtest.cpp
+++ b/CRDT/probabilistic_crdt_gtest.cpp
@@ -440,7 +440,7 @@ TEST(STHyperLogLog, VariousWindow70k)
 	success=st_hll_case_print(st_case, n_case);
 	EXPECT_EQ(success, n_case);
 }
-TEST(STHyperLogLog, Debug)
+TEST(STHyperLogLog, LinearCounting)
 {
 	struct st_hll_case st_case[64];
 	int i=0;
@@ -488,6 +488,20 @@ TEST(STHyperLogLog, Debug)
 	int success=st_hll_case_print(st_case, i);
 	EXPECT_EQ(success+1, i);
 }
+TEST(STHyperLogLog, Debug)
+{
+	struct st_hll_case st_case[64];
+	int i=0;
+	st_hll_case_init(st_case+i);
+	st_case[i].precision=14;
+	st_case[i].time_window_ms=300;
+	st_case[i].ideal_count=40000;
+	st_case[i].n_replica=1;
+	st_case[i].est_count=st_hll_test(st_case+i);
+	i++;
+	int success=st_hll_case_print(st_case, i);
+	EXPECT_EQ(success, i);
+}
 TEST(STHyperLogLog, Replicas)
 {
 	
@@ -686,7 +700,35 @@ TEST(STHyperLogLog, Step)
 	ST_hyperloglog_free(h);
 	return;
 }
-
+TEST(STHyperLogLog, Microscope)
+{
+	const char *key[2]={"192.168.41.12", "192.168.41.11"};
+	struct timeval start, step, now;
+	start.tv_sec=1000;
+	start.tv_usec=0;
+	memcpy(&now, &start, sizeof(now));
+	step.tv_sec=0;
+	step.tv_usec=10*1000;
+	struct ST_hyperloglog *h=ST_hyperloglog_new(7, 1000, start);
+	double estimated=0;
+	int passed=0, counted=0;
+	while(now.tv_sec-start.tv_sec<10)
+	{
+		timeradd(&now, &step, &now);
+		for(int i=0; i<2; i++)
+		{
+			ST_hyperloglog_add(h, key[i], strlen(key[i]), now);
+		}
+		estimated=ST_hyperloglog_count(h, now);
+		if(abs(estimated-2)<0.1)
+		{
+			passed++;
+		}
+		counted++;
+	}
+	EXPECT_EQ(passed, counted);
+	ST_hyperloglog_free(h);
+}
 void ap_bloom_sync(struct AP_bloom *list[], size_t n)
 {
 	char *blob=NULL;
diff --git a/CRDT/st_hyperloglog.c b/CRDT/st_hyperloglog.c
index 93bbb6e..70abf32 100644
--- a/CRDT/st_hyperloglog.c
+++ b/CRDT/st_hyperloglog.c
@@ -256,7 +256,7 @@ void a(void)
 */
 
 // Computes the raw cardinality estimate 
-static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero, int *effective_reg)
+static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero)
 {
     unsigned char precision = h->cfg.precision;
     int m = NUM_REG(precision);
@@ -265,31 +265,29 @@ static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero, int *e
     double lambda=0;
     double harmonic_mean=0;
     *num_zero=0;
-    *effective_reg=0;
+    int effective_reg=0;
     for (int i=0; i < m; i++) 
     {
-        Wi=(i>=h->reset_idx)?(m+h->reset_idx-i):(h->reset_idx-i)+1;
-        //neglect the 1/8 of the lowest registers to reduce the variance in the final evaluation.
-        if( Wi < m/8 && h->cfg.time_window_ms)
-        {
-            continue;
-        }
-        (*effective_reg)++;
+        Wi=(i>=h->reset_idx)?(m+h->reset_idx-i):(h->reset_idx-i)+1;  
         Mi = get_register(h->registers, i);
         if(!Mi)
             *num_zero += 1;
+        //STHLL++ neglects the 1/8 of the lowest registers to reduce the variance in the final evaluation.
+        if(Wi < m/8 && h->cfg.time_window_ms)
+        {
+            continue;
+        }
+        effective_reg++;
         lambda=pow(2.0, Mi-1)*m*m/Wi;
         harmonic_mean += 1/lambda;        
     }
-    harmonic_mean= (*effective_reg)/harmonic_mean;
+    harmonic_mean = effective_reg/harmonic_mean;
     return harmonic_mean;
 }
-
 static double st_hll_count(const struct ST_hyperloglog *h)
 {
-    
-    int num_zero=0, effective_reg=0;
-    double raw_est=raw_estimate(h, &num_zero, &effective_reg);
+    int num_zero=0;
+    double raw_est=raw_estimate(h, &num_zero);
     raw_est*=alpha(h->cfg.precision);
     double est=0;
     double num_reg=NUM_REG(h->cfg.precision);
@@ -300,11 +298,10 @@ static double st_hll_count(const struct ST_hyperloglog *h)
     {
         if(num_zero)
         {
-            est=linear_count(effective_reg, num_zero);
+            est=linear_count(num_reg, num_zero);
             assert(est>=0);
             return est;
         }
-            
     }
     if(raw_est>INT32_MAX/30)
     {
@@ -524,7 +521,6 @@ double ST_hyperloglog_count(const struct ST_hyperloglog *h, struct timeval now)
         est=st_hll_count(h);
         assert(est>=0);
     }
-    assert(est>=0);
     return est;
 }
 
diff --git a/CRDT/tb_crdt_gtest.cpp b/CRDT/tb_crdt_gtest.cpp
index d38b356..c409294 100644
--- a/CRDT/tb_crdt_gtest.cpp
+++ b/CRDT/tb_crdt_gtest.cpp
@@ -1489,8 +1489,8 @@ void bulk_token_bucket_test(struct btb_case *mycase)
 	}
 	index=1-sqrt(index/key_num);
 	struct bulk_token_bucket_info info;
-	EXPECT_NEAR(info.approximate_keys, key_num, key_num/5);
 	bulk_token_bucket_info(btb[0], now, &info);
+	EXPECT_NEAR(info.approximate_keys, key_num, key_num/5);
 	mycase->index=index;
 	mycase->approximate_keys=info.approximate_keys;
 	mycase->more=more;
diff --git a/readme.md b/readme.md
index 0b67a06..7a0c9b8 100644
--- a/readme.md
+++ b/readme.md
@@ -28,7 +28,11 @@ SwarmKV Data Types
 - Integer
 - Set
 - Hash Table
-- Token Bucket, Fair Token Bucket and Bulk Token Bucket
+- Token Buckets
+    - Generic Token Buckets
+    - Fair Token Bucket: Implements stochastic fairness allocation to ensure equitable resource distribution.
+    - Bulk Token Bucket: Optimized for scenarios requiring a large number of token buckets with identical configurations.
+- Bloom Filter with the ability to expire.
 - Count-min Sketch [Todo]
 - HyperLogLog [Todo]
author	郑超 <[email protected]>	2024-03-21 11:55:26 +0000
committer	郑超 <[email protected]>	2024-03-21 11:55:26 +0000
commit	7eebfad2cf995da5ebe5fee908a3c911c1ce730a (patch)
tree	a5444c31a1279797c7d236b5e756e8f907b1c87b
parent	6b49d228c93be4335aa95cd70a16c3517496554d (diff)