summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author郑超 <[email protected]>2024-03-21 11:55:26 +0000
committer郑超 <[email protected]>2024-03-21 11:55:26 +0000
commit7eebfad2cf995da5ebe5fee908a3c911c1ce730a (patch)
treea5444c31a1279797c7d236b5e756e8f907b1c87b
parent6b49d228c93be4335aa95cd70a16c3517496554d (diff)
STHLL: The linear counting for small cardinalities should not ignore 1/8 of lowest registers.
-rw-r--r--CRDT/cm_sketch.h1
-rw-r--r--CRDT/probabilistic_crdt_gtest.cpp46
-rw-r--r--CRDT/st_hyperloglog.c30
-rw-r--r--CRDT/tb_crdt_gtest.cpp2
-rw-r--r--readme.md6
5 files changed, 64 insertions, 21 deletions
diff --git a/CRDT/cm_sketch.h b/CRDT/cm_sketch.h
index c106c58..38d77c3 100644
--- a/CRDT/cm_sketch.h
+++ b/CRDT/cm_sketch.h
@@ -4,6 +4,7 @@
* Count-Min Sketch is a probabilistic data-structure that takes sub linear space to store the probable count,
* or frequency, of occurrences of elements added into the data-structure.
+* Most of the code is from https://github.com/barrust/count-min-sketch.
*/
#pragma once
diff --git a/CRDT/probabilistic_crdt_gtest.cpp b/CRDT/probabilistic_crdt_gtest.cpp
index 9c8bf14..8fd4de0 100644
--- a/CRDT/probabilistic_crdt_gtest.cpp
+++ b/CRDT/probabilistic_crdt_gtest.cpp
@@ -440,7 +440,7 @@ TEST(STHyperLogLog, VariousWindow70k)
success=st_hll_case_print(st_case, n_case);
EXPECT_EQ(success, n_case);
}
-TEST(STHyperLogLog, Debug)
+TEST(STHyperLogLog, LinearCounting)
{
struct st_hll_case st_case[64];
int i=0;
@@ -488,6 +488,20 @@ TEST(STHyperLogLog, Debug)
int success=st_hll_case_print(st_case, i);
EXPECT_EQ(success+1, i);
}
+TEST(STHyperLogLog, Debug)
+{
+ struct st_hll_case st_case[64];
+ int i=0;
+ st_hll_case_init(st_case+i);
+ st_case[i].precision=14;
+ st_case[i].time_window_ms=300;
+ st_case[i].ideal_count=40000;
+ st_case[i].n_replica=1;
+ st_case[i].est_count=st_hll_test(st_case+i);
+ i++;
+ int success=st_hll_case_print(st_case, i);
+ EXPECT_EQ(success, i);
+}
TEST(STHyperLogLog, Replicas)
{
@@ -686,7 +700,35 @@ TEST(STHyperLogLog, Step)
ST_hyperloglog_free(h);
return;
}
-
+TEST(STHyperLogLog, Microscope)
+{
+ const char *key[2]={"192.168.41.12", "192.168.41.11"};
+ struct timeval start, step, now;
+ start.tv_sec=1000;
+ start.tv_usec=0;
+ memcpy(&now, &start, sizeof(now));
+ step.tv_sec=0;
+ step.tv_usec=10*1000;
+ struct ST_hyperloglog *h=ST_hyperloglog_new(7, 1000, start);
+ double estimated=0;
+ int passed=0, counted=0;
+ while(now.tv_sec-start.tv_sec<10)
+ {
+ timeradd(&now, &step, &now);
+ for(int i=0; i<2; i++)
+ {
+ ST_hyperloglog_add(h, key[i], strlen(key[i]), now);
+ }
+ estimated=ST_hyperloglog_count(h, now);
+ if(abs(estimated-2)<0.1)
+ {
+ passed++;
+ }
+ counted++;
+ }
+ EXPECT_EQ(passed, counted);
+ ST_hyperloglog_free(h);
+}
void ap_bloom_sync(struct AP_bloom *list[], size_t n)
{
char *blob=NULL;
diff --git a/CRDT/st_hyperloglog.c b/CRDT/st_hyperloglog.c
index 93bbb6e..70abf32 100644
--- a/CRDT/st_hyperloglog.c
+++ b/CRDT/st_hyperloglog.c
@@ -256,7 +256,7 @@ void a(void)
*/
// Computes the raw cardinality estimate
-static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero, int *effective_reg)
+static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero)
{
unsigned char precision = h->cfg.precision;
int m = NUM_REG(precision);
@@ -265,31 +265,29 @@ static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero, int *e
double lambda=0;
double harmonic_mean=0;
*num_zero=0;
- *effective_reg=0;
+ int effective_reg=0;
for (int i=0; i < m; i++)
{
- Wi=(i>=h->reset_idx)?(m+h->reset_idx-i):(h->reset_idx-i)+1;
- //neglect the 1/8 of the lowest registers to reduce the variance in the final evaluation.
- if( Wi < m/8 && h->cfg.time_window_ms)
- {
- continue;
- }
- (*effective_reg)++;
+ Wi=(i>=h->reset_idx)?(m+h->reset_idx-i):(h->reset_idx-i)+1;
Mi = get_register(h->registers, i);
if(!Mi)
*num_zero += 1;
+ //STHLL++ neglects the 1/8 of the lowest registers to reduce the variance in the final evaluation.
+ if(Wi < m/8 && h->cfg.time_window_ms)
+ {
+ continue;
+ }
+ effective_reg++;
lambda=pow(2.0, Mi-1)*m*m/Wi;
harmonic_mean += 1/lambda;
}
- harmonic_mean= (*effective_reg)/harmonic_mean;
+ harmonic_mean = effective_reg/harmonic_mean;
return harmonic_mean;
}
-
static double st_hll_count(const struct ST_hyperloglog *h)
{
-
- int num_zero=0, effective_reg=0;
- double raw_est=raw_estimate(h, &num_zero, &effective_reg);
+ int num_zero=0;
+ double raw_est=raw_estimate(h, &num_zero);
raw_est*=alpha(h->cfg.precision);
double est=0;
double num_reg=NUM_REG(h->cfg.precision);
@@ -300,11 +298,10 @@ static double st_hll_count(const struct ST_hyperloglog *h)
{
if(num_zero)
{
- est=linear_count(effective_reg, num_zero);
+ est=linear_count(num_reg, num_zero);
assert(est>=0);
return est;
}
-
}
if(raw_est>INT32_MAX/30)
{
@@ -524,7 +521,6 @@ double ST_hyperloglog_count(const struct ST_hyperloglog *h, struct timeval now)
est=st_hll_count(h);
assert(est>=0);
}
- assert(est>=0);
return est;
}
diff --git a/CRDT/tb_crdt_gtest.cpp b/CRDT/tb_crdt_gtest.cpp
index d38b356..c409294 100644
--- a/CRDT/tb_crdt_gtest.cpp
+++ b/CRDT/tb_crdt_gtest.cpp
@@ -1489,8 +1489,8 @@ void bulk_token_bucket_test(struct btb_case *mycase)
}
index=1-sqrt(index/key_num);
struct bulk_token_bucket_info info;
- EXPECT_NEAR(info.approximate_keys, key_num, key_num/5);
bulk_token_bucket_info(btb[0], now, &info);
+ EXPECT_NEAR(info.approximate_keys, key_num, key_num/5);
mycase->index=index;
mycase->approximate_keys=info.approximate_keys;
mycase->more=more;
diff --git a/readme.md b/readme.md
index 0b67a06..7a0c9b8 100644
--- a/readme.md
+++ b/readme.md
@@ -28,7 +28,11 @@ SwarmKV Data Types
- Integer
- Set
- Hash Table
-- Token Bucket, Fair Token Bucket and Bulk Token Bucket
+- Token Buckets
+ - Generic Token Buckets
+ - Fair Token Bucket: Implements stochastic fairness allocation to ensure equitable resource distribution.
+ - Bulk Token Bucket: Optimized for scenarios requiring a large number of token buckets with identical configurations.
+- Bloom Filter with the ability to expire.
- Count-min Sketch [Todo]
- HyperLogLog [Todo]