diff options
| author | 郑超 <[email protected]> | 2024-03-21 11:55:26 +0000 |
|---|---|---|
| committer | 郑超 <[email protected]> | 2024-03-21 11:55:26 +0000 |
| commit | 7eebfad2cf995da5ebe5fee908a3c911c1ce730a (patch) | |
| tree | a5444c31a1279797c7d236b5e756e8f907b1c87b | |
| parent | 6b49d228c93be4335aa95cd70a16c3517496554d (diff) | |
STHLL: The linear counting for small cardinalities should not ignore 1/8 of lowest registers.
| -rw-r--r-- | CRDT/cm_sketch.h | 1 | ||||
| -rw-r--r-- | CRDT/probabilistic_crdt_gtest.cpp | 46 | ||||
| -rw-r--r-- | CRDT/st_hyperloglog.c | 30 | ||||
| -rw-r--r-- | CRDT/tb_crdt_gtest.cpp | 2 | ||||
| -rw-r--r-- | readme.md | 6 |
5 files changed, 64 insertions, 21 deletions
diff --git a/CRDT/cm_sketch.h b/CRDT/cm_sketch.h index c106c58..38d77c3 100644 --- a/CRDT/cm_sketch.h +++ b/CRDT/cm_sketch.h @@ -4,6 +4,7 @@ * Author: [email protected] * Count-Min Sketch is a probabilistic data-structure that takes sub linear space to store the probable count, * or frequency, of occurrences of elements added into the data-structure. +* Most of the code is from https://github.com/barrust/count-min-sketch. */ #pragma once diff --git a/CRDT/probabilistic_crdt_gtest.cpp b/CRDT/probabilistic_crdt_gtest.cpp index 9c8bf14..8fd4de0 100644 --- a/CRDT/probabilistic_crdt_gtest.cpp +++ b/CRDT/probabilistic_crdt_gtest.cpp @@ -440,7 +440,7 @@ TEST(STHyperLogLog, VariousWindow70k) success=st_hll_case_print(st_case, n_case); EXPECT_EQ(success, n_case); } -TEST(STHyperLogLog, Debug) +TEST(STHyperLogLog, LinearCounting) { struct st_hll_case st_case[64]; int i=0; @@ -488,6 +488,20 @@ TEST(STHyperLogLog, Debug) int success=st_hll_case_print(st_case, i); EXPECT_EQ(success+1, i); } +TEST(STHyperLogLog, Debug) +{ + struct st_hll_case st_case[64]; + int i=0; + st_hll_case_init(st_case+i); + st_case[i].precision=14; + st_case[i].time_window_ms=300; + st_case[i].ideal_count=40000; + st_case[i].n_replica=1; + st_case[i].est_count=st_hll_test(st_case+i); + i++; + int success=st_hll_case_print(st_case, i); + EXPECT_EQ(success, i); +} TEST(STHyperLogLog, Replicas) { @@ -686,7 +700,35 @@ TEST(STHyperLogLog, Step) ST_hyperloglog_free(h); return; } - +TEST(STHyperLogLog, Microscope) +{ + const char *key[2]={"192.168.41.12", "192.168.41.11"}; + struct timeval start, step, now; + start.tv_sec=1000; + start.tv_usec=0; + memcpy(&now, &start, sizeof(now)); + step.tv_sec=0; + step.tv_usec=10*1000; + struct ST_hyperloglog *h=ST_hyperloglog_new(7, 1000, start); + double estimated=0; + int passed=0, counted=0; + while(now.tv_sec-start.tv_sec<10) + { + timeradd(&now, &step, &now); + for(int i=0; i<2; i++) + { + ST_hyperloglog_add(h, key[i], strlen(key[i]), now); + } + estimated=ST_hyperloglog_count(h, now); + if(abs(estimated-2)<0.1) + { + passed++; + } + counted++; + } + EXPECT_EQ(passed, counted); + ST_hyperloglog_free(h); +} void ap_bloom_sync(struct AP_bloom *list[], size_t n) { char *blob=NULL; diff --git a/CRDT/st_hyperloglog.c b/CRDT/st_hyperloglog.c index 93bbb6e..70abf32 100644 --- a/CRDT/st_hyperloglog.c +++ b/CRDT/st_hyperloglog.c @@ -256,7 +256,7 @@ void a(void) */ // Computes the raw cardinality estimate -static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero, int *effective_reg) +static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero) { unsigned char precision = h->cfg.precision; int m = NUM_REG(precision); @@ -265,31 +265,29 @@ static double raw_estimate(const struct ST_hyperloglog *h, int *num_zero, int *e double lambda=0; double harmonic_mean=0; *num_zero=0; - *effective_reg=0; + int effective_reg=0; for (int i=0; i < m; i++) { - Wi=(i>=h->reset_idx)?(m+h->reset_idx-i):(h->reset_idx-i)+1; - //neglect the 1/8 of the lowest registers to reduce the variance in the final evaluation. - if( Wi < m/8 && h->cfg.time_window_ms) - { - continue; - } - (*effective_reg)++; + Wi=(i>=h->reset_idx)?(m+h->reset_idx-i):(h->reset_idx-i)+1; Mi = get_register(h->registers, i); if(!Mi) *num_zero += 1; + //STHLL++ neglects the 1/8 of the lowest registers to reduce the variance in the final evaluation. + if(Wi < m/8 && h->cfg.time_window_ms) + { + continue; + } + effective_reg++; lambda=pow(2.0, Mi-1)*m*m/Wi; harmonic_mean += 1/lambda; } - harmonic_mean= (*effective_reg)/harmonic_mean; + harmonic_mean = effective_reg/harmonic_mean; return harmonic_mean; } - static double st_hll_count(const struct ST_hyperloglog *h) { - - int num_zero=0, effective_reg=0; - double raw_est=raw_estimate(h, &num_zero, &effective_reg); + int num_zero=0; + double raw_est=raw_estimate(h, &num_zero); raw_est*=alpha(h->cfg.precision); double est=0; double num_reg=NUM_REG(h->cfg.precision); @@ -300,11 +298,10 @@ static double st_hll_count(const struct ST_hyperloglog *h) { if(num_zero) { - est=linear_count(effective_reg, num_zero); + est=linear_count(num_reg, num_zero); assert(est>=0); return est; } - } if(raw_est>INT32_MAX/30) { @@ -524,7 +521,6 @@ double ST_hyperloglog_count(const struct ST_hyperloglog *h, struct timeval now) est=st_hll_count(h); assert(est>=0); } - assert(est>=0); return est; } diff --git a/CRDT/tb_crdt_gtest.cpp b/CRDT/tb_crdt_gtest.cpp index d38b356..c409294 100644 --- a/CRDT/tb_crdt_gtest.cpp +++ b/CRDT/tb_crdt_gtest.cpp @@ -1489,8 +1489,8 @@ void bulk_token_bucket_test(struct btb_case *mycase) } index=1-sqrt(index/key_num); struct bulk_token_bucket_info info; - EXPECT_NEAR(info.approximate_keys, key_num, key_num/5); bulk_token_bucket_info(btb[0], now, &info); + EXPECT_NEAR(info.approximate_keys, key_num, key_num/5); mycase->index=index; mycase->approximate_keys=info.approximate_keys; mycase->more=more; @@ -28,7 +28,11 @@ SwarmKV Data Types - Integer - Set - Hash Table -- Token Bucket, Fair Token Bucket and Bulk Token Bucket +- Token Buckets + - Generic Token Buckets + - Fair Token Bucket: Implements stochastic fairness allocation to ensure equitable resource distribution. + - Bulk Token Bucket: Optimized for scenarios requiring a large number of token buckets with identical configurations. +- Bloom Filter with the ability to expire. - Count-min Sketch [Todo] - HyperLogLog [Todo] |
