diff options
Diffstat (limited to 'bindings/rs-dablooms')
28 files changed, 3507 insertions, 0 deletions
diff --git a/bindings/rs-dablooms/.gitignore b/bindings/rs-dablooms/.gitignore new file mode 100644 index 0000000..a0fb2cd --- /dev/null +++ b/bindings/rs-dablooms/.gitignore @@ -0,0 +1,2 @@ +dablooms/build/ +/target diff --git a/bindings/rs-dablooms/Cargo.toml b/bindings/rs-dablooms/Cargo.toml new file mode 100644 index 0000000..65e0441 --- /dev/null +++ b/bindings/rs-dablooms/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "rs-dablooms" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2.0" +bindgen = "0.68.1"
\ No newline at end of file diff --git a/bindings/rs-dablooms/README.md b/bindings/rs-dablooms/README.md new file mode 100644 index 0000000..22f6a9b --- /dev/null +++ b/bindings/rs-dablooms/README.md @@ -0,0 +1,67 @@ +# README + +Rust binding for [dablooms](https://git.mesalab.cn/tango/dablooms). + +Provides 3 types of Bloom filters: CountingBloomFilter, ScalingBloomFilter, ExpiryBloomFilter. + +Each type of Bloom filter supports 3 basic operations: new, add, remove, and check. +- ExpiryBloomFilter does not provide remove, and relies on expiration time to clean up data. + +The key value is required to implement the `AsRef<[u8]>` trait. +- In Rust, a `&str` like "hello world" implements `AsRef<[u8]>` by default, so it can be used directly. + +Usage example + +```rust +// CountingBloomFilter +let mut bloom = CountingBloomFilter::new(1000, 0.01).unwrap(); +let key1 = "hello world"; + +assert!(bloom.check(key1) == false); +bloom.add(key1).unwrap(); +assert!(bloom.check(key) == true); +bloom.remove(key1); +assert!(bloom.check(key1) == false); + +// ScalingBloomFilter +let mut bloom = ScalingBloomFilter::new(100, 0.05).unwrap(); +let key1 = "aaa"; +let id1 = 1; + +assert!(bloom.check(key1) == false); +bloom.add(key1, id1).unwrap(); +assert!(bloom.check(key1) == true); +bloom.remove(key1, id1).unwrap(); +assert!(bloom.check(key1) == false); + +// ExpiryBloomFilter +let capacity: u32 = 100; +let error_rate: f64 = 0.05; +let expiry_secs: i32 = 3; // expiry time 3s +let key1 = "aaa"; + +let mut bloom = ExpiryBloomFilter::new(capacity, error_rate, expiry_secs).unwrap(); + +bloom.add(key1).unwrap(); +assert!(bloom.count() == 1); +assert!(bloom.check(key1)); + +unsafe { + sleep(2); // sleep 2 sec | all key1's value not expired +} +assert!(bloom.check(key1)); +unsafe { + sleep(2); // sleep 4 sec | all key1's value expired +} +assert!(!bloom.check(key1)); +``` + +## Oher API + +ScalingBloomFilter +- flush: +- mem_seqnum +- disk_seqnum + +ExpiryBloomFilter +- count diff --git a/bindings/rs-dablooms/README.zh_cn.md b/bindings/rs-dablooms/README.zh_cn.md new file mode 100644 index 0000000..a6a6e79 --- /dev/null +++ b/bindings/rs-dablooms/README.zh_cn.md @@ -0,0 +1,67 @@ +# README + +[dablooms](https://git.mesalab.cn/tango/dablooms) 的 Rust binding. + +提供了 3 种 布隆过滤器: CountingBloomFilter ScalingBloomFilter ExpiryBloomFilter + +每种布隆过滤器 分别支持 new add remove check 三种基本操作. +- ExpiryBloomFilter 不提供 remove, 其依靠 过期时间清理数据. + +key 值要求实现了 `AsRef<[u8]>` trait. +- rust 中类似 "hello world" 的 `&str` 默认实现了 `AsRef<[u8]>`,因此可以直接使用 + +使用示例 + +```rust +// CountingBloomFilter +let mut bloom = CountingBloomFilter::new(1000, 0.01).unwrap(); +let key1 = "hello world"; + +assert!(bloom.check(key1) == false); +bloom.add(key1).unwrap(); +assert!(bloom.check(key) == true); +bloom.remove(key1); +assert!(bloom.check(key1) == false); + +// ScalingBloomFilter +let mut bloom = ScalingBloomFilter::new(100, 0.05).unwrap(); +let key1 = "aaa"; +let id1 = 1; + +assert!(bloom.check(key1) == false); +bloom.add(key1, id1).unwrap(); +assert!(bloom.check(key1) == true); +bloom.remove(key1, id1).unwrap(); +assert!(bloom.check(key1) == false); + +// ExpiryBloomFilter +let capacity: u32 = 100; +let error_rate: f64 = 0.05; +let expiry_secs: i32 = 3; // expiry time 3s +let key1 = "aaa"; + +let mut bloom = ExpiryBloomFilter::new(capacity, error_rate, expiry_secs).unwrap(); + +bloom.add(key1).unwrap(); +assert!(bloom.count() == 1); +assert!(bloom.check(key1)); + +unsafe { + sleep(2); // sleep 2 sec | all key1's value not expired +} +assert!(bloom.check(key1)); +unsafe { + sleep(2); // sleep 4 sec | all key1's value expired +} +assert!(!bloom.check(key1)); +``` + +## 其他API + +ScalingBloomFilter +- flush: +- mem_seqnum +- disk_seqnum + +ExpiryBloomFilter +- count diff --git a/bindings/rs-dablooms/build.rs b/bindings/rs-dablooms/build.rs new file mode 100644 index 0000000..9d9f1d2 --- /dev/null +++ b/bindings/rs-dablooms/build.rs @@ -0,0 +1,37 @@ +use std::process::Command;
+
+fn main() {
+ // Compile C source file
+ let output = Command::new("make")
+ // .arg("so") // 生成动态库
+ .arg("install") // install
+ .current_dir("./dablooms")
+ .output()
+ .expect("Failed to compile C source file");
+
+ if !output.status.success() {
+ panic!(
+ "Failed to compile C source file: {}",
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+
+ let output = Command::new("bash")
+ .arg("-c")
+ .arg("ldconfig")
+ .output()
+ .expect("Failed to execute command");
+
+ if output.status.success() {
+ println!(
+ "Command output: {}",
+ String::from_utf8_lossy(&output.stdout)
+ );
+ } else {
+ println!("Command error: {}", String::from_utf8_lossy(&output.stderr));
+ }
+
+ /* set the library search path for marsio */
+ // println!("cargo:rustc-link-search=native=/opt/tsg/mrzcpd/znver1/lib/");
+ println!("cargo:rustc-link-lib=static=dablooms");
+}
diff --git a/bindings/rs-dablooms/dablooms/CMakeLists.txt b/bindings/rs-dablooms/dablooms/CMakeLists.txt new file mode 100644 index 0000000..f9010c4 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.10) +project(dabloom) +add_definitions(-fPIC) +add_library(libdabloom-static STATIC src/dablooms.c src/murmur.c) +add_library(libdabloom SHARED src/dablooms.c src/murmur.c)
\ No newline at end of file diff --git a/bindings/rs-dablooms/dablooms/LICENSE b/bindings/rs-dablooms/dablooms/LICENSE new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/LICENSE @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/bindings/rs-dablooms/dablooms/Makefile b/bindings/rs-dablooms/dablooms/Makefile new file mode 100644 index 0000000..51cec38 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/Makefile @@ -0,0 +1,179 @@ +HELPTEXT = "\ +\n dablooms Makefile usage \ +\n \ +\n Options (current value) \ +\n \ +\n BLDDIR ($(BLDDIR)) \ +\n DESTDIR ($(DESTDIR)) \ +\n prefix ($(prefix)) \ +\n libdir ($(libdir)) \ +\n includedir ($(includedir)) \ +\n \ +\n CC ($(CC)) \ +\n CFLAGS ($(ALL_CFLAGS)) \ +\n LDFLAGS ($(LDFLAGS)) \ +\n INSTALL ($(INSTALL)) \ +\n \ +\n PYTHON ($(PYTHON)) \ +\n PY_MOD_DIR ($(PY_MOD_DIR)) \ +\n \ +\n Targets \ +\n \ +\n all (c libdablooms) \ +\n install \ +\n test \ +\n clean \ +\n help \ +\n \ +\n\n" +# \n pydablooms \ +# \n install_pydablooms \ +# \n test_pydablooms \ +\n\n" + +prefix = /usr/local +libdir = $(prefix)/lib +includedir = $(prefix)/include +DESTDIR = +BLDDIR = build + +CFLAGS = -g -Wall -O2 +LDFLAGS = +ALL_CFLAGS = -fPIC $(CFLAGS) +LIBS = -lm + +INSTALL = install +CC = gcc +AR = ar + +### dynamic shared object ### + +# shared-object version does not follow software release version +SO_VER_MAJOR = 1 +SO_VER_MINOR = 1 + +SO_VER = $(SO_VER_MAJOR).$(SO_VER_MINOR) +SO_NAME = so +SO_CMD = -soname +SO_EXT_MAJOR = $(SO_NAME).$(SO_VER_MAJOR) +SO_EXT = $(SO_NAME).$(SO_VER) +UNAME := $(shell uname -s) +ifeq ($(UNAME),Darwin) + SO_NAME = dylib + SO_CMD = -install_name + SO_EXT_MAJOR = $(SO_VER_MAJOR).$(SO_NAME) + SO_EXT = $(SO_VER).$(SO_NAME) +endif +SHARED_LDFLAGS = -shared -Wl,$(SO_CMD),libdablooms.$(SO_EXT_MAJOR) + +### sources and outputs ### + +SRCS_LIBDABLOOMS = dablooms.c murmur.c +SRCS_TESTS = test_dablooms.c + +OBJS_LIBDABLOOMS = $(patsubst %.c, $(BLDDIR)/%.o, $(SRCS_LIBDABLOOMS)) +OBJS_TESTS = $(patsubst %.c, $(BLDDIR)/%.o, $(SRCS_TESTS)) + +LIB_SYMLNKS = libdablooms.$(SO_NAME) libdablooms.$(SO_EXT_MAJOR) +LIB_FILES = libdablooms.a libdablooms.$(SO_EXT) $(LIB_SYMLNKS) + +# for tests +WORDS = /usr/share/dict/words + +### rules ### + +# default target (needs to be first target) +all: libdablooms + +# sort removes duplicates +DEPS := $(sort $(patsubst %.o, %.o.deps, $(OBJS_LIBDABLOOMS) $(OBJS_TESTS))) +-include $(DEPS) + +libdablooms: $(patsubst %, $(BLDDIR)/%, $(LIB_FILES)) + +install: install_libdablooms + +install_libdablooms: $(patsubst %, $(DESTDIR)$(libdir)/%, $(LIB_FILES)) $(DESTDIR)$(includedir)/dablooms.h + +$(DESTDIR)$(libdir)/libdablooms.a: $(BLDDIR)/libdablooms.a + +$(DESTDIR)$(libdir)/libdablooms.$(SO_EXT): $(BLDDIR)/libdablooms.$(SO_EXT) + +$(patsubst %, $(DESTDIR)$(libdir)/%, $(LIB_SYMLNKS)): %: $(DESTDIR)$(libdir)/libdablooms.$(SO_EXT) + @echo " SYMLNK " $@ + @$(INSTALL) -d $(dir $@) + @ln -fs $(notdir $<) $@ + +$(DESTDIR)$(includedir)/dablooms.h: src/dablooms.h + +$(DESTDIR)$(prefix)/%: + @echo " INSTALL " $@ + @$(INSTALL) -d $(dir $@) + @$(INSTALL) $< $@ + +$(BLDDIR)/%.o: src/%.c + @echo " CC " $@ + @mkdir -p $(dir $@) + @$(CC) -o $@ -c $< $(ALL_CFLAGS) -MMD -MF [email protected] + +$(BLDDIR)/libdablooms.a: $(OBJS_LIBDABLOOMS) + @echo " AR " $@ + @rm -f $@ + @$(AR) rcs $@ $^ + +$(BLDDIR)/libdablooms.$(SO_EXT): $(OBJS_LIBDABLOOMS) + @echo " SO " $@ + @$(CC) -o $@ $(ALL_CFLAGS) $(SHARED_LDFLAGS) $(LDFLAGS) $^ $(LIBS) + +$(patsubst %, $(BLDDIR)/%, $(LIB_SYMLNKS)): %: $(BLDDIR)/libdablooms.$(SO_EXT) + @echo " SYMLNK " $@ + @mkdir -p $(dir $@) + @ln -fs $(notdir $<) $@ + +$(BLDDIR)/test_dablooms: $(OBJS_TESTS) $(BLDDIR)/libdablooms.a + @echo " LD " $@ + @$(CC) -o $@ $(ALL_CFLAGS) $(LDFLAGS) $(OBJS_TESTS) $(BLDDIR)/libdablooms.a $(LIBS) + +test: $(BLDDIR)/test_dablooms + @$(BLDDIR)/test_dablooms $(BLDDIR)/testbloom.bin $(WORDS) + +help: + @printf $(HELPTEXT) + +clean: + rm -f $(DEPS) $(OBJS_LIBDABLOOMS) $(patsubst %, $(BLDDIR)/%, $(LIB_FILES)) $(OBJS_TESTS) $(BLDDIR)/test_dablooms $(BLDDIR)/testbloom.bin + -rmdir $(BLDDIR) + +# .PHONY: all clean help install test libdablooms install_libdablooms + +### pydablooms ### + +# PYTHON = python +# PY_BLDDIR = $(BLDDIR)/python +# PY_MOD_DIR_ARG = # optional: --user or --system +# PY_MOD_DIR := $(shell $(PYTHON) pydablooms/modpath.py $(PY_MOD_DIR_ARG)) +# PY_FLAGS = --build-lib=$(PY_BLDDIR) --build-temp=$(PY_BLDDIR) +# PY_BLD_ENV = BLDDIR="$(BLDDIR)" + +# pydablooms: $(PY_BLDDIR)/pydablooms.so + +# install_pydablooms: $(DESTDIR)$(PY_MOD_DIR)/pydablooms.so + +# $(DESTDIR)$(PY_MOD_DIR)/pydablooms.so: $(PY_BLDDIR)/pydablooms.so +# @echo " PY_INSTALL " $@ +# @$(INSTALL) -d $(dir $@) +# @$(INSTALL) $< $@ + +# $(PY_BLDDIR)/pydablooms.so: pydablooms/pydablooms.c src/dablooms.c src/murmur.c +# @echo " PY_BUILD" $@ +# @$(PY_BLD_ENV) $(PYTHON) pydablooms/setup.py build $(PY_FLAGS) >/dev/null + +# test_pydablooms: pydablooms +# @PYTHONPATH=$(PY_BLDDIR) $(PYTHON) pydablooms/test_pydablooms.py $(BLDDIR)/testbloom_py.bin $(WORDS) + +# clean: clean_pydablooms +# clean_pydablooms: +# rm -f $(BLDDIR)/pydablooms.so $(BLDDIR)/testbloom_py.bin +# $(PYTHON) pydablooms/setup.py clean $(PY_FLAGS) + +# .PHONY: pydablooms install_pydablooms test_pydablooms clean_pydablooms diff --git a/bindings/rs-dablooms/dablooms/README.md b/bindings/rs-dablooms/dablooms/README.md new file mode 100644 index 0000000..d606233 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/README.md @@ -0,0 +1,259 @@ +Dablooms: A Scalable, Counting, Bloom Filter +---------------------------------- + +_Note_: this project has been mostly unmaintained for a while. + +### Overview +This project aims to demonstrate a novel Bloom filter implementation that can +scale, and provide not only the addition of new members, but reliable removal +of existing members. + +Bloom filters are a probabilistic data structure that provide space-efficient +storage of elements at the cost of possible false positive on membership +queries. + +**dablooms** implements such a structure that takes additional metadata to classify +elements in order to make an intelligent decision as to which Bloom filter an element +should belong. + +### Features +**dablooms**, in addition to the above, has several features. + +* Implemented as a static C library +* Memory mapped +* 4 bit counters +* Sequence counters for clean/dirty checks +* Python wrapper + +For performance, the low-level operations are implemented in C. It is also +memory mapped which provides async flushing and persistence at low cost. +In an effort to maintain memory efficiency, rather than using integers, or +even whole bytes as counters, we use only four bit counters. These four bit +counters allow up to 15 items to share a counter in the map. If more than a +small handful are sharing said counter, the Bloom filter would be overloaded +(resulting in excessive false positives anyway) at any sane error rate, so +there is no benefit in supporting larger counters. + +The Bloom filter also employs change sequence numbers to track operations performed +on the Bloom filter. These allow the application to determine if a write might have +only partially completed (perhaps due to a crash), leaving the filter in an +inconsistent state. The application can thus determine if a filter is ok or needs +to be recreated. The sequence number can be used to determine what a consistent but +out-of-date filter missed, and bring it up-to-date. + +There are two sequence numbers (and helper functions to get them): "mem_seqnum" and +"disk_seqnum". The "mem" variant is useful if the user is sure the OS didn't crash, +and the "disk" variant is useful if the OS might have crashed since the Bloom filter +was last changed. Both values could be "0", meaning the filter is possibly +inconsistent from their point of view, or a non-zero sequence number that the filter +is consistent with. The "mem" variant is often non-zero, but the "disk" variant only +becomes non-zero right after a (manual) flush. This can be expensive (it's an fsync), +so the value can be ignored if not relevant for the application. For example, if the +Bloom file exists in a directory which is cleared at boot (like `/tmp`), then the +application can safely assume that any existing file was not affected by an OS crash, +and never bother to flush or check disk_seqnum. Schemes involving batching up changes +are also possible. + +The dablooms library is not inherently thread safe, this is the clients responsibility. +Bindings are also not thread safe, unless they state otherwise. + +### Installing +Clone the repo, or download and extract a tarball of a tagged version +[from github](https://github.com/bitly/dablooms/tags). +In the source tree, type `make`, `make install` (`sudo` may be needed). +This will only install static and dynamic versions of the C dablooms library "libdablooms". + +To use a specific build directory, install prefix, or destination directory for packaging, +specify `BLDDIR`, `prefix`, or `DESTDIR` to make. For example: +`make install BLDDIR=/tmp/dablooms/bld DESTDIR=/tmp/dablooms/pkg prefix=/usr` + +Look at the output of `make help` for more options and targets. + +Also available are bindings for various other languages: + +#### Python (pydablooms) +To install the Python bindings "pydablooms" (currently only compatibly with python 2.x) +run `make pydablooms`, `make install_pydablooms` (`sudo` may be needed). + +To use and install for a specific version of Python installed on your system, +use the `PYTHON` option to make. For example: `make install_pydablooms PYTHON=python2.7`. +You can override the module install location with the `PY_MOD_DIR` option to make, +and the `BLDDIR` and `DESTDIR` options also affect pydablooms. + +The Makefile attempts to determine the python module location `PY_MOD_DIR` +automatically. It prefers a location in `/usr/local`, but you can specify +`PY_MOD_DIR_ARG=--user` to try to use the location which `pip install --user` +would use in your HOME dir. You can instead specify `PY_MOD_DIR_ARG=--system` +to prefer the normal/central system python module dir. + +See pydablooms/README.md for more info. + +#### Go (godablooms) +The Go bindings "godablooms" are not integrated into the Makefile. +Install libdablooms first, then look at `godablooms/README.md` + +### Contributing +If you make changes to C portions of dablooms which you would like merged into the +upstream repository, it would help to have your code match our C coding style. We use +[astyle](http://astyle.sourceforge.net/), svn rev 353 or later, on our code, with the +following options: + + astyle --style=1tbs --lineend=linux --convert-tabs --preserve-date \ + --fill-empty-lines --pad-header --indent-switches \ + --align-pointer=name --align-reference=name --pad-oper -n <files> + +### Testing +To run a quick and dirty test, type `make test`. This test uses a list of words +and defaults to `/usr/share/dict/words`. If your path differs, you can use the +`WORDS` flag to specific its location, such as `make test WORDS=/usr/dict/words`. + +This will run a simple test that iterates through a word list and +adds each word to dablooms. It iterates again, removing every fifth +element. Lastly, it saves the file, opens a new filter, and iterates a third time +checking the existence of each word. It prints results of the true negatives, +false positives, true positives, and false negatives, and the false positive rate. + +The false positive rate is calculated by "false positives / (false positivies + true negatives)". +That is, what rate of real negatives are false positives. This is the interesting +statistic because the rate of false negatives should always be zero. + +The test uses a maximum error rate of .05 (5%) and an initial capacity of 100k. If +the dictionary is near 500k, we should have created 4 new filters in order to scale to size. + +A second test adds every other word in the list, and removes no words, causing each +used filter to stay at maximum capacity, which is a worse case for accuracy. + +Check out the performance yourself, and checkout the size of the resulting file! + +## Bloom Filter Basics +Bloom filters are probabilistic data structures that provide +space-efficient storage of elements at the cost of occasional false positives on +membership queries, i.e. a Bloom filter may state true on query when it in fact does +not contain said element. A Bloom filter is traditionally implemented as an array of +`M` bits, where `M` is the size of the Bloom filter. On initialization all bits are +set to zero. A filter is also parameterized by a constant `k` that defines the number +of hash functions used to set and test bits in the filter. Each hash function should +output one index in `M`. When inserting an element `x` into the filter, the bits +in the `k` indices `h1(x), h2(x), ..., hk(X)` are set. + +In order to query a Bloom filter, say for element `x`, it suffices to verify if +all bits in indices `h1(x), h2(x), ..., hk(x)` are set. If one or more of these +bits is not set then the queried element is definitely not present in the +filter. However, if all these bits are set, then the element is considered to +be in the filter. Given this procedure, an error probability exists for positive +matches, since the tested indices might have been set by the insertion of other +elements. + +### Counting Bloom Filters: Solving Removals +The same property that results in false positives *also* makes it +difficult to remove an element from the filter as there is no +easy means of discerning if another element is hashed to the same bit. +Unsetting a bit that is hashed by multiple elements can cause **false +negatives**. Using a counter, instead of a bit, can circumvent this issue. +The bit can be incremented when an element is hashed to a +given location, and decremented upon removal. Membership queries rely on whether a +given counter is greater than zero. This reduces the exceptional +space-efficiency provided by the standard Bloom filter. + +### Scalable Bloom Filters: Solving Scale +Another important property of a Bloom filter is its linear relationship between size +and storage capacity. If the maximum allowable error probability and the number of elements to store +are both known, it is relatively straightforward to dimension an appropriate +filter. However, it is not always possible to know how many elements +will need to be stored a priori. There is a trade off between over-dimensioning filters or +suffering from a ballooning error probability as it fills. + +Almeida, Baquero, Preguiça, Hutchison published a paper in 2006, on +[Scalable Bloom Filters](http://www.sciencedirect.com/science/article/pii/S0020019006003127), +which suggested a means of scalable Bloom filters by creating essentially +a list of Bloom filters that act as one large Bloom filter. When greater +capacity is desired, a new filter is added to the list. + +Membership queries are conducted on each filter with the positives +evaluated if the element is found in any one of the filters. Naively, this +leads to an increasing compounding error probability since the probability +of the given structure evaluates to: + + 1 - 𝚺(1 - P) + +It is possible to bound this error probability by adding a reducing tightening +ratio, `r`. As a result, the bounded error probability is represented as: + + 1 - 𝚺(1 - P0 * r^i) where r is chosen as 0 < r < 1 + +Since size is simply a function of an error probability and capacity, any +array of growth functions can be applied to scale the size of the Bloom filter +as necessary. We found it sufficient to pick .9 for `r`. + +## Problems with Mixing Scalable and Counting Bloom Filters +Scalable Bloom filters do not allow for the removal of elements from the filter. +In addition, simply converting each Bloom filter in a scalable Bloom filter into +a counting filter also poses problems. Since an element can be in any filter, and +Bloom filters inherently allow for false positives, a given element may appear to +be in two or more filters. If an element is inadvertently removed from a filter +which did not contain it, it would introduce the possibility of **false negatives**. + +If however, an element can be removed from the correct filter, it maintains +the integrity of said filter, i.e. prevents the possibility of false negatives. Thus, +a scaling, counting, Bloom filter is possible if upon additions and deletions +one can correctly decide which Bloom filter contains the element. + +There are several advantages to using a Bloom filter. A Bloom filter gives the +application cheap, memory efficient set operations, with no actual data stored +about the given element. Rather, Bloom filters allow the application to test, +with some given error probability, the membership of an item. This leads to the +conclusion that the majority of operations performed on Bloom filters are the +queries of membership, rather than the addition and removal of elements. Thus, +for a scaling, counting, Bloom filter, we can optimize for membership queries at +the expense of additions and removals. This expense comes not in performance, +but in the addition of more metadata concerning an element and its relation to +the Bloom filter. With the addition of some sort of identification of an +element, which does not need to be unique as long as it is fairly distributed, it +is possible to correctly determine which filter an element belongs to, thereby able +to maintain the integrity of a given Bloom filter with accurate additions +and removals. + +## Enter dablooms +dablooms is one such implementation of a scaling, counting, Bloom filter that takes +additional metadata during additions and deletions in the form of a (generally) +monotonically increasing integer to classify elements (possibly a timestamp). +This is used during additions/removals to easily determine the correct Bloom filter +for an element (each filter is assigned a range). Checking an item against the Bloom +filter, which is assumed to be the dominant activity, does not use the id (it works +like a normal scaling Bloom filter). + +dablooms is designed to scale itself using these identifiers and the given capacity. +When a Bloom filter is at capacity, dablooms will create a new Bloom filter which +starts at the next id after the greatest id of the previous Bloom filter. Given the +fact that the identifiers monotonically increase, new elements will be added to the +newest Bloom filter. Note, in theory and as implemented, nothing prevents one from +adding an element to any "older" filter. You just run the increasing risk of the +error probability growing beyond the bound as it becomes "overfilled". + +You can then remove any element from any Bloom filter using the identifier to intelligently +pick which Bloom filter to remove from. Consequently, as you continue to remove elements +from Bloom filters that you are not continuing to add to, these Bloom filters will become +more accurate. + +The "id" of an element does not need to be known to check the Bloom filter, but does need +to be known when the element is removed (and the same as when it was added). This might +be convenient if the item already has an appropriate id (almost always increasing for new +items) associated with it. + +### Example use case +There is a database with a collection of entries. There is a series of items, each of which +you want to look up in the database; most will have no entry in the database, but some +will. Perhaps it's a database of spam links. If you use dablooms in front of the database, +you can avoid needing to check the database for almost all items which won't be found in +it anyway, and save a lot of time and effort. It's also much easier to distribute the +Bloom filter than the entire database. But to make it work, you need to determine an "id" +whenever you add to or remove from the Bloom filter. You could store the timestamp when +you add the item to the database as another column in the database, and give it to +`scaling_bloom_add()` as well. When you remove the item, you look it up in the database +first and pass the timestamp stored there to `scaling_bloom_remove()`. The timestamps for +new items will be equal or greater, and definitely greater over time. Instead of +timestamps, you could also use an auto-incrementing index. Checks against the Bloom +don't need to know the id and should be quick. If a check comes back negative, you can be +sure the item isn't in the database, and skip that query completely. If a check comes +back positive, you have to query the database, because there's a slight chance that the +item isn't actually in there. diff --git a/bindings/rs-dablooms/dablooms/godablooms/README.md b/bindings/rs-dablooms/dablooms/godablooms/README.md new file mode 100644 index 0000000..971e6cc --- /dev/null +++ b/bindings/rs-dablooms/dablooms/godablooms/README.md @@ -0,0 +1,15 @@ +godablooms +========== + +For the Go package you can install outside of `make` via: + + $ go get github.com/bitly/dablooms/godablooms + +However, we recommend using [go-install-as](https://github.com/mreiferson/go-install-as): + + $ go tool install_as --import-as=bitly/dablooms + +To run tests: + + $ go test + diff --git a/bindings/rs-dablooms/dablooms/godablooms/dablooms.go b/bindings/rs-dablooms/dablooms/godablooms/dablooms.go new file mode 100644 index 0000000..14fb63d --- /dev/null +++ b/bindings/rs-dablooms/dablooms/godablooms/dablooms.go @@ -0,0 +1,73 @@ +package dablooms + +/* +#cgo LDFLAGS: -ldablooms + +#include <stdlib.h> +#include <dablooms.h> +*/ +import "C" + +import ( + "unsafe" +) + +func Version() string { + return "0.9.0" +} + +type ScalingBloom struct { + cfilter *C.scaling_bloom_t +} + +func NewScalingBloom(capacity C.uint, errorRate C.double, filename string) *ScalingBloom { + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + sb := &ScalingBloom{ + cfilter: C.new_scaling_bloom(capacity, errorRate, cFilename), + } + return sb +} + +func NewScalingBloomFromFile(capacity C.uint, errorRate C.double, filename string) *ScalingBloom { + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + sb := &ScalingBloom{ + cfilter: C.new_scaling_bloom_from_file(capacity, errorRate, cFilename), + } + return sb +} + +// apparently this is an unsupported feature of cgo +// we should probably use runtime.SetFinalizer +// see: https://groups.google.com/forum/?fromgroups#!topic/golang-dev/5cD0EmU2voI +func (sb *ScalingBloom) Destroy() { + C.free_scaling_bloom(sb.cfilter) +} + +func (sb *ScalingBloom) Check(key []byte) bool { + cKey := (*C.char)(unsafe.Pointer(&key[0])) + return C.scaling_bloom_check(sb.cfilter, cKey, C.size_t(len(key))) == 1 +} + +func (sb *ScalingBloom) Add(key []byte, id C.uint64_t) bool { + cKey := (*C.char)(unsafe.Pointer(&key[0])) + return C.scaling_bloom_add(sb.cfilter, cKey, C.size_t(len(key)), id) == 1 +} + +func (sb *ScalingBloom) Remove(key []byte, id C.uint64_t) bool { + cKey := (*C.char)(unsafe.Pointer(&key[0])) + return C.scaling_bloom_remove(sb.cfilter, cKey, C.size_t(len(key)), id) == 1 +} + +func (sb *ScalingBloom) Flush() bool { + return C.scaling_bloom_flush(sb.cfilter) == 1 +} + +func (sb *ScalingBloom) MemSeqNum() C.uint64_t { + return C.scaling_bloom_mem_seqnum(sb.cfilter) +} + +func (sb *ScalingBloom) DiskSeqNum() C.uint64_t { + return C.scaling_bloom_disk_seqnum(sb.cfilter) +} diff --git a/bindings/rs-dablooms/dablooms/godablooms/dablooms_test.go b/bindings/rs-dablooms/dablooms/godablooms/dablooms_test.go new file mode 100644 index 0000000..826124f --- /dev/null +++ b/bindings/rs-dablooms/dablooms/godablooms/dablooms_test.go @@ -0,0 +1,40 @@ +package dablooms + +import ( + "io/ioutil" + "log" + "os" + "testing" +) + +func TestPutMessage(t *testing.T) { + log.SetOutput(ioutil.Discard) + defer log.SetOutput(os.Stdout) + + sb := NewScalingBloom(1000, 0.5, "testbloom.bin") + if sb == nil { + t.Fatalf("NewScalingBloom failed") + } + + key := []byte("test") + + if sb.Check(key) != false { + t.Fatalf("Check failed") + } + + if sb.Add(key, 1) != true { + t.Fatalf("Add failed") + } + + if sb.Check(key) != true { + t.Fatalf("'%s' not found", key) + } + + if sb.Remove(key, 1) != true { + t.Fatalf("Remove failed") + } + + if sb.Check(key) != false { + t.Fatalf("'%s' was found (after remove)", key) + } +} diff --git a/bindings/rs-dablooms/dablooms/pydablooms/README.md b/bindings/rs-dablooms/dablooms/pydablooms/README.md new file mode 100644 index 0000000..c3534e4 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/pydablooms/README.md @@ -0,0 +1,28 @@ +pydablooms +========== + +Python language bindings for dablooms. See the main dablooms `README.md` +for build and install instructions. There is also a `test_pydablooms` +target in the Makefile (remember to specify any options to make you used +during the build). + + +### Example usage + + >>> import pydablooms + >>> bloom = pydablooms.Dablooms(capacity=1000, + ... error_rate=.05, + ... filepath='/tmp/bloom.bin') + >>> bloom.add('foo', 2) + 1 + >>> bloom.check('bar') + 0 + >>> bloom.delete('foo', 2) + 0 + >>> bloom.check('foo') + 0 + >>> del bloom + >>> bloom = pydablooms.load_dabloom(capacity=1000, + ... error_rate=.05, + ... filepath='/tmp/bloom.bin') + >>> diff --git a/bindings/rs-dablooms/dablooms/pydablooms/modpath.py b/bindings/rs-dablooms/dablooms/pydablooms/modpath.py new file mode 100644 index 0000000..bf9faee --- /dev/null +++ b/bindings/rs-dablooms/dablooms/pydablooms/modpath.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +import os +import sys +import distutils.sysconfig + +user = False +system = False + +def err_exit(msg): + sys.stderr.write(msg + "\n") + exit(1) + +for arg in sys.argv[1:]: + if arg == "--user": user = True + elif arg == "--system": system = True + else: + err_exit("invalid argument '%s'" % arg) + +mod_paths = sys.path +mod_paths.reverse() + +if user: + home = os.environ.get('HOME') + if not home: + err_exit("environment variable 'HOME' not set") + for p in mod_paths: + if p.startswith(home): + print(p) + exit(0) + err_exit("no user python module path found") + +if not system: + for p in mod_paths: + if p.startswith("/usr/local/"): + print(p) + exit(0) + +print(distutils.sysconfig.get_python_lib()) diff --git a/bindings/rs-dablooms/dablooms/pydablooms/pydablooms.c b/bindings/rs-dablooms/dablooms/pydablooms/pydablooms.c new file mode 100644 index 0000000..c101fc2 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/pydablooms/pydablooms.c @@ -0,0 +1,225 @@ +#include <Python.h> +#include "dablooms.h" +#include "structmember.h" + +int Py_ModuleVersion = 1; + +typedef struct { + PyObject_HEAD + scaling_bloom_t *filter; /* Type-specific fields go here. */ +} Dablooms; + +static void Dablooms_dealloc(Dablooms *self) +{ + free_scaling_bloom(self->filter); + self->ob_type->tp_free((PyObject *)self); +} + +static PyObject *Dablooms_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + Dablooms *self; + + if ((self = (Dablooms *)type->tp_alloc(type, 0)) == NULL) { + return NULL; + } + + self->filter = NULL; + + return (PyObject *) self; +} + +static int Dablooms_init(Dablooms *self, PyObject *args, PyObject *kwds) +{ + double error_rate; + const char *filepath; + unsigned int capacity; + static char *kwlist[] = {"capacity", "error_rate", "filepath", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "|ids", kwlist, + &capacity, &error_rate, &filepath)) { + return -1; + } + + self->filter = new_scaling_bloom(capacity, error_rate, filepath); + + return 0; +} + + +static int contains(Dablooms *self, PyObject *key) +{ + const char *hash; + int len; + + if (!PyArg_Parse(key, "s#", &hash, &len)) { + return -1; + } + return scaling_bloom_check(self->filter, hash, len); +} + +static PyObject *check(Dablooms *self, PyObject *args) +{ + const char *hash; + int len; + + if (!PyArg_ParseTuple(args, "s#", &hash, &len)) { + return NULL; + } + return Py_BuildValue("i", scaling_bloom_check(self->filter, hash, len)); +} + +static PyObject *add(Dablooms *self, PyObject *args, PyObject *kwds) +{ + const char *hash; + int len; + unsigned long long id; + static char *kwlist[] = {"hash", "id", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "|s#K", kwlist, &hash, &len, &id)) { + return NULL; + } + + return Py_BuildValue("i", scaling_bloom_add(self->filter, hash, len, id)); +} + +static PyObject *delete(Dablooms *self, PyObject *args, PyObject *kwds) +{ + const char *hash; + int len; + unsigned long long id; + static char *kwlist[] = {"hash", "id", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "|s#K", kwlist, &hash, &len, &id)) { + return NULL; + } + + return Py_BuildValue("i", scaling_bloom_remove(self->filter, hash, len, id)); +} + +static PyObject *flush(Dablooms *self, PyObject *args, PyObject *kwds) +{ + return Py_BuildValue("i", scaling_bloom_flush(self->filter)); +} + +static PyObject *mem_seqnum(Dablooms *self, PyObject *args, PyObject *kwds) +{ + return Py_BuildValue("K", scaling_bloom_mem_seqnum(self->filter)); +} + +static PyObject *disk_seqnum(Dablooms *self, PyObject *args, PyObject *kwds) +{ + return Py_BuildValue("K", scaling_bloom_disk_seqnum(self->filter)); +} + +static PyMethodDef Dablooms_methods[] = { + {"add", (PyCFunction)add, METH_VARARGS | METH_KEYWORDS, "Add an element to the bloom filter."}, + {"delete", (PyCFunction)delete, METH_VARARGS | METH_KEYWORDS, "Remove an element from the bloom filter."}, + {"check", (PyCFunction)check, METH_VARARGS | METH_KEYWORDS, "Check if an element is in the bloom filter."}, + {"flush", (PyCFunction)flush, METH_VARARGS | METH_KEYWORDS, "Flush a bloom filter to file."}, + {"mem_seqnum", (PyCFunction)mem_seqnum, METH_VARARGS | METH_KEYWORDS, "Get the memory-consistent sequence number."}, + {"disk_seqnum", (PyCFunction)disk_seqnum, METH_VARARGS | METH_KEYWORDS, "Get the disk-consistent sequence number."}, + {NULL}, /* Sentinel */ +}; + +static PyMemberDef Dablooms_members[] = { + {NULL} /* Sentinel */ +}; + +static PySequenceMethods Dablooms_sequence = { + NULL, /*sq_length*/ + NULL, /*sq_concat*/ + NULL, /*sq_repeat*/ + NULL, /*sq_item*/ + NULL, /*sq_slice*/ + NULL, /*sq_ass_item*/ + NULL, /*sq_ass_slice*/ + (objobjproc)contains, /*sq_contains*/ +}; + +static PyTypeObject DabloomsType = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "pydablooms.Dablooms", /*tp_name*/ + sizeof(Dablooms), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)Dablooms_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + &Dablooms_sequence, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + "Dablooms objects", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + Dablooms_methods, /*tp_methods*/ + Dablooms_members, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + (initproc)Dablooms_init, /*tp_init*/ + 0, /*tp_alloc*/ + Dablooms_new, /*tp_new*/ +}; + +static PyObject *load_dabloom(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + Dablooms *self = (Dablooms *)PyObject_New(Dablooms, &DabloomsType); + double error_rate; + const char *filepath; + unsigned int capacity; + static char *kwlist[] = {"capacity", "error_rate", "filepath", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "|ids", kwlist, + &capacity, &error_rate, &filepath)) { + return NULL; + } + + self->filter = new_scaling_bloom_from_file(capacity, error_rate, filepath); + return (PyObject *) self; +} + +static PyMethodDef pydablooms_methods[] = { + {"load_dabloom", (PyCFunction)load_dabloom, METH_VARARGS | METH_KEYWORDS, "Add an element to the bloom filter."}, + {NULL} +}; + +#ifndef PyMODINIT_FUNC +#define PyMODINIT_FUNC void +#endif + + +PyMODINIT_FUNC initpydablooms(void) +{ + PyObject *m; + if (PyType_Ready(&DabloomsType) < 0) { + return; + } + + m = Py_InitModule3("pydablooms", pydablooms_methods, "Dablooms module"); + + if (m == NULL) { + return; + } + + PyModule_AddObject(m, "__version__", PyString_FromString(dablooms_version())); + + Py_INCREF(&DabloomsType); + PyModule_AddObject(m, "Dablooms", (PyObject *)&DabloomsType); +} diff --git a/bindings/rs-dablooms/dablooms/pydablooms/setup.py b/bindings/rs-dablooms/dablooms/pydablooms/setup.py new file mode 100644 index 0000000..add7ba5 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/pydablooms/setup.py @@ -0,0 +1,35 @@ +from distutils.core import setup, Extension +import os, sys + +def local_path(path): + local_dir = os.path.dirname(__file__) + return os.path.normpath(os.path.join(local_dir, path)) + +def parse_version_from_c(): + cfile = open(local_path('../src/dablooms.c')) + result = '' + for line in cfile: + parts = line.split() + if len(parts) == 3 and parts[:2] == ['#define', 'DABLOOMS_VERSION']: + result = parts[2].strip('"') + break + cfile.close() + return result + +def path_from_env(name, default): + return os.environ.get(name, local_path(default)) + +module1 = Extension('pydablooms', + include_dirs = [local_path('../src')], + sources = [local_path('pydablooms.c'), + local_path('../src/dablooms.c'), + local_path('../src/murmur.c'), ], + ) + +setup (name = 'pydablooms', + version = parse_version_from_c(), + description = 'This is a a python extension of the scaling, counting, bloom filter, dablooms.', + author = 'Justin P. Hines', + author_email = '[email protected]', + url = 'http://github.com/bitly/dablooms.git', + ext_modules = [module1]) diff --git a/bindings/rs-dablooms/dablooms/pydablooms/test_pydablooms.py b/bindings/rs-dablooms/dablooms/pydablooms/test_pydablooms.py new file mode 100644 index 0000000..227433e --- /dev/null +++ b/bindings/rs-dablooms/dablooms/pydablooms/test_pydablooms.py @@ -0,0 +1,104 @@ +import sys, os +import pydablooms + +capacity = 100000 +error_rate = 0.05 + +print("pydablooms version: %s" % pydablooms.__version__) + +if len(sys.argv) != 3: + sys.stderr.write("Usage: %s <bloom_file> <words_file>\n" % sys.argv[0]) + sys.exit(1) + +bloom_fname = sys.argv[1] +words_fname = sys.argv[2] + +bloom = pydablooms.Dablooms(capacity=capacity, + error_rate=error_rate, + filepath=bloom_fname) + +words_file = open(words_fname, 'rb') +i = 0 +for line in words_file: + bloom.add(line.rstrip(), i) + i += 1 + +words_file.seek(0) +i = 0 +for line in words_file: + if i % 5 == 0: + bloom.delete(line.rstrip(), i) + i += 1 + +bloom.flush() +del bloom + +bloom = pydablooms.load_dabloom(capacity=capacity, + error_rate=error_rate, + filepath=bloom_fname) + +true_positives = 0 +true_negatives = 0 +false_positives = 0 +false_negatives = 0 + +words_file.seek(0) +i = 0 +for line in words_file: + exists = bloom.check(line.rstrip()) + contains = line.rstrip() in bloom + assert exists == contains, \ + "ERROR: %r from 'bloom.check(x)', %i from 'x in bloom'" \ + % (exists, contains) + + if i % 5 == 0: + if exists: + false_positives += 1 + else: + true_negatives += 1 + else: + if exists: + true_positives += 1 + else: + false_negatives += 1 + sys.stderr.write("ERROR: False negative: '%s'\n" % line.rstrip()) + i += 1 + +words_file.close() +del bloom + +false_positive_rate = float(false_positives) / (false_positives + true_negatives) + +print(''' +Elements Added: %6d +Elements Removed: %6d + +True Positives: %6d +True Negatives: %6d +False Positives: %6d +False Negatives: %6d + +False positive rate: %.4f +Total Size: %d KiB''' % ( + i, i/5, + true_positives, + true_negatives, + false_positives, + false_negatives, + false_positive_rate, + os.stat(bloom_fname).st_size / 1024 + ) +) + +if false_negatives > 0: + print("TEST FAIL (false negatives exist)") +elif false_positive_rate > error_rate: + print("TEST WARN (false positive rate too high)") +else: + print("TEST PASS") +print("") + +if false_negatives > 0: + sys.exit(1) +else: + sys.exit(0) diff --git a/bindings/rs-dablooms/dablooms/src/dablooms.c b/bindings/rs-dablooms/dablooms/src/dablooms.c new file mode 100644 index 0000000..dc1ba7a --- /dev/null +++ b/bindings/rs-dablooms/dablooms/src/dablooms.c @@ -0,0 +1,649 @@ +/* Copyright @2012 by Justin Hines at Bitly under a very liberal license. See LICENSE in the source distribution. */ + +#include <sys/stat.h> +#include <stdint.h> +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <fcntl.h> +#include <math.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +#include <errno.h> +#include <time.h> +#include "murmur.h" +#include "dablooms.h" + +#define DABLOOMS_VERSION "0.9.1" + +#define ERROR_TIGHTENING_RATIO 0.5 +#define SALT_CONSTANT 0x97c29b3a + +const char *dablooms_version(void) +{ + return DABLOOMS_VERSION; +} + +void free_bitmap(bitmap_t *bitmap) +{ +#if 0 + if ((munmap(bitmap->array, bitmap->bytes)) < 0) { + perror("Error, unmapping memory"); + } +#else + free(bitmap->array); +#endif + free(bitmap); +} + +bitmap_t *bitmap_resize(bitmap_t *bitmap, size_t old_size, size_t new_size) +{ + +#if 0 + /* resize if mmap exists and possible on this os, else new mmap */ + if (bitmap->array != NULL) { +#if __linux + bitmap->array = mremap(bitmap->array, old_size, new_size, MREMAP_MAYMOVE); + if (bitmap->array == MAP_FAILED) { + perror("Error resizing mmap"); + free_bitmap(bitmap); + return NULL; + } +#else + if (munmap(bitmap->array, bitmap->bytes) < 0) { + perror("Error unmapping memory"); + free_bitmap(bitmap); + return NULL; + } + bitmap->array = NULL; +#endif + } + if (bitmap->array == NULL) { + bitmap->array = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (bitmap->array == MAP_FAILED) { + perror("Error init mmap"); + free_bitmap(bitmap); + return NULL; + } + } +#else + if (bitmap->array != NULL) + { + bitmap->array = (char *)realloc(bitmap->array, new_size); + if (bitmap->array == NULL) + { + perror("Error resizing memory"); + free_bitmap(bitmap); + return NULL; + } + memset(bitmap->array + old_size, 0, new_size - old_size); + } + else + { + bitmap->array = (char *)malloc(new_size); + if (bitmap->array == NULL) + { + perror("Error init memory"); + free_bitmap(bitmap); + return NULL; + } + memset(bitmap->array, 0, new_size); + } +#endif + bitmap->bytes = new_size; + return bitmap; +} + +/* Create a new bitmap, not full featured, simple to give + * us a means of interacting with the 4 bit counters */ +bitmap_t *new_bitmap(size_t bytes) +{ + bitmap_t *bitmap; + + if ((bitmap = (bitmap_t *)malloc(sizeof(bitmap_t))) == NULL) { + return NULL; + } + + bitmap->bytes = bytes; + bitmap->array = NULL; + + if ((bitmap = bitmap_resize(bitmap, 0, bytes)) == NULL) { + return NULL; + } + + return bitmap; +} + +int bitmap_increment(bitmap_t *bitmap, unsigned int index, long offset) +{ + long access = index / 2 + offset; + uint8_t temp; + __builtin_prefetch(&(bitmap->array[access]), 0, 1); + uint8_t n = bitmap->array[access]; + if (index % 2 != 0) { + temp = (n & 0x0f); + n = (n & 0xf0) + ((n & 0x0f) + 0x01); + } else { + temp = (n & 0xf0) >> 4; + n = (n & 0x0f) + ((n & 0xf0) + 0x10); + } + + if (temp == 0x0f) { + //fprintf(stderr, "Error, 4 bit int Overflow\n"); + return -1; + } + + __builtin_prefetch(&(bitmap->array[access]), 1, 1); + bitmap->array[access] = n; + return 0; +} + +/* increments the four bit counter */ +int bitmap_decrement(bitmap_t *bitmap, unsigned int index, long offset) +{ + long access = index / 2 + offset; + uint8_t temp; + uint8_t n = bitmap->array[access]; + + if (index % 2 != 0) { + temp = (n & 0x0f); + n = (n & 0xf0) + ((n & 0x0f) - 0x01); + } else { + temp = (n & 0xf0) >> 4; + n = (n & 0x0f) + ((n & 0xf0) - 0x10); + } + + if (temp == 0x00) { + //fprintf(stderr, "Error, Decrementing zero\n"); + return -1; + } + + bitmap->array[access] = n; + return 0; +} + +/* decrements the four bit counter */ +int bitmap_check(bitmap_t *bitmap, unsigned int index, long offset) +{ + long access = index / 2 + offset; + if (index % 2 != 0 ) { + return bitmap->array[access] & 0x0f; + } else { + return bitmap->array[access] & 0xf0; + } +} + +int bitmap_flush(bitmap_t *bitmap) +{ +#if 0 + if ((msync(bitmap->array, bitmap->bytes, MS_SYNC) < 0)) { + perror("Error, flushing bitmap to disk"); + return -1; + } else { + return 0; + } +#else + return 0; +#endif +} + +/* + * Perform the actual hashing for `key` + * + * Only call the hash once to get a pair of initial values (h1 and + * h2). Use these values to generate all hashes in a quick loop. + * + * See paper by Kirsch, Mitzenmacher [2006] + * http://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf + */ +void hash_func(counting_bloom_t *bloom, const char *key, size_t key_len, uint32_t *hashes) +{ + int i; + uint32_t checksum[4]; + + MurmurHash3_x64_128(key, key_len, SALT_CONSTANT, checksum); + uint32_t h1 = checksum[0]; + uint32_t h2 = checksum[1]; + + for (i = 0; i < bloom->nfuncs; i++) { + hashes[i] = (h1 + i * h2) % bloom->counts_per_func; + } +} + +int free_counting_bloom(counting_bloom_t *bloom) +{ + if (bloom != NULL) { + free(bloom->hashes); + bloom->hashes = NULL; + free_bitmap(bloom->bitmap); + free(bloom); + bloom = NULL; + } + return 0; +} + +counting_bloom_t *counting_bloom_init(unsigned int capacity, double error_rate, long offset) +{ + counting_bloom_t *bloom; + + if ((bloom = malloc(sizeof(counting_bloom_t))) == NULL) { + fprintf(stderr, "Error, could not realloc a new bloom filter\n"); + return NULL; + } + bloom->bitmap = NULL; + bloom->capacity = capacity; + bloom->error_rate = error_rate; + bloom->offset = offset + sizeof(counting_bloom_header_t); + bloom->nfuncs = (int) ceil(log(1 / error_rate) / log(2)); + bloom->counts_per_func = (int) ceil(capacity * fabs(log(error_rate)) / (bloom->nfuncs * pow(log(2), 2))); + bloom->size = bloom->nfuncs * bloom->counts_per_func; + /* rounding-up integer divide by 2 of bloom->size */ + bloom->num_bytes = ((bloom->size + 1) / 2) + sizeof(counting_bloom_header_t); + bloom->hashes = calloc(bloom->nfuncs, sizeof(uint32_t)); + + return bloom; +} + +counting_bloom_t *new_counting_bloom(unsigned int capacity, double error_rate) +{ + counting_bloom_t *cur_bloom; + + cur_bloom = counting_bloom_init(capacity, error_rate, 0); + cur_bloom->bitmap = new_bitmap(cur_bloom->num_bytes); + cur_bloom->header = (counting_bloom_header_t *)(cur_bloom->bitmap->array); + return cur_bloom; +} + +int counting_bloom_add(counting_bloom_t *bloom, const char *s, size_t len) +{ + unsigned int index, i, offset; + unsigned int *hashes = bloom->hashes; + + hash_func(bloom, s, len, hashes); + + for (i = 0; i < bloom->nfuncs; i++) { + offset = i * bloom->counts_per_func; + index = hashes[i] + offset; + bitmap_increment(bloom->bitmap, index, bloom->offset); + } + bloom->header->count++; + + return 0; +} + +int counting_bloom_remove(counting_bloom_t *bloom, const char *s, size_t len) +{ + unsigned int index, i, offset; + unsigned int *hashes = bloom->hashes; + + hash_func(bloom, s, len, hashes); + + for (i = 0; i < bloom->nfuncs; i++) { + offset = i * bloom->counts_per_func; + index = hashes[i] + offset; + bitmap_decrement(bloom->bitmap, index, bloom->offset); + } + bloom->header->count--; + + return 0; +} + +int counting_bloom_check(counting_bloom_t *bloom, const char *s, size_t len) +{ + unsigned int index, i, offset; + unsigned int *hashes = bloom->hashes; + + hash_func(bloom, s, len, hashes); + + for (i = 0; i < bloom->nfuncs; i++) { + offset = i * bloom->counts_per_func; + index = hashes[i] + offset; + if (!(bitmap_check(bloom->bitmap, index, bloom->offset))) { + return 0; + } + } + return 1; +} + +int free_scaling_bloom(scaling_bloom_t *bloom) +{ + int i; + for (i = bloom->num_blooms - 1; i >= 0; i--) { + free(bloom->blooms[i]->hashes); + bloom->blooms[i]->hashes = NULL; + free(bloom->blooms[i]); + bloom->blooms[i] = NULL; + } + free(bloom->blooms); + free_bitmap(bloom->bitmap); + free(bloom); + return 0; +} + +/* creates a new counting bloom filter from a given scaling bloom filter, with count and id */ +counting_bloom_t *new_counting_bloom_from_scale(scaling_bloom_t *bloom) +{ + int i; + long offset; + double error_rate; + counting_bloom_t *cur_bloom; + + error_rate = bloom->error_rate * (pow(ERROR_TIGHTENING_RATIO, bloom->num_blooms + 1)); + + if ((bloom->blooms = realloc(bloom->blooms, (bloom->num_blooms + 1) * sizeof(counting_bloom_t *))) == NULL) { + fprintf(stderr, "Error, could not realloc a new bloom filter\n"); + return NULL; + } + + cur_bloom = counting_bloom_init(bloom->capacity, error_rate, bloom->num_bytes); + bloom->blooms[bloom->num_blooms] = cur_bloom; + + bloom->bitmap = bitmap_resize(bloom->bitmap, bloom->num_bytes, bloom->num_bytes + cur_bloom->num_bytes); + + /* reset header pointer, as mmap may have moved */ + bloom->header = (scaling_bloom_header_t *) bloom->bitmap->array; + + /* Set the pointers for these header structs to the right location since mmap may have moved */ + bloom->num_blooms++; + for (i = 0; i < bloom->num_blooms; i++) { + offset = bloom->blooms[i]->offset - sizeof(counting_bloom_header_t); + bloom->blooms[i]->header = (counting_bloom_header_t *) (bloom->bitmap->array + offset); + } + + bloom->num_bytes += cur_bloom->num_bytes; + cur_bloom->bitmap = bloom->bitmap; + + return cur_bloom; +} + + +uint64_t scaling_bloom_clear_seqnums(scaling_bloom_t *bloom) +{ + uint64_t seqnum; + + if (bloom->header->disk_seqnum != 0) { + // disk_seqnum cleared on disk before any other changes + bloom->header->disk_seqnum = 0; + bitmap_flush(bloom->bitmap); + } + seqnum = bloom->header->mem_seqnum; + bloom->header->mem_seqnum = 0; + return seqnum; +} + +int scaling_bloom_add(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id) +{ + int i; + uint64_t seqnum; + + counting_bloom_t *cur_bloom = NULL; + for (i = bloom->num_blooms - 1; i >= 0; i--) { + cur_bloom = bloom->blooms[i]; + if (id >= cur_bloom->header->id) { + break; + } + } + + seqnum = scaling_bloom_clear_seqnums(bloom); + + if ((id > bloom->header->max_id) && (cur_bloom->header->count >= cur_bloom->capacity - 1)) { + cur_bloom = new_counting_bloom_from_scale(bloom); + cur_bloom->header->count = 0; + cur_bloom->header->id = bloom->header->max_id + 1; + } + if (bloom->header->max_id < id) { + bloom->header->max_id = id; + } + counting_bloom_add(cur_bloom, s, len); + + bloom->header->mem_seqnum = seqnum + 1; + + return 1; +} + +int scaling_bloom_remove(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id) +{ + counting_bloom_t *cur_bloom; + int i; + uint64_t seqnum; + + for (i = bloom->num_blooms - 1; i >= 0; i--) { + cur_bloom = bloom->blooms[i]; + if (id >= cur_bloom->header->id) { + seqnum = scaling_bloom_clear_seqnums(bloom); + + counting_bloom_remove(cur_bloom, s, len); + + bloom->header->mem_seqnum = seqnum + 1; + return 1; + } + } + return 0; +} + +int scaling_bloom_check(scaling_bloom_t *bloom, const char *s, size_t len) +{ + int i; + counting_bloom_t *cur_bloom; + for (i = bloom->num_blooms - 1; i >= 0; i--) { + cur_bloom = bloom->blooms[i]; + if (counting_bloom_check(cur_bloom, s, len)) { + return 1; + } + } + return 0; +} + +int scaling_bloom_flush(scaling_bloom_t *bloom) +{ + if (bitmap_flush(bloom->bitmap) != 0) { + return -1; + } + // all changes written to disk before disk_seqnum set + if (bloom->header->disk_seqnum == 0) { + bloom->header->disk_seqnum = bloom->header->mem_seqnum; + return bitmap_flush(bloom->bitmap); + } + return 0; +} + +uint64_t scaling_bloom_mem_seqnum(scaling_bloom_t *bloom) +{ + return bloom->header->mem_seqnum; +} + +uint64_t scaling_bloom_disk_seqnum(scaling_bloom_t *bloom) +{ + return bloom->header->disk_seqnum; +} + +scaling_bloom_t *scaling_bloom_init(unsigned int capacity, double error_rate) +{ + scaling_bloom_t *bloom; + + if ((bloom = malloc(sizeof(scaling_bloom_t))) == NULL) { + return NULL; + } + if ((bloom->bitmap = new_bitmap(sizeof(scaling_bloom_header_t))) == NULL) { + fprintf(stderr, "Error, Could not create bitmap with file\n"); + free_scaling_bloom(bloom); + return NULL; + } + + bloom->header = (scaling_bloom_header_t *) bloom->bitmap->array; + bloom->capacity = capacity; + bloom->error_rate = error_rate; + bloom->num_blooms = 0; + bloom->num_bytes = sizeof(scaling_bloom_header_t); + bloom->blooms = NULL; + + return bloom; +} + +scaling_bloom_t *new_scaling_bloom(unsigned int capacity, double error_rate) +{ + + scaling_bloom_t *bloom; + counting_bloom_t *cur_bloom; + + bloom = scaling_bloom_init(capacity, error_rate); + + if (!(cur_bloom = new_counting_bloom_from_scale(bloom))) { + fprintf(stderr, "Error, Could not create counting bloom\n"); + free_scaling_bloom(bloom); + return NULL; + } + cur_bloom->header->count = 0; + cur_bloom->header->id = 0; + + bloom->header->mem_seqnum = 1; + return bloom; +} + + +struct expiry_dablooms_handle{ + scaling_bloom_t *cur_bloom; + scaling_bloom_t *next_bloom; + time_t cur_bloom_start; + time_t next_bloom_start; + time_t last_bloom_check; + uint64_t cur_bloom_inc_id; + uint64_t next_bloom_inc_id; + unsigned int capacity; + int expiry_time; + time_t cur_time; + double error_rate; +}; + +char* expiry_dablooms_errno_trans(enum expiry_dablooms_errno _errno){ + switch(_errno){ + case EXPIRY_DABLOOMS_ERRNO_BLOOM_NULL: + return (char*)"scaling_bloom_null"; + case EXPIRY_DABLOOMS_ERRNO_NEW_BLOOM_FAIL: + return (char*)"new_scaling_bloom_fail"; + default: + return (char*)"unknown"; + } +} + +void expiry_dablooms_destroy(struct expiry_dablooms_handle *handle){ + if(handle != NULL){ + if(handle->cur_bloom != NULL){ + free_scaling_bloom(handle->cur_bloom); + } + if(handle->next_bloom != NULL){ + free_scaling_bloom(handle->next_bloom); + } + FREE(&handle); + } +} + +struct expiry_dablooms_handle* expiry_dablooms_init(unsigned int capacity, double error_rate, time_t cur_time, int expiry_time){ + struct expiry_dablooms_handle *handle = ALLOC(struct expiry_dablooms_handle, 1); + scaling_bloom_t *cur_bloom = new_scaling_bloom(capacity, error_rate); + if(cur_bloom == NULL){ + goto error_out; + } + handle->cur_bloom = cur_bloom; + handle->cur_bloom_inc_id = 0; + handle->cur_bloom_start=cur_time; + handle->capacity = capacity; + handle->error_rate = error_rate; + handle->expiry_time = expiry_time; + handle->cur_time = cur_time; + return handle; + +error_out: + expiry_dablooms_destroy(handle); + return NULL; +} + +int expiry_dablooms_element_count_get(struct expiry_dablooms_handle *handle, uint64_t *count){ + if(handle == NULL || handle->cur_bloom == NULL){ + return EXPIRY_DABLOOMS_ERRNO_BLOOM_NULL; + } + *count = handle->cur_bloom_inc_id; + return 0; +} + +static int bloom_expired_check(struct expiry_dablooms_handle *handle, time_t cur_time){ + if(handle == NULL || handle->cur_bloom == NULL){ + return EXPIRY_DABLOOMS_ERRNO_BLOOM_NULL; + } + if(cur_time <= handle->last_bloom_check){ + return 0; + } + time_t delta_time = cur_time - handle->cur_bloom_start; + handle->cur_time=cur_time; + if(delta_time >= handle->expiry_time){ + free_scaling_bloom(handle->cur_bloom); + if(handle->next_bloom != NULL){ + handle->cur_bloom = handle->next_bloom; + handle->cur_bloom_start = handle->next_bloom_start; + handle->cur_bloom_inc_id = handle->next_bloom_inc_id; + handle->next_bloom = NULL; + handle->last_bloom_check=0; + } + else{ + scaling_bloom_t *cur_bloom = new_scaling_bloom(handle->capacity, handle->error_rate); + if(cur_bloom == NULL){ + return EXPIRY_DABLOOMS_ERRNO_NEW_BLOOM_FAIL; + } + handle->cur_bloom = cur_bloom; + handle->cur_bloom_inc_id = 0; + handle->cur_bloom_start=cur_time; + handle->last_bloom_check=0; + } + } + else + { + handle->last_bloom_check=cur_time; + } + return 0; +} + +int expiry_dablooms_add(struct expiry_dablooms_handle *handle, const char *key, size_t len, time_t cur_time){ + if(key==NULL || len ==0 || handle == NULL) + { + return -1; + } + int ret = bloom_expired_check(handle, cur_time); + if(ret < 0) + { + return ret; + } + + scaling_bloom_add(handle->cur_bloom, key, len, handle->cur_bloom_inc_id); + handle->cur_bloom_inc_id++; + time_t delta_time = cur_time - handle->cur_bloom_start; + handle->cur_time=cur_time; + if(delta_time >= handle->expiry_time){ + if(handle->next_bloom == NULL){ + scaling_bloom_t *next_bloom = new_scaling_bloom(handle->capacity, handle->error_rate); + if(next_bloom == NULL){ + return EXPIRY_DABLOOMS_ERRNO_NEW_BLOOM_FAIL; + } + handle->next_bloom = next_bloom; + handle->next_bloom_inc_id = 0; + handle->next_bloom_start=cur_time; + } + scaling_bloom_add(handle->next_bloom, key, len, handle->next_bloom_inc_id); + handle->next_bloom_inc_id++; + } + return 0; +} + +int expiry_dablooms_search(struct expiry_dablooms_handle *handle, const char *key, size_t len, time_t cur_time){ + if(key==NULL || len ==0 || handle == NULL) + { + return -1; + } + int ret = bloom_expired_check(handle, cur_time); + if(ret < 0) + { + return ret; + } + int bloom_hit = scaling_bloom_check(handle->cur_bloom, key, len); + return bloom_hit; +} diff --git a/bindings/rs-dablooms/dablooms/src/dablooms.h b/bindings/rs-dablooms/dablooms/src/dablooms.h new file mode 100644 index 0000000..ee4d4e3 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/src/dablooms.h @@ -0,0 +1,92 @@ +/* Copyright @2012 by Justin Hines at Bitly under a very liberal license. See LICENSE in the source distribution. */ + +#ifndef __BLOOM_H__ +#define __BLOOM_H__ +#include <stdint.h> +#include <stdlib.h> + +#define ALLOC(type, number) ((type *)calloc(sizeof(type), number)) +#define FREE(p) {free(*p);*p=NULL;} + +const char *dablooms_version(void); + +typedef struct { + size_t bytes; + char *array; +} bitmap_t; + + +bitmap_t *bitmap_resize(bitmap_t *bitmap, size_t old_size, size_t new_size); +bitmap_t *new_bitmap(size_t bytes); + +int bitmap_increment(bitmap_t *bitmap, unsigned int index, long offset); +int bitmap_decrement(bitmap_t *bitmap, unsigned int index, long offset); +int bitmap_check(bitmap_t *bitmap, unsigned int index, long offset); +int bitmap_flush(bitmap_t *bitmap); + +void free_bitmap(bitmap_t *bitmap); + +typedef struct { + uint64_t id; + uint32_t count; + uint32_t _pad; +} counting_bloom_header_t; + + +typedef struct { + counting_bloom_header_t *header; + unsigned int capacity; + long offset; + unsigned int counts_per_func; + uint32_t *hashes; + size_t nfuncs; + size_t size; + size_t num_bytes; + double error_rate; + bitmap_t *bitmap; +} counting_bloom_t; + +int free_counting_bloom(counting_bloom_t *bloom); +counting_bloom_t *new_counting_bloom(unsigned int capacity, double error_rate); +int counting_bloom_add(counting_bloom_t *bloom, const char *s, size_t len); +int counting_bloom_remove(counting_bloom_t *bloom, const char *s, size_t len); +int counting_bloom_check(counting_bloom_t *bloom, const char *s, size_t len); + +typedef struct { + uint64_t max_id; + uint64_t mem_seqnum; + uint64_t disk_seqnum; +} scaling_bloom_header_t; + +typedef struct { + scaling_bloom_header_t *header; + unsigned int capacity; + unsigned int num_blooms; + size_t num_bytes; + double error_rate; + counting_bloom_t **blooms; + bitmap_t *bitmap; +} scaling_bloom_t; + +scaling_bloom_t *new_scaling_bloom(unsigned int capacity, double error_rate); +int free_scaling_bloom(scaling_bloom_t *bloom); +int scaling_bloom_add(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id); +int scaling_bloom_remove(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id); +int scaling_bloom_check(scaling_bloom_t *bloom, const char *s, size_t len); +int scaling_bloom_flush(scaling_bloom_t *bloom); +uint64_t scaling_bloom_mem_seqnum(scaling_bloom_t *bloom); +uint64_t scaling_bloom_disk_seqnum(scaling_bloom_t *bloom); + +struct expiry_dablooms_handle; +enum expiry_dablooms_errno{ + EXPIRY_DABLOOMS_ERRNO_BLOOM_NULL = -1, + EXPIRY_DABLOOMS_ERRNO_NEW_BLOOM_FAIL = -2, +}; +char* expiry_dablooms_errno_trans(enum expiry_dablooms_errno _errno); +void expiry_dablooms_destroy(struct expiry_dablooms_handle *handle); +struct expiry_dablooms_handle* expiry_dablooms_init(unsigned int capacity, double error_rate, time_t cur_time, int expiry_time); +int expiry_dablooms_element_count_get(struct expiry_dablooms_handle *handle, uint64_t *count); +int expiry_dablooms_add(struct expiry_dablooms_handle *handle, const char *key, size_t len, time_t cur_time); +int expiry_dablooms_search(struct expiry_dablooms_handle *handle, const char *key, size_t len, time_t cur_time); + +#endif diff --git a/bindings/rs-dablooms/dablooms/src/murmur.c b/bindings/rs-dablooms/dablooms/src/murmur.c new file mode 100644 index 0000000..fcf1dc1 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/src/murmur.c @@ -0,0 +1,120 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "murmur.h" + +#define FORCE_INLINE inline static + +FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#define getblock(x, i) (x[i]) + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint64_t fmix64(uint64_t k) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + int i; + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(i = 0; i < nblocks; i++) { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + } + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- diff --git a/bindings/rs-dablooms/dablooms/src/murmur.h b/bindings/rs-dablooms/dablooms/src/murmur.h new file mode 100644 index 0000000..c7547db --- /dev/null +++ b/bindings/rs-dablooms/dablooms/src/murmur.h @@ -0,0 +1,12 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +#include <stdint.h> + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +#endif // _MURMURHASH3_H_ diff --git a/bindings/rs-dablooms/dablooms/src/test_dablooms.c b/bindings/rs-dablooms/dablooms/src/test_dablooms.c new file mode 100644 index 0000000..5dfdf96 --- /dev/null +++ b/bindings/rs-dablooms/dablooms/src/test_dablooms.c @@ -0,0 +1,342 @@ +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> + +#include "dablooms.h" + +#define CAPACITY 100000 +#define ERROR_RATE .05 + +enum { + TEST_PASS, + TEST_WARN, + TEST_FAIL, +}; + +struct stats { + int true_positives; + int true_negatives; + int false_positives; + int false_negatives; +}; + +static void chomp_line(char *word) +{ + char *p; + if ((p = strchr(word, '\r'))) { + *p = '\0'; + } + if ((p = strchr(word, '\n'))) { + *p = '\0'; + } +} + +static int print_results(struct stats *stats) +{ + float false_positive_rate = (float)stats->false_positives / + (stats->false_positives + stats->true_negatives); + + printf("True positives: %7d" "\n" + "True negatives: %7d" "\n" + "False positives: %7d" "\n" + "False negatives: %7d" "\n" + "False positive rate: %.4f" "\n", + stats->true_positives, + stats->true_negatives, + stats->false_positives, + stats->false_negatives, + false_positive_rate ); + + if (stats->false_negatives > 0) { + printf("TEST FAIL (false negatives exist)\n"); + return TEST_FAIL; + } else if (false_positive_rate > ERROR_RATE) { + printf("TEST WARN (false positive rate too high)\n"); + return TEST_WARN; + } else { + printf("TEST PASS\n"); + return TEST_PASS; + } +} + +static void bloom_score(int positive, int should_positive, struct stats *stats, const char *key) +{ + if (should_positive) { + if (positive) { + stats->true_positives++; + } else { + stats->false_negatives++; + fprintf(stderr, "ERROR: False negative: '%s'\n", key); + } + } else { + if (positive) { + stats->false_positives++; + } else { + stats->true_negatives++; + } + } +} + +int test_counting_remove_reopen(const char *bloom_file, const char *words_file) +{ + FILE *fp; + char word[256]; + counting_bloom_t *bloom; + int i, key_removed; + struct stats results = { 0 }; + + printf("\n* test counting remove & reopen\n"); + + if ((fp = fopen(bloom_file, "r"))) { + fclose(fp); + remove(bloom_file); + } + + if (!(bloom = new_counting_bloom(CAPACITY, ERROR_RATE, bloom_file))) { + fprintf(stderr, "ERROR: Could not create bloom filter\n"); + return TEST_FAIL; + } + if (!(fp = fopen(words_file, "r"))) { + fprintf(stderr, "ERROR: Could not open words file\n"); + return TEST_FAIL; + } + + for (i = 0; fgets(word, sizeof(word), fp) && (i < CAPACITY); i++) { + chomp_line(word); + counting_bloom_add(bloom, word, strlen(word)); + } + + fseek(fp, 0, SEEK_SET); + for (i = 0; fgets(word, sizeof(word), fp) && (i < CAPACITY); i++) { + if (i % 5 == 0) { + chomp_line(word); + counting_bloom_remove(bloom, word, strlen(word)); + } + } + + free_counting_bloom(bloom); + bloom = new_counting_bloom_from_file(CAPACITY, ERROR_RATE, bloom_file); + + fseek(fp, 0, SEEK_SET); + for (i = 0; (fgets(word, sizeof(word), fp)) && (i < CAPACITY); i++) { + chomp_line(word); + key_removed = (i % 5 == 0); + bloom_score(counting_bloom_check(bloom, word, strlen(word)), !key_removed, &results, word); + } + fclose(fp); + + printf("Elements added: %6d" "\n" + "Elements removed: %6d" "\n" + "Total size: %d KiB" "\n\n", + i, i / 5, + (int) bloom->num_bytes / 1024); + + free_counting_bloom(bloom); + + return print_results(&results); +} + +int test_scaling_remove_reopen(const char *bloom_file, const char *words_file) +{ + FILE *fp; + char word[256]; + scaling_bloom_t *bloom; + int i, key_removed; + struct stats results = { 0 }; + + printf("\n* test scaling remove & reopen\n"); + + if ((fp = fopen(bloom_file, "r"))) { + fclose(fp); + remove(bloom_file); + } + + if (!(bloom = new_scaling_bloom(CAPACITY, ERROR_RATE, bloom_file))) { + fprintf(stderr, "ERROR: Could not create bloom filter\n"); + return TEST_FAIL; + } + + if (!(fp = fopen(words_file, "r"))) { + fprintf(stderr, "ERROR: Could not open words file\n"); + return TEST_FAIL; + } + + for (i = 0; fgets(word, sizeof(word), fp); i++) { + chomp_line(word); + scaling_bloom_add(bloom, word, strlen(word), i); + } + + fseek(fp, 0, SEEK_SET); + for (i = 0; fgets(word, sizeof(word), fp); i++) { + if (i % 5 == 0) { + chomp_line(word); + scaling_bloom_remove(bloom, word, strlen(word), i); + } + } + + bitmap_flush(bloom->bitmap); + free_scaling_bloom(bloom); + + bloom = new_scaling_bloom_from_file(CAPACITY, ERROR_RATE, bloom_file); + + fseek(fp, 0, SEEK_SET); + for (i = 0; fgets(word, sizeof(word), fp); i++) { + chomp_line(word); + key_removed = (i % 5 == 0); + bloom_score(scaling_bloom_check(bloom, word, strlen(word)), !key_removed, &results, word); + } + fclose(fp); + + printf("Elements added: %6d" "\n" + "Elements removed: %6d" "\n" + "Total size: %d KiB" "\n\n", + i, i / 5, + (int) bloom->num_bytes / 1024); + + free_scaling_bloom(bloom); + + return print_results(&results); +} + +int test_counting_accuracy(const char *bloom_file, const char *words_file) +{ + FILE *fp; + char word[256]; + counting_bloom_t *bloom; + int i; + struct stats results = { 0 }; + + printf("\n* test counting accuracy\n"); + + if ((fp = fopen(bloom_file, "r"))) { + fclose(fp); + remove(bloom_file); + } + + if (!(bloom = new_counting_bloom(CAPACITY, ERROR_RATE, bloom_file))) { + fprintf(stderr, "ERROR: Could not create bloom filter\n"); + return TEST_FAIL; + } + if (!(fp = fopen(words_file, "r"))) { + fprintf(stderr, "ERROR: Could not open words file\n"); + return TEST_FAIL; + } + + for (i = 0; fgets(word, sizeof(word), fp) && (i < CAPACITY * 2); i++) { + if (i % 2 == 0) { + chomp_line(word); + counting_bloom_add(bloom, word, strlen(word)); + } + } + + fseek(fp, 0, SEEK_SET); + for (i = 0; fgets(word, sizeof(word), fp) && (i < CAPACITY * 2); i++) { + if (i % 2 == 1) { + chomp_line(word); + bloom_score(counting_bloom_check(bloom, word, strlen(word)), 0, &results, word); + } + } + + fclose(fp); + + printf("Elements added: %6d" "\n" + "Elements checked: %6d" "\n" + "Total size: %d KiB" "\n\n", + (i + 1) / 2, i / 2, + (int) bloom->num_bytes / 1024); + + free_counting_bloom(bloom); + + return print_results(&results); +} + +int test_scaling_accuracy(const char *bloom_file, const char *words_file) +{ + FILE *fp; + char word[256]; + scaling_bloom_t *bloom; + int i; + struct stats results = { 0 }; + + printf("\n* test scaling accuracy\n"); + + if ((fp = fopen(bloom_file, "r"))) { + fclose(fp); + remove(bloom_file); + } + + if (!(bloom = new_scaling_bloom(CAPACITY, ERROR_RATE, bloom_file))) { + fprintf(stderr, "ERROR: Could not create bloom filter\n"); + return TEST_FAIL; + } + + if (!(fp = fopen(words_file, "r"))) { + fprintf(stderr, "ERROR: Could not open words file\n"); + return TEST_FAIL; + } + + for (i = 0; fgets(word, sizeof(word), fp); i++) { + if (i % 2 == 0) { + chomp_line(word); + scaling_bloom_add(bloom, word, strlen(word), i); + } + } + + fseek(fp, 0, SEEK_SET); + for (i = 0; fgets(word, sizeof(word), fp); i++) { + if (i % 2 == 1) { + chomp_line(word); + bloom_score(scaling_bloom_check(bloom, word, strlen(word)), 0, &results, word); + } + } + + fclose(fp); + + printf("Elements added: %6d" "\n" + "Elements checked: %6d" "\n" + "Total size: %d KiB" "\n\n", + (i + 1) / 2, i / 2, + (int) bloom->num_bytes / 1024); + + free_scaling_bloom(bloom); + + return print_results(&results); +} + +int main(int argc, char *argv[]) +{ + printf("** dablooms version: %s\n", dablooms_version()); + int i; + int failures = 0, warnings = 0; + + if (argc != 3) { + fprintf(stderr, "Usage: %s <bloom_file> <words_file>\n", argv[0]); + return EXIT_FAILURE; + } + + int (*tests[])(const char *, const char *) = { + test_counting_remove_reopen, + test_counting_accuracy, + test_scaling_remove_reopen, + test_scaling_accuracy, + NULL, + }; + for (i = 0; tests[i] != NULL; i++) { + int result = (tests[i])(argv[1], argv[2]); + if (result == TEST_FAIL) { + failures++; + } else if (result == TEST_WARN) { + warnings++; + } + } + + printf("\n** %d failures, %d warnings\n", failures, warnings); + if (failures) { + return EXIT_FAILURE; + } else { + return EXIT_SUCCESS; + } +} diff --git a/bindings/rs-dablooms/dablooms/test/test_dabloom b/bindings/rs-dablooms/dablooms/test/test_dabloom Binary files differnew file mode 100644 index 0000000..c8fe1fb --- /dev/null +++ b/bindings/rs-dablooms/dablooms/test/test_dabloom diff --git a/bindings/rs-dablooms/dablooms/test/test_dabloom.c b/bindings/rs-dablooms/dablooms/test/test_dabloom.c new file mode 100644 index 0000000..ac8f82a --- /dev/null +++ b/bindings/rs-dablooms/dablooms/test/test_dabloom.c @@ -0,0 +1,152 @@ +#include <sys/stat.h> +#include <stdint.h> +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <fcntl.h> +#include <math.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +#include <errno.h> +#include <inttypes.h> +#include "dablooms.h" +#include <time.h> + + +//gcc test_dabloom.c -g -Wall -o test_dabloom -I/usr/local/include -L/usr/local/lib -ldablooms -lrt +void test_dev_zero_ftruncate(){ + const char *filename = "./test_ftruncate.txt"; + const char *filename1 = "/dev/zero"; + int fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600); + int fd1 = open(filename1, O_RDWR, (mode_t)0600); + int new_size = 1024; + if (ftruncate(fd, new_size) < 0) { + printf("fail at ftruncate, errmsg = %s, fd = %d, filename = %s\n", strerror(errno), fd, filename); + } + else{ + printf("succeed at ftruncate, fd = %d, filename = %s\n", fd, filename); + } + if (ftruncate(fd1, new_size) < 0) { + printf("fail at ftruncate, errmsg = %s, fd = %d, filename = %s\n", strerror(errno), fd1, filename1); + } + else{ + printf("succeed at ftruncate, fd = %d, filename = %s\n", fd1, filename1); + } +} + +void test_mmap_and_ftruncate(){ + const char *filename = "./test_ftruncate.txt"; + int fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600); + int new_size = 10; + if (ftruncate(fd, new_size) < 0) + printf("fail at ftruncate, errmsg = %s, fd = %d, filename = %s\n", strerror(errno), fd, filename); + else + printf("secceed ftruncate size = %d\n", new_size); + char *p = (char*)mmap(0, new_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + *p = 'a'; + printf("*p = %c\n", *p); + //ftruncate size = 0, then memory access error + new_size = 0; + if (ftruncate(fd, new_size) < 0) + printf("fail at ftruncate, errmsg = %s, fd = %d, filename = %s\n", strerror(errno), fd, filename); + else + printf("secceed ftruncate size = %d\n", new_size); + printf("*p = %c\n", *p); +} + + +// void test_private_anonymous_mmap(){ +// int new_size = 100; +// char *p = (char*)mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +// *p = 'a'; +// char *p1 = (char*)mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +// *p1 = 'b'; +// printf("p = %c, p1 = %c\n", *p, *p1); +// } + + +void test_scaling_bloom(){ + int capacity = 100; + double error_rate = 0.05; + + scaling_bloom_t *bloom = new_scaling_bloom(capacity, error_rate); + scaling_bloom_add(bloom, "aaa", 3, 1); + printf("bloom add: key = aaa\n"); + + scaling_bloom_t *bloom1 = new_scaling_bloom(capacity, error_rate); + scaling_bloom_add(bloom1, "bbb", 3, 1); + printf("bloom1 add: key = bbb\n"); + + int hit = scaling_bloom_check(bloom, "aaa", 3); + printf("bloom check: key = aaa, hit = %d\n", hit); + + hit = scaling_bloom_check(bloom, "bbb", 3); + printf("bloom check: key = bbb, hit = %d\n", hit); + + hit = scaling_bloom_check(bloom1, "aaa", 3); + printf("bloom1 check: key = aaa, hit = %d\n", hit); + + hit = scaling_bloom_check(bloom1, "bbb", 3); + printf("bloom1 check: key = bbb, hit = %d\n", hit); +} + +time_t get_current_time() { + struct timespec tp; + if (clock_gettime(CLOCK_REALTIME, &tp) == -1) { + perror("clock_gettime"); + return -1; + } + + time_t current_time = tp.tv_sec; + return current_time; +} + +void test_expiry_dablooms(){ + unsigned int capacity = 100; + double error_rate = 0.001; + int expiry_time = 1; + // time_t cur_time = get_current_time(); + + struct expiry_dablooms_handle *handle = expiry_dablooms_init(capacity, error_rate, get_current_time(), expiry_time); + uint64_t count = -1, i = 0; + for(i = 0; i < 7; i++){ + expiry_dablooms_add(handle, "aaa", 3, get_current_time()); + expiry_dablooms_element_count_get(handle, &count); + printf("element_count = %d\n", (int)count); + } + sleep(4); + printf("After 4s: \n"); + for(i = 0; i < 8; i++){ + expiry_dablooms_add(handle, "bbb", 3, get_current_time()); + expiry_dablooms_element_count_get(handle, &count); + printf("element_count = %d\n", (int)count); + } + sleep(3); + printf("After 7s: \n"); + for(i = 0; i < 9; i++){ + expiry_dablooms_add(handle, "ccc", 3, get_current_time()); + expiry_dablooms_element_count_get(handle, &count); + printf("element_count = %d\n", count); + } + expiry_dablooms_destroy(handle); + /* + int hit = expiry_dablooms_search(handle, "aaa", 3); + uint64_t count = -1; + expiry_dablooms_element_count_get(handle, &count); + printf("key = aaa, hit = %d, element_count = %" PRIu64 "\n", hit, count); + sleep(6); + hit = expiry_dablooms_search(handle, "aaa", 3); + count = -1; + expiry_dablooms_element_count_get(handle, &count); + printf("after 6s, key = aaa, hit = %d, element_count = %" PRIu64 "\n", hit, count); + */ +} + + + +int main(){ + //test_scaling_bloom(); + test_expiry_dablooms(); + return 0; +}
\ No newline at end of file diff --git a/bindings/rs-dablooms/src/dablooms.rs b/bindings/rs-dablooms/src/dablooms.rs new file mode 100644 index 0000000..cf74d38 --- /dev/null +++ b/bindings/rs-dablooms/src/dablooms.rs @@ -0,0 +1,321 @@ +use libc::time_t; + +use crate::dablooms_bind::*; +use std::ffi::{CStr, CString}; +use std::os::raw::{c_int, c_uint}; +use std::ptr::NonNull; +use std::time::{SystemTime, UNIX_EPOCH}; + +// String form rust not contain '\0' in the end, that may cause some problem +// + +pub struct CountingBloomFilter { + bloom: NonNull<counting_bloom_t>, +} + +impl CountingBloomFilter { + pub fn new(capacity: u32, error_rate: f64) -> Result<Self, &'static str> { + let bloom_ptr = unsafe { new_counting_bloom(capacity, error_rate) }; + let bloom = + NonNull::new(bloom_ptr).ok_or("Failed to create CountingBloomFilter,return null")?; + Ok(CountingBloomFilter { bloom }) + } + + // get raw pointer + fn get_raw(&self) -> *mut counting_bloom_t { + self.bloom.as_ptr() + } + + pub fn add<T>(&mut self, key: T) -> Result<(), ()> + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + let res = + unsafe { counting_bloom_add(self.get_raw(), key.as_ptr() as *const i8, key.len()) }; + if res == 0 { + Ok(()) + } else { + Err(()) + } + } + + pub fn remove<T>(&mut self, key: T) -> Result<(), ()> + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + let res = + unsafe { counting_bloom_remove(self.get_raw(), key.as_ptr() as *const i8, key.len()) }; + if res == 0 { + Ok(()) + } else { + Err(()) + } + } + + pub fn check<T>(&self, key: T) -> bool + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + unsafe { counting_bloom_check(self.get_raw(), key.as_ptr() as *const i8, key.len()) > 0 } + } +} + +impl Drop for CountingBloomFilter { + fn drop(&mut self) { + unsafe { free_counting_bloom(self.get_raw()) }; + } +} + +pub struct ScalingBloomFilter { + bloom: NonNull<scaling_bloom_t>, +} + +impl ScalingBloomFilter { + pub fn new(capacity: u32, error_rate: f64) -> Result<Self, &'static str> { + let bloom_ptr = unsafe { new_scaling_bloom(capacity as c_uint, error_rate) }; + let bloom = + NonNull::new(bloom_ptr).ok_or("Failed to create ScalingBloomFilter,return null")?; + Ok(ScalingBloomFilter { bloom }) + } + + // get raw pointer + fn get_raw(&self) -> *mut scaling_bloom_t { + self.bloom.as_ptr() + } + + pub fn add<T>(&mut self, key: T, id: u64) -> Result<(), ()> + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + let res = + unsafe { scaling_bloom_add(self.get_raw(), key.as_ptr() as *const i8, key.len(), id) }; + // scaling_bloom_add returns 1 if the key is ok... + // no idea why it's not 0... + if res == 1 { + Ok(()) + } else { + Err(()) + } + } + + pub fn remove<T>(&mut self, key: T, id: u64) -> Result<(), ()> + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + let res = unsafe { + scaling_bloom_remove(self.get_raw(), key.as_ptr() as *const i8, key.len(), id) + }; + // scaling_bloom_remove returns 1 if remove success... + // no idea why it's not 0... + if res == 1 { + Ok(()) + } else { + Err(()) + } + } + + pub fn check<T>(&self, key: T) -> bool + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + unsafe { scaling_bloom_check(self.get_raw(), key.as_ptr() as *const i8, key.len()) > 0 } + } + + pub fn flush(&mut self) -> Result<(), ()> { + let res = unsafe { scaling_bloom_flush(self.get_raw()) }; + if res == 0 { + Ok(()) + } else { + Err(()) + } + } + + pub fn mem_seqnum(&self) -> u64 { + unsafe { scaling_bloom_mem_seqnum(self.get_raw()) } + } + + pub fn disk_seqnum(&self) -> u64 { + unsafe { scaling_bloom_disk_seqnum(self.get_raw()) } + } +} + +impl Drop for ScalingBloomFilter { + fn drop(&mut self) { + unsafe { free_scaling_bloom(self.get_raw()) }; + } +} + +fn get_current_time() -> Result<time_t, c_int> { + let current_time = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|_| -1)? + .as_secs() as time_t; + Ok(current_time) +} + +pub struct ExpiryBloomFilter { + bloom: NonNull<expiry_dablooms_handle>, +} + +impl ExpiryBloomFilter { + pub fn new(capacity: u32, error_rate: f64, expiry_time: i32) -> Result<Self, &'static str> { + let cur_time = get_current_time(); + let bloom_ptr = unsafe { + expiry_dablooms_init( + capacity as c_uint, + error_rate, + cur_time.unwrap(), + expiry_time as c_int, + ) + }; + let bloom = + NonNull::new(bloom_ptr).ok_or("Failed to create ExpiryBloomFilter,return null")?; + Ok(ExpiryBloomFilter { bloom }) + } + + // get raw pointer + fn get_raw(&self) -> *mut expiry_dablooms_handle { + self.bloom.as_ptr() + } + + pub fn add<T>(&mut self, key: T) -> Result<(), String> + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + let cur_time = get_current_time(); + let res = unsafe { + expiry_dablooms_add( + self.get_raw(), + key.as_ptr() as *const i8, + key.len(), + cur_time.unwrap(), + ) + }; + if res == 0 { + Ok(()) + } else { + let err_cstr = unsafe { expiry_dablooms_errno_trans(res) }; + let err_msg = unsafe { CStr::from_ptr(err_cstr).to_string_lossy().into_owned() }; + Err(err_msg) + } + } + + pub fn check<T>(&self, key: T) -> bool + where + T: AsRef<[u8]>, + { + let key = key.as_ref(); + let cur_time = get_current_time(); + unsafe { + expiry_dablooms_search( + self.get_raw(), + key.as_ptr() as *const i8, + key.len(), + cur_time.unwrap(), + ) > 0 + } + } + + pub fn count(&self) -> i32 { + let mut count: u64 = 0; + unsafe { + expiry_dablooms_element_count_get(self.get_raw(), &mut count); + } + return count as i32; + } +} + +impl Drop for ExpiryBloomFilter { + fn drop(&mut self) { + unsafe { expiry_dablooms_destroy(self.get_raw()) }; + } +} + +#[cfg(test)] +mod tests { + use libc::sleep; + + use crate::dablooms::*; + + #[test] + fn test_counting_bloom_filter() { + let mut bloom = CountingBloomFilter::new(1000, 0.01).unwrap(); + let key1 = "hello"; + let key2 = "world"; + let key3 = "rust"; + + assert!(bloom.check(key1) == false); + assert!(bloom.check(key2) == false); + assert!(bloom.check(key3) == false); + bloom.add(key1).unwrap(); + bloom.add(key2).unwrap(); + + drop(key1); + let key = "hello"; + + assert!(bloom.check(key) == true); + assert!(bloom.check(key2) == true); + assert!(bloom.check(key3) == false); + bloom.remove(key).unwrap(); + assert!(bloom.check(key1) == false); + assert!(bloom.check(key2) == true); + assert!(bloom.check(key3) == false); + } + + #[test] + fn test_scaling_bloom_filter() { + let mut bloom = ScalingBloomFilter::new(100, 0.05).unwrap(); + let key1 = "aaa"; + let key2 = "bbb"; + let id1 = 1; + let id2 = 2; + assert!(bloom.check(key1) == false); + assert!(bloom.check(key2) == false); + + bloom.add(key1, id1).unwrap(); + bloom.add(key2, id2).unwrap(); + assert!(bloom.check(key1) == true); + assert!(bloom.check(key2) == true); + bloom.remove(key1, id1).unwrap(); + assert!(bloom.check(key1) == false); + assert!(bloom.check(key2) == true); + } + + #[test] + fn test_expiry_dablooms_filter() { + let capacity: u32 = 100; + let error_rate: f64 = 0.05; + let expiry_secs: i32 = 3; // expiry time 1s + let key1 = "aaa"; + let key2 = "bbb"; + + let mut bloom = ExpiryBloomFilter::new(capacity, error_rate, expiry_secs).unwrap(); + + for _i in 1..7 { + bloom.add(key1).unwrap(); + assert!(bloom.count() == _i); + } + assert!(bloom.check(key1)); + + unsafe { + sleep(2); // sleep 2 sec | all key1's value not expired + } + assert!(bloom.check(key1)); + unsafe { + sleep(2); // sleep 4 sec | all key1's value expired + } + assert!(!bloom.check(key1)); + + for _i in 1..7 { + bloom.add(key2).unwrap(); + assert!(bloom.count() == _i); + } + } +} diff --git a/bindings/rs-dablooms/src/dablooms_bind.rs b/bindings/rs-dablooms/src/dablooms_bind.rs new file mode 100644 index 0000000..3600880 --- /dev/null +++ b/bindings/rs-dablooms/src/dablooms_bind.rs @@ -0,0 +1,156 @@ +#![allow(non_upper_case_globals)] // allow snake case just in this file +use libc::{c_char, c_int, c_long, c_uint, time_t}; + +extern "C" { + pub fn dablooms_version() -> *const c_char; +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct bitmap_t { + pub bytes: usize, + pub array: *mut c_char, +} + +//#[link(name = "dablooms")] +extern "C" { + pub fn bitmap_resize(bitmap: *mut bitmap_t, old_size: usize, new_size: usize) -> *mut bitmap_t; + pub fn new_bitmap(bytes: usize) -> *mut bitmap_t; + // + pub fn bitmap_increment(bitmap: *mut bitmap_t, index: c_uint, offset: c_long) -> c_int; + pub fn bitmap_decrement(bitmap: *mut bitmap_t, index: c_uint, offset: c_long) -> c_int; + pub fn bitmap_check(bitmap: *mut bitmap_t, index: c_uint, offset: c_long) -> c_int; + pub fn bitmap_flush(bitmap: *mut bitmap_t) -> c_int; + pub fn free_bitmap(bitmap: *mut bitmap_t); +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct counting_bloom_header_t { + pub id: u64, + pub count: u32, + pub _pad: u32, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct counting_bloom_t { + pub header: *mut counting_bloom_header_t, + pub capacity: c_uint, + pub offset: c_long, + pub counts_per_func: c_uint, + pub hashes: *mut u32, + pub nfuncs: usize, + pub size: usize, + pub num_bytes: usize, + pub error_rate: f64, + pub bitmap: *mut bitmap_t, +} + +//#[link(name = "dablooms")] +extern "C" { + pub fn free_counting_bloom(bloom: *mut counting_bloom_t) -> c_int; + pub fn new_counting_bloom(capacity: c_uint, error_rate: f64) -> *mut counting_bloom_t; + // + pub fn counting_bloom_add(bloom: *mut counting_bloom_t, s: *const c_char, len: usize) -> c_int; + pub fn counting_bloom_remove( + bloom: *mut counting_bloom_t, + s: *const c_char, + len: usize, + ) -> c_int; + pub fn counting_bloom_check( + bloom: *mut counting_bloom_t, + s: *const c_char, + len: usize, + ) -> c_int; +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct scaling_bloom_header_t { + pub max_id: u64, + pub mem_seqnum: u64, + pub disk_seqnum: u64, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct scaling_bloom_t { + pub header: *mut scaling_bloom_header_t, + pub capacity: c_uint, + pub num_blooms: c_uint, + pub num_bytes: usize, + pub error_rate: f64, + pub blooms: *mut *mut counting_bloom_t, + pub bitmap: *mut bitmap_t, +} + +//#[link(name = "dablooms")] +extern "C" { + pub fn new_scaling_bloom(capacity: c_uint, error_rate: f64) -> *mut scaling_bloom_t; + pub fn free_scaling_bloom(bloom: *mut scaling_bloom_t) -> c_int; + + pub fn scaling_bloom_add( + bloom: *mut scaling_bloom_t, + s: *const c_char, + len: usize, + id: u64, + ) -> c_int; + pub fn scaling_bloom_remove( + bloom: *mut scaling_bloom_t, + s: *const c_char, + len: usize, + id: u64, + ) -> c_int; + pub fn scaling_bloom_check(bloom: *mut scaling_bloom_t, s: *const c_char, len: usize) -> c_int; + pub fn scaling_bloom_flush(bloom: *mut scaling_bloom_t) -> c_int; + pub fn scaling_bloom_mem_seqnum(bloom: *mut scaling_bloom_t) -> u64; + pub fn scaling_bloom_disk_seqnum(bloom: *mut scaling_bloom_t) -> u64; +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct expiry_dablooms_handle { + _unused: [u8; 0], +} + +// 存疑 +// pub const expiry_dablooms_errno_EXPIRY_DABLOOMS_ERRNO_BLOOM_NULL: expiry_dablooms_errno = -1; +// pub const expiry_dablooms_errno_EXPIRY_DABLOOMS_ERRNO_NEW_BLOOM_FAIL: expiry_dablooms_errno = -2; +// pub type expiry_dablooms_errno = c_int; + +// #[repr(C)] +// #[derive(Debug, Copy, Clone)] +// pub enum expiry_dablooms_errno { +// EXPIRY_DABLOOMS_ERRNO_BLOOM_NULL = -1, +// EXPIRY_DABLOOMS_ERRNO_NEW_BLOOM_FAIL = -2, +// } + +//#[link(name = "dablooms")] +extern "C" { + // _errno 是 expiry_dablooms_errno 的枚举类型, c 中和 c_int 通用; + pub fn expiry_dablooms_errno_trans(_errno: c_int) -> *mut c_char; + pub fn expiry_dablooms_destroy(handle: *mut expiry_dablooms_handle); + pub fn expiry_dablooms_init( + capacity: c_uint, + error_rate: f64, + cur_time: time_t, + expiry_time: c_int, + ) -> *mut expiry_dablooms_handle; + pub fn expiry_dablooms_element_count_get( + handle: *mut expiry_dablooms_handle, + count: *mut u64, + ) -> c_int; + pub fn expiry_dablooms_add( + handle: *mut expiry_dablooms_handle, + key: *const c_char, + len: usize, + cur_time: time_t, + ) -> c_int; + pub fn expiry_dablooms_search( + handle: *mut expiry_dablooms_handle, + key: *const c_char, + len: usize, + cur_time: time_t, + ) -> c_int; +} diff --git a/bindings/rs-dablooms/src/dablooms_bind_test.rs b/bindings/rs-dablooms/src/dablooms_bind_test.rs new file mode 100644 index 0000000..4fc31eb --- /dev/null +++ b/bindings/rs-dablooms/src/dablooms_bind_test.rs @@ -0,0 +1,458 @@ +use std::{ + mem::{align_of, size_of, MaybeUninit}, + ptr::addr_of, +}; + +use libc::{c_char, c_int, c_long, c_uint, time_t}; + +use crate::dablooms_bind::{ + bitmap_t, counting_bloom_header_t, counting_bloom_t, scaling_bloom_header_t, scaling_bloom_t, +}; + +#[test] +fn bindgen_test_layout_bitmap_t() { + const UNINIT: MaybeUninit<bitmap_t> = MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + size_of::<bitmap_t>(), + 16usize, + concat!("Size of: ", stringify!(bitmap_t)) + ); + assert_eq!( + align_of::<bitmap_t>(), + 8usize, + concat!("Alignment of ", stringify!(bitmap_t)) + ); + assert_eq!( + unsafe { addr_of!((*ptr).bytes) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(bitmap_t), + "::", + stringify!(bytes) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).array) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(bitmap_t), + "::", + stringify!(array) + ) + ); +} + +#[test] +fn bindgen_test_layout_counting_bloom_header_t() { + const UNINIT: MaybeUninit<counting_bloom_header_t> = MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + size_of::<counting_bloom_header_t>(), + 16usize, + concat!("Size of: ", stringify!(counting_bloom_header_t)) + ); + assert_eq!( + align_of::<counting_bloom_header_t>(), + 8usize, + concat!("Alignment of ", stringify!(counting_bloom_header_t)) + ); + assert_eq!( + unsafe { addr_of!((*ptr).id) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_header_t), + "::", + stringify!(id) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).count) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_header_t), + "::", + stringify!(count) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr)._pad) as usize - ptr as usize }, + 12usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_header_t), + "::", + stringify!(_pad) + ) + ); +} + +#[test] +fn bindgen_test_layout_counting_bloom_t() { + const UNINIT: MaybeUninit<counting_bloom_t> = MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + size_of::<counting_bloom_t>(), + 80usize, + concat!("Size of: ", stringify!(counting_bloom_t)) + ); + assert_eq!( + align_of::<counting_bloom_t>(), + 8usize, + concat!("Alignment of ", stringify!(counting_bloom_t)) + ); + assert_eq!( + unsafe { addr_of!((*ptr).header) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(header) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).capacity) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(capacity) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).offset) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(offset) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).counts_per_func) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(counts_per_func) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).hashes) as usize - ptr as usize }, + 32usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(hashes) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).nfuncs) as usize - ptr as usize }, + 40usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(nfuncs) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).size) as usize - ptr as usize }, + 48usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).num_bytes) as usize - ptr as usize }, + 56usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(num_bytes) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).error_rate) as usize - ptr as usize }, + 64usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(error_rate) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).bitmap) as usize - ptr as usize }, + 72usize, + concat!( + "Offset of field: ", + stringify!(counting_bloom_t), + "::", + stringify!(bitmap) + ) + ); +} + +#[test] +fn bindgen_test_layout_scaling_bloom_header_t() { + const UNINIT: MaybeUninit<scaling_bloom_header_t> = MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + size_of::<scaling_bloom_header_t>(), + 24usize, + concat!("Size of: ", stringify!(scaling_bloom_header_t)) + ); + assert_eq!( + align_of::<scaling_bloom_header_t>(), + 8usize, + concat!("Alignment of ", stringify!(scaling_bloom_header_t)) + ); + assert_eq!( + unsafe { addr_of!((*ptr).max_id) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_header_t), + "::", + stringify!(max_id) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).mem_seqnum) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_header_t), + "::", + stringify!(mem_seqnum) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).disk_seqnum) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_header_t), + "::", + stringify!(disk_seqnum) + ) + ); +} + +#[test] +fn bindgen_test_layout_scaling_bloom_t() { + const UNINIT: MaybeUninit<scaling_bloom_t> = MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + size_of::<scaling_bloom_t>(), + 48usize, + concat!("Size of: ", stringify!(scaling_bloom_t)) + ); + assert_eq!( + align_of::<scaling_bloom_t>(), + 8usize, + concat!("Alignment of ", stringify!(scaling_bloom_t)) + ); + assert_eq!( + unsafe { addr_of!((*ptr).header) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(header) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).capacity) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(capacity) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).num_blooms) as usize - ptr as usize }, + 12usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(num_blooms) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).num_bytes) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(num_bytes) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).error_rate) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(error_rate) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).blooms) as usize - ptr as usize }, + 32usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(blooms) + ) + ); + assert_eq!( + unsafe { addr_of!((*ptr).bitmap) as usize - ptr as usize }, + 40usize, + concat!( + "Offset of field: ", + stringify!(scaling_bloom_t), + "::", + stringify!(bitmap) + ) + ); +} + +// test_dabloom.c +#[cfg(test)] +mod tests { + use core::*; + use libc::{c_uint, sleep, time_t}; + use std::ffi::CString; + use std::os::raw::{c_int, c_long}; + use std::time::{SystemTime, UNIX_EPOCH}; + + use crate::dablooms_bind::{ + expiry_dablooms_add, expiry_dablooms_destroy, expiry_dablooms_element_count_get, + expiry_dablooms_init, new_scaling_bloom, scaling_bloom_add, scaling_bloom_check, + }; + + #[test] + fn test_scaling_bloom() { + let capacity = 100; + let error_rate = 0.05; + + unsafe { + let bloom = new_scaling_bloom(capacity as c_uint, error_rate); + let a = scaling_bloom_add(bloom, CString::new("aaa").unwrap().as_ptr(), 3, 1); + + println!("bloom add: key = aaa\n"); + + let bloom1 = new_scaling_bloom(capacity as c_uint, error_rate); + scaling_bloom_add(bloom1, CString::new("bbb").unwrap().as_ptr(), 3, 1); + println!("bloom1 add: key = bbb\n"); + + let hit = scaling_bloom_check(bloom, CString::new("aaa").unwrap().as_ptr(), 3); + println!("bloom check: key = aaa, hit = {}\n", hit); + + let hit = scaling_bloom_check(bloom, CString::new("bbb").unwrap().as_ptr(), 3); + println!("bloom check: key = bbb, hit = {}\n", hit); + + let hit = scaling_bloom_check(bloom1, CString::new("aaa").unwrap().as_ptr(), 3); + println!("bloom1 check: key = aaa, hit = {}\n", hit); + + let hit = scaling_bloom_check(bloom1, CString::new("bbb").unwrap().as_ptr(), 3); + println!("bloom1 check: key = bbb, hit = {}\n", hit); + } + } + + fn get_current_time() -> Result<time_t, c_int> { + let current_time = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|_| -1)? + .as_secs() as time_t; + Ok(current_time) + } + + #[test] + fn test_expiry_dablooms() { + let capacity: usize = 100; + let error_rate: f64 = 0.05; + let expiry_secs: i32 = 1; + + let handle = unsafe { + expiry_dablooms_init( + capacity as c_uint, + error_rate, + get_current_time().unwrap(), + expiry_secs as c_int, + ) + }; + + let mut count: u64 = 0; + for i in 0..7 { + unsafe { + expiry_dablooms_add( + handle, + CString::new("aaa").unwrap().as_ptr(), + 3, + get_current_time().unwrap(), + ); + expiry_dablooms_element_count_get(handle, &mut count); + } + println!("element_count = {}", count); + } + unsafe { + sleep(4); + } + println!("After 4s: \n"); + + for i in 0..8 { + unsafe { + expiry_dablooms_add( + handle, + CString::new("bbb").unwrap().as_ptr(), + 3, + get_current_time().unwrap(), + ); + expiry_dablooms_element_count_get(handle, &mut count); + }; + println!("element_count = {}", count); + } + unsafe { + sleep(7); + } + println!("After 7s: \n"); + + for i in 0..9 { + unsafe { + expiry_dablooms_add( + handle, + CString::new("ccc").unwrap().as_ptr(), + 3, + get_current_time().unwrap(), + ); + expiry_dablooms_element_count_get(handle, &mut count); + }; + println!("element_count = {}", count); + } + + unsafe { expiry_dablooms_destroy(handle) }; + } +} diff --git a/bindings/rs-dablooms/src/lib.rs b/bindings/rs-dablooms/src/lib.rs new file mode 100644 index 0000000..c201a99 --- /dev/null +++ b/bindings/rs-dablooms/src/lib.rs @@ -0,0 +1,4 @@ +mod dablooms_bind; +mod dablooms_bind_test; + +pub mod dablooms; |
