diff options
| author | liuwentan <[email protected]> | 2023-12-20 06:16:23 +0000 |
|---|---|---|
| committer | liuwentan <[email protected]> | 2023-12-20 06:16:23 +0000 |
| commit | 580a594806928096fc220bda0033ed8c209f97b6 (patch) | |
| tree | 7a32185de1d7901b644ba898cb179a5bcc8cb7e4 /scanner/expr_matcher | |
| parent | e65239abe7c579f56c31391224380fe8ea82196a (diff) | |
[PATCH] Add bloom filter to optimize expr_matcher performance
Diffstat (limited to 'scanner/expr_matcher')
| -rw-r--r-- | scanner/expr_matcher/adapter_hs/adapter_hs.cpp | 58 | ||||
| -rw-r--r-- | scanner/expr_matcher/adapter_rs/adapter_rs.cpp | 64 | ||||
| -rw-r--r-- | scanner/expr_matcher/expr_matcher.cpp | 3 | ||||
| -rw-r--r-- | scanner/expr_matcher/expr_matcher_inc.h | 2 |
4 files changed, 121 insertions, 6 deletions
diff --git a/scanner/expr_matcher/adapter_hs/adapter_hs.cpp b/scanner/expr_matcher/adapter_hs/adapter_hs.cpp index 0140b7a..d978df4 100644 --- a/scanner/expr_matcher/adapter_hs/adapter_hs.cpp +++ b/scanner/expr_matcher/adapter_hs/adapter_hs.cpp @@ -18,6 +18,7 @@ #include "adapter_hs.h" #include "uthash/uthash.h" +#include "bloom/bloom.h" #include "maat_utils.h" #include "../../bool_matcher/bool_matcher.h" @@ -66,6 +67,7 @@ struct hs_lit_engine { size_t n_thread; hs_database_t *hs_db; hs_scratch_t **hs_scratches; + struct bloom **blooms; struct hs_lit_stream **streams; struct pattern_attribute *ref_pat_attr; struct log_handle *logger; @@ -76,6 +78,7 @@ struct hs_regex_engine { size_t n_thread; hs_database_t *hs_db; hs_scratch_t **hs_scratches; + struct bloom **blooms; struct hs_regex_stream **streams; struct pattern_attribute *ref_pat_attr; struct log_handle *logger; @@ -163,6 +166,16 @@ void hs_lit_engine_free(void *hs_lit_engine) hs_lit_inst->hs_db = NULL; } + if (hs_lit_inst->blooms != NULL) { + for (i = 0; i < hs_lit_inst->n_thread; i++) { + if (hs_lit_inst->blooms[i] != NULL) { + bloom_free(hs_lit_inst->blooms[i]); + FREE(hs_lit_inst->blooms[i]); + } + } + FREE(hs_lit_inst->blooms); + } + if (hs_lit_inst->hs_scratches != NULL) { for (i = 0; i < hs_lit_inst->n_thread; i++) { if (hs_lit_inst->hs_scratches[i] != NULL) { @@ -197,6 +210,12 @@ void *hs_lit_engine_new(struct expr_rule *rules, size_t n_rule, hs_lit_inst->hs_db = (hs_database_t *)hs_lit_db; hs_lit_inst->logger = logger; hs_lit_inst->ref_pat_attr = pat_attr; + hs_lit_inst->blooms = ALLOC(struct bloom *, n_thread); + for (size_t i = 0; i < n_thread; i++) { + hs_lit_inst->blooms[i] = ALLOC(struct bloom, 1); + bloom_init2(hs_lit_inst->blooms[i], 1024, 0.001); + } + hs_lit_inst->hs_scratches = ALLOC(hs_scratch_t *, n_thread); int ret = hs_alloc_scratches((hs_database_t *)hs_lit_db, hs_lit_inst->hs_scratches, n_thread, logger); @@ -228,6 +247,23 @@ static int matched_event_cb(unsigned int id, unsigned long long from, unsigned long long pattern_id = id; struct matched_pattern *matched_pat = (struct matched_pattern *)ctx; + unsigned long long *tmp_pat_id = NULL; + if (utarray_len(matched_pat->pattern_ids) < (MAX_HIT_PATTERN_NUM / 10)) { + for (size_t i = 0; i < utarray_len(matched_pat->pattern_ids); i++) { + tmp_pat_id = (unsigned long long *)utarray_eltptr(matched_pat->pattern_ids, i); + if (*tmp_pat_id == pattern_id) { + return 0; + } + } + } else { + if (bloom_check(matched_pat->ref_bloom, (char *)&pattern_id, + sizeof(unsigned long long)) == 1) { + return 0; + } + bloom_add(matched_pat->ref_bloom, (char *)&pattern_id, + sizeof(unsigned long long)); + } + if (utarray_len(matched_pat->pattern_ids) >= MAX_HIT_PATTERN_NUM) { return 0; } @@ -302,6 +338,7 @@ void *hs_lit_stream_open(void *hs_lit_engine, int thread_id) lit_stream->thread_id = thread_id; lit_stream->ref_hs_rt = hs_lit_inst; lit_stream->matched_pat = ALLOC(struct matched_pattern, 1); + lit_stream->matched_pat->ref_bloom = hs_lit_inst->blooms[thread_id]; lit_stream->matched_pat->ref_pat_attr = hs_lit_inst->ref_pat_attr; utarray_new(lit_stream->matched_pat->pattern_ids, &ut_hs_pattern_id_icd); utarray_reserve(lit_stream->matched_pat->pattern_ids, MAX_HIT_PATTERN_NUM); @@ -343,6 +380,7 @@ void hs_lit_stream_close(void *hs_lit_stream) /* stream->hs_rt point to hs_instance->hs_rt which will call free same as hs_attr */ stream->ref_hs_rt = NULL; + stream->matched_pat->ref_bloom = NULL; stream->matched_pat->ref_pat_attr = NULL; if (stream->matched_pat->pattern_ids != NULL) { @@ -399,6 +437,7 @@ static int gather_hit_pattern_id(struct matched_pattern *matched_pat, *n_pattern_id = array_index; utarray_clear(matched_pat->pattern_ids); + bloom_reset(matched_pat->ref_bloom); return 0; } @@ -479,6 +518,16 @@ void hs_regex_engine_free(void *hs_regex_engine) hs_regex_inst->hs_db = NULL; } + if (hs_regex_inst->blooms != NULL) { + for (i = 0; i < hs_regex_inst->n_thread; i++) { + if (hs_regex_inst->blooms[i] != NULL) { + bloom_free(hs_regex_inst->blooms[i]); + FREE(hs_regex_inst->blooms[i]); + } + } + FREE(hs_regex_inst->blooms); + } + if (hs_regex_inst->hs_scratches != NULL) { for (i = 0; i < hs_regex_inst->n_thread; i++) { if (hs_regex_inst->hs_scratches[i] != NULL) { @@ -513,8 +562,13 @@ void *hs_regex_engine_new(struct expr_rule *rules, size_t n_rule, hs_regex_inst->hs_db = (hs_database_t *)hs_regex_db; hs_regex_inst->ref_pat_attr = pat_attr; hs_regex_inst->logger = logger; - hs_regex_inst->hs_scratches = ALLOC(hs_scratch_t *, n_thread); + hs_regex_inst->blooms = ALLOC(struct bloom *, n_thread); + for (size_t i = 0; i < n_thread; i++) { + hs_regex_inst->blooms[i] = ALLOC(struct bloom, 1); + bloom_init2(hs_regex_inst->blooms[i], 1024, 0.001); + } + hs_regex_inst->hs_scratches = ALLOC(hs_scratch_t *, n_thread); int ret = hs_alloc_scratches((hs_database_t *)hs_regex_db, hs_regex_inst->hs_scratches, n_thread, logger); @@ -570,6 +624,7 @@ void hs_regex_stream_close(void *hs_regex_stream) /* stream->hs_rt point to hs_instance->hs_rt which will call free same as hs_attr */ stream->ref_hs_rt = NULL; + stream->matched_pat->ref_bloom = NULL; stream->matched_pat->ref_pat_attr = NULL; if (stream->matched_pat->pattern_ids != NULL) { @@ -595,6 +650,7 @@ void *hs_regex_stream_open(void *hs_regex_engine, int thread_id) regex_stream->thread_id = thread_id; regex_stream->ref_hs_rt = hs_regex_inst; regex_stream->matched_pat = ALLOC(struct matched_pattern, 1); + regex_stream->matched_pat->ref_bloom = hs_regex_inst->blooms[thread_id]; regex_stream->matched_pat->ref_pat_attr = hs_regex_inst->ref_pat_attr; utarray_new(regex_stream->matched_pat->pattern_ids, &ut_hs_pattern_id_icd); utarray_reserve(regex_stream->matched_pat->pattern_ids, MAX_HIT_PATTERN_NUM); diff --git a/scanner/expr_matcher/adapter_rs/adapter_rs.cpp b/scanner/expr_matcher/adapter_rs/adapter_rs.cpp index dbc6880..795c3e9 100644 --- a/scanner/expr_matcher/adapter_rs/adapter_rs.cpp +++ b/scanner/expr_matcher/adapter_rs/adapter_rs.cpp @@ -63,7 +63,8 @@ struct rs_regex_stream { struct rs_lit_engine { size_t n_thread; rs_database_t *rs_db; - struct rs_lit_stream **streams; /* per thread */ + struct bloom **blooms; + struct rs_lit_stream **streams; /* per thread */ struct pattern_attribute *ref_pat_attr; struct log_handle *logger; }; @@ -72,7 +73,8 @@ struct rs_lit_engine { struct rs_regex_engine { size_t n_thread; rs_database_t *rs_db; - struct rs_regex_stream **streams; /* per thread */ + struct bloom **blooms; + struct rs_regex_stream **streams; /* per thread */ struct pattern_attribute *ref_pat_attr; struct log_handle *logger; }; @@ -187,6 +189,23 @@ static int matched_event_cb(unsigned int id, int pos_offset, int from, int to, unsigned long long pattern_id = id; struct matched_pattern *matched_pat = (struct matched_pattern *)ctx; + unsigned long long *tmp_pat_id = NULL; + if (utarray_len(matched_pat->pattern_ids) < (MAX_HIT_PATTERN_NUM / 10)) { + for (size_t i = 0; i < utarray_len(matched_pat->pattern_ids); i++) { + tmp_pat_id = (unsigned long long *)utarray_eltptr(matched_pat->pattern_ids, i); + if (*tmp_pat_id == pattern_id) { + return 0; + } + } + } else { + if (bloom_check(matched_pat->ref_bloom, (char *)&pattern_id, + sizeof(unsigned long long)) == 1) { + return 0; + } + bloom_add(matched_pat->ref_bloom, (char *)&pattern_id, + sizeof(unsigned long long)); + } + if (utarray_len(matched_pat->pattern_ids) >= MAX_HIT_PATTERN_NUM) { return 0; } @@ -264,6 +283,7 @@ static int gather_hit_pattern_id(struct matched_pattern *matched_pat, *n_pattern_id = array_index; utarray_clear(matched_pat->pattern_ids); + bloom_reset(matched_pat->ref_bloom); return 0; } @@ -281,6 +301,16 @@ void rs_lit_engine_free(void *rs_lit_engine) rs_lit_inst->rs_db = NULL; } + if (rs_lit_inst->blooms != NULL) { + for (size_t i = 0; i < rs_lit_inst->n_thread; i++) { + if (rs_lit_inst->blooms[i] != NULL) { + bloom_free(rs_lit_inst->blooms[i]); + FREE(rs_lit_inst->blooms[i]); + } + } + FREE(rs_lit_inst->blooms); + } + if (rs_lit_inst->streams != NULL) { for (size_t i = 0; i < rs_lit_inst->n_thread; i++) { if (rs_lit_inst->streams[i] != NULL) { @@ -306,8 +336,14 @@ void *rs_lit_engine_new(struct expr_rule *rules, size_t n_rule, rs_lit_inst->rs_db = (rs_database_t *)rs_lit_db; rs_lit_inst->ref_pat_attr = pat_attr; rs_lit_inst->logger = logger; - rs_lit_inst->streams = ALLOC(struct rs_lit_stream *, n_thread); + rs_lit_inst->blooms = ALLOC(struct bloom *, n_thread); + for (size_t i = 0; i < n_thread; i++) { + rs_lit_inst->blooms[i] = ALLOC(struct bloom, 1); + bloom_init2(rs_lit_inst->blooms[i], 1024, 0.001); + } + + rs_lit_inst->streams = ALLOC(struct rs_lit_stream *, n_thread); for (size_t i = 0; i < n_thread; i++) { rs_lit_inst->streams[i] = (struct rs_lit_stream *)rs_lit_stream_open(rs_lit_inst, i); } @@ -354,6 +390,7 @@ void *rs_lit_stream_open(void *rs_lit_engine, int thread_id) lit_stream->thread_id = thread_id; lit_stream->ref_rs_rt = rs_lit_inst; lit_stream->matched_pat = ALLOC(struct matched_pattern, 1); + lit_stream->matched_pat->ref_bloom = rs_lit_inst->blooms[thread_id]; lit_stream->matched_pat->ref_pat_attr = rs_lit_inst->ref_pat_attr; utarray_new(lit_stream->matched_pat->pattern_ids, &ut_rs_pattern_id_icd); utarray_reserve(lit_stream->matched_pat->pattern_ids, MAX_HIT_PATTERN_NUM); @@ -387,6 +424,7 @@ void rs_lit_stream_close(void *rs_lit_stream) /* rs_stream->rs_rt point to rs_instance->rs_rt which will call free same as rs_attr */ lit_stream->ref_rs_rt = NULL; + lit_stream->matched_pat->ref_bloom = NULL; lit_stream->matched_pat->ref_pat_attr = NULL; if (lit_stream->matched_pat->pattern_ids != NULL) { @@ -434,6 +472,16 @@ void rs_regex_engine_free(void *rs_regex_engine) rs_regex_inst->rs_db = NULL; } + if (rs_regex_inst->blooms != NULL) { + for (size_t i = 0; i < rs_regex_inst->n_thread; i++) { + if (rs_regex_inst->blooms[i] != NULL) { + bloom_free(rs_regex_inst->blooms[i]); + FREE(rs_regex_inst->blooms[i]); + } + } + FREE(rs_regex_inst->blooms); + } + if (rs_regex_inst->streams != NULL) { for (size_t i = 0; i < rs_regex_inst->n_thread; i++) { if (rs_regex_inst->streams[i] != NULL) { @@ -459,8 +507,14 @@ void *rs_regex_engine_new(struct expr_rule *rules, size_t n_rule, rs_regex_inst->rs_db = (rs_database_t *)rs_regex_db; rs_regex_inst->ref_pat_attr = pat_attr; rs_regex_inst->logger = logger; + + rs_regex_inst->blooms = ALLOC(struct bloom *, n_thread); + for (size_t i = 0; i < n_thread; i++) { + rs_regex_inst->blooms[i] = ALLOC(struct bloom, 1); + bloom_init2(rs_regex_inst->blooms[i], 1024, 0.001); + } + rs_regex_inst->streams = ALLOC(struct rs_regex_stream *, n_thread); - for (size_t i = 0; i < n_thread; i++) { rs_regex_inst->streams[i] = (struct rs_regex_stream *)rs_regex_stream_open(rs_regex_inst, i); } @@ -507,6 +561,7 @@ void *rs_regex_stream_open(void *rs_regex_engine, int thread_id) regex_stream->thread_id = thread_id; regex_stream->ref_rs_rt = rs_regex_inst; regex_stream->matched_pat = ALLOC(struct matched_pattern, 1); + regex_stream->matched_pat->ref_bloom = rs_regex_inst->blooms[thread_id]; regex_stream->matched_pat->ref_pat_attr = rs_regex_inst->ref_pat_attr; utarray_new(regex_stream->matched_pat->pattern_ids, &ut_rs_pattern_id_icd); utarray_reserve(regex_stream->matched_pat->pattern_ids, MAX_HIT_PATTERN_NUM); @@ -540,6 +595,7 @@ void rs_regex_stream_close(void *rs_regex_stream) /* rs_stream->rs_rt point to rs_instance->rs_rt which will call free same as rs_attr */ regex_stream->ref_rs_rt = NULL; + regex_stream->matched_pat->ref_bloom = NULL; regex_stream->matched_pat->ref_pat_attr = NULL; if (regex_stream->matched_pat->pattern_ids != NULL) { diff --git a/scanner/expr_matcher/expr_matcher.cpp b/scanner/expr_matcher/expr_matcher.cpp index 16ec4ee..3a8f9e9 100644 --- a/scanner/expr_matcher/expr_matcher.cpp +++ b/scanner/expr_matcher/expr_matcher.cpp @@ -13,6 +13,7 @@ #include <sys/syscall.h> #include "log/log.h" +#include "bloom/bloom.h" #include "maat_utils.h" #include "../bool_matcher/bool_matcher.h" #include "expr_matcher_inc.h" @@ -409,7 +410,7 @@ static int expr_matcher_bool_matcher_match(struct bool_matcher *bm, struct bool_ unsigned long long unique_pat_ids[n_hit_pattern]; size_t n_unique_pat_id = 0; - qsort(hit_pattern_ids, n_hit_pattern, sizeof(unsigned long long *), compare_pattern_id); + qsort(hit_pattern_ids, n_hit_pattern, sizeof(unsigned long long), compare_pattern_id); for (size_t i = 0; i < n_hit_pattern; i++) { tmp_pat_id = hit_pattern_ids[i]; diff --git a/scanner/expr_matcher/expr_matcher_inc.h b/scanner/expr_matcher/expr_matcher_inc.h index 57782ed..c575508 100644 --- a/scanner/expr_matcher/expr_matcher_inc.h +++ b/scanner/expr_matcher/expr_matcher_inc.h @@ -18,6 +18,7 @@ extern "C" #include <stddef.h> #include "uthash/utarray.h" +#include "bloom/bloom.h" #include "expr_matcher.h" #define MAX_HIT_PATTERN_NUM 1024 @@ -36,6 +37,7 @@ struct pattern_attribute { struct matched_pattern { UT_array *pattern_ids; + struct bloom *ref_bloom; struct pattern_attribute *ref_pat_attr; size_t scan_data_len; }; |
