diff options
| author | liuwentan <[email protected]> | 2023-03-27 19:15:05 +0800 |
|---|---|---|
| committer | liuwentan <[email protected]> | 2023-03-27 19:15:05 +0800 |
| commit | 4aa3498d79236d8258424c1e8505d1aa63879629 (patch) | |
| tree | 56b8249c37307f04c58397cedc09479c76149e73 /scanner | |
| parent | 73060d1c355aa6a4113295142646557ebe67b0b3 (diff) | |
Refactor the hs_adapter engine's handling of multi-pattern match offsets
Diffstat (limited to 'scanner')
| -rw-r--r-- | scanner/adapter_hs/adapter_hs.cpp | 339 |
1 file changed, 110 insertions, 229 deletions
diff --git a/scanner/adapter_hs/adapter_hs.cpp b/scanner/adapter_hs/adapter_hs.cpp index 86960f6..9afddc5 100644 --- a/scanner/adapter_hs/adapter_hs.cpp +++ b/scanner/adapter_hs/adapter_hs.cpp @@ -66,55 +66,35 @@ struct adapter_hs { size_t n_expr; size_t n_patterns; struct adapter_hs_runtime *hs_rt; - struct hs_tag *tag_map; - struct pattern_attribute *pat_attr_by_str; - struct pattern_attribute *pat_attr_by_id; + struct pattern_attribute *hs_attr; struct log_handle *logger; }; -struct matched_offset { - unsigned long long start_offset; - unsigned long long end_offset; +struct pattern_offset { + long long start; + long long end; }; -struct matched_pattern { - unsigned long long pattern_id; - struct matched_offset *offsets; - size_t offset_cnt; - size_t offset_size; - UT_hash_handle hh; +struct pattern_attribute { + long long pattern_id; + enum hs_match_mode match_mode; + struct pattern_offset offset; }; -struct matched_pattern_container { - struct matched_pattern *pat_hash; +struct matched_pattern { + UT_array *pattern_ids; + size_t n_patterns; + struct pattern_attribute *ref_hs_attr; + size_t scan_data_len; }; struct adapter_hs_stream { int thread_id; size_t n_expr; - size_t n_patterns; hs_stream_t *literal_stream; hs_stream_t *regex_stream; struct adapter_hs_runtime *hs_rt; - struct matched_pattern_container matched_pat_container; -}; - -struct pattern_attribute { - unsigned long long bool_expr_id; - unsigned long long pattern_id; - enum hs_match_mode match_mode; - int start_offset; - int end_offset; -}; - -struct hs_tag { - char *key; - size_t key_len; - - size_t n_pat_attr; - struct pattern_attribute *pat_attr; - void *user_tag; - UT_hash_handle hh; + struct matched_pattern *matched_pat; }; int _hs_alloc_scratch(hs_database_t *db, hs_scratch_t **scratchs, size_t n_worker_thread, @@ -253,36 +233,6 @@ void adpt_hs_compile_data_free(struct adpt_hs_compile_data *hs_cd) FREE(hs_cd); } -struct hs_tag *hs_tag_new(long long expr_id, size_t n_pattern) -{ - struct hs_tag 
*tag = ALLOC(struct hs_tag, 1); - - tag->key = ALLOC(char, sizeof(long long)); - memcpy(tag->key, (char *)&expr_id, sizeof(long long)); - tag->key_len = sizeof(long long); - tag->pat_attr = ALLOC(struct pattern_attribute, n_pattern); - tag->n_pat_attr = n_pattern; - - return tag; -} - -void hs_tag_free(struct hs_tag *tag) -{ - if (NULL == tag) { - return; - } - - if (tag->key != NULL) { - FREE(tag->key); - } - - if (tag->pat_attr != NULL) { - FREE(tag->pat_attr); - } - - FREE(tag); -} - void populate_compile_data(struct adpt_hs_compile_data *compile_data, int index, int pattern_id, char *pat, size_t pat_len, int case_sensitive) { @@ -299,7 +249,7 @@ void populate_compile_data(struct adpt_hs_compile_data *compile_data, int index, memcpy(compile_data->patterns[index], pat, pat_len); } -struct bool_expr *bool_exprs_new(struct hs_expr *exprs, size_t n_expr, struct hs_tag **tag_hash, +struct bool_expr *bool_exprs_new(struct hs_expr *exprs, size_t n_expr, struct pattern_attribute *pattern_attr, struct adpt_hs_compile_data *literal_cd, struct adpt_hs_compile_data *regex_cd, size_t *n_pattern) { @@ -314,15 +264,15 @@ struct bool_expr *bool_exprs_new(struct hs_expr *exprs, size_t n_expr, struct hs /* populate adpt_hs_compile_data and bool_expr */ for (size_t i = 0; i < n_expr; i++) { - struct hs_tag *hs_tag = hs_tag_new(exprs[i].expr_id, exprs[i].n_patterns); - hs_tag->user_tag = exprs[i].user_tag; for (size_t j = 0; j < exprs[i].n_patterns; j++) { - hs_tag->pat_attr[j].pattern_id = pattern_index; - hs_tag->pat_attr[j].match_mode = exprs[i].patterns[j].match_mode; - if (exprs[i].patterns[j].match_mode == HS_MATCH_MODE_SUB) { - hs_tag->pat_attr[j].start_offset = exprs[i].patterns[j].start_offset; - hs_tag->pat_attr[j].end_offset = exprs[i].patterns[j].end_offset; + pattern_attr[pattern_index].pattern_id = pattern_index; + pattern_attr[pattern_index].match_mode = exprs[i].patterns[j].match_mode; + + if (pattern_attr[pattern_index].match_mode == HS_MATCH_MODE_SUB || + 
pattern_attr[pattern_index].match_mode == HS_MATCH_MODE_EXACTLY) { + pattern_attr[pattern_index].offset.start = exprs[i].patterns[j].start_offset; + pattern_attr[pattern_index].offset.end = exprs[i].patterns[j].end_offset; } /* literal pattern */ @@ -347,8 +297,7 @@ struct bool_expr *bool_exprs_new(struct hs_expr *exprs, size_t n_expr, struct hs //printf("expr_id:%lld item_num:%zu\n", exprs[i].expr_id, exprs[i].n_patterns); bool_exprs[i].expr_id = exprs[i].expr_id; bool_exprs[i].item_num = exprs[i].n_patterns; - bool_exprs[i].user_tag = hs_tag; - HASH_ADD_KEYPTR(hh, *tag_hash, hs_tag->key, hs_tag->key_len, hs_tag); + bool_exprs[i].user_tag = exprs[i].user_tag; } *n_pattern = pattern_index; @@ -409,14 +358,14 @@ struct adapter_hs *adapter_hs_new(size_t n_worker_thread, regex_cd = adpt_hs_compile_data_new(regex_pattern_num); } - size_t pattern_cnt = 0; + size_t pattern_cnt = literal_pattern_num + regex_pattern_num; struct adapter_hs *hs_instance = ALLOC(struct adapter_hs, 1); - hs_instance->tag_map = NULL; + hs_instance->hs_attr = ALLOC(struct pattern_attribute, pattern_cnt); hs_instance->logger = logger; hs_instance->n_worker_thread = n_worker_thread; hs_instance->n_expr = n_expr; - struct bool_expr *bool_exprs = bool_exprs_new(exprs, n_expr, &hs_instance->tag_map, + struct bool_expr *bool_exprs = bool_exprs_new(exprs, n_expr, hs_instance->hs_attr, literal_cd, regex_cd, &pattern_cnt); if (NULL == bool_exprs) { return NULL; @@ -534,28 +483,23 @@ void adapter_hs_free(struct adapter_hs *hs_instance) FREE(hs_instance->hs_rt); } - if (hs_instance->tag_map != NULL) { - struct hs_tag *tag = NULL, *tmp_tag = NULL; - HASH_ITER(hh, hs_instance->tag_map, tag, tmp_tag) { - HASH_DEL(hs_instance->tag_map, tag); - hs_tag_free(tag); - } + if (hs_instance->hs_attr != NULL) { + FREE(hs_instance->hs_attr); } FREE(hs_instance); } -int find_same_pattern_offset(struct matched_pattern *matched_pat, unsigned long long from, - unsigned long long to) +static inline int 
compare_pattern_id(const void *a, const void *b) { - for (size_t i = 0; i < matched_pat->offset_cnt; i++) { - if (matched_pat->offsets[i].start_offset == from && - matched_pat->offsets[i].end_offset == to - 1) { - return 0; - } - } - - return -1; + long long ret = *(const unsigned long long *)a - *(const unsigned long long *)b; + if (ret == 0) { + return 0; + } else if(ret < 0) { + return -1; + } else { + return 1; + } } /** @@ -565,107 +509,69 @@ int matched_event_cb(unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void *ctx) { // put id in set - struct matched_pattern_container *matched_pat_container = (struct matched_pattern_container *)ctx; unsigned long long pattern_id = id; - - struct matched_pattern *matched_pat = NULL; - HASH_FIND(hh, matched_pat_container->pat_hash, &pattern_id, sizeof(unsigned long long), matched_pat); - if (matched_pat != NULL) { - // same pattern_id, offset maybe different - int ret = find_same_pattern_offset(matched_pat, from, to); - if (ret < 0) { /* different offset */ - // TODO: use realloc - if (matched_pat->offset_cnt >= matched_pat->offset_size) { - matched_pat->offset_size *= 2; - matched_pat->offsets = (struct matched_offset *)realloc(matched_pat->offsets, - matched_pat->offset_size*sizeof(struct matched_offset)); - } - matched_pat->offsets[matched_pat->offset_cnt].start_offset = from; - matched_pat->offsets[matched_pat->offset_cnt].end_offset = to - 1; - matched_pat->offset_cnt++; - } - return 0; - } else { - // different pattern_id - struct matched_pattern *matched_pat = ALLOC(struct matched_pattern, 1); - matched_pat->pattern_id = pattern_id; - matched_pat->offsets = ALLOC(struct matched_offset, MAX_OFFSET_NUM); - matched_pat->offset_size = MAX_OFFSET_NUM; - matched_pat->offsets[matched_pat->offset_cnt].start_offset = from; - matched_pat->offsets[matched_pat->offset_cnt].end_offset = to - 1; - matched_pat->offset_cnt++; + struct matched_pattern *matched_pat = (struct matched_pattern *)ctx; 
- HASH_ADD(hh, matched_pat_container->pat_hash, pattern_id, sizeof(unsigned long long), matched_pat); + if (id > matched_pat->n_patterns || id < 0) { + return 0; } - return 0; -} - -int is_real_matched_pattern(struct matched_pattern *matched_pat, enum hs_match_mode match_mode, - size_t data_len, int attr_start_offset, int attr_end_offset) -{ - if (match_mode == HS_MATCH_MODE_EXACTLY) { - for (size_t i = 0; i < matched_pat->offset_cnt; i++) { - if (matched_pat->offsets[i].start_offset == 0 && - matched_pat->offsets[i].end_offset == data_len - 1) { - return 0; - } - } - } else if (match_mode == HS_MATCH_MODE_PREFIX) { - for (size_t i = 0; i < matched_pat->offset_cnt; i++) { - if (matched_pat->offsets[i].start_offset == 0) { - return 0; + // duplicate pattern_id + if (utarray_find(matched_pat->pattern_ids, &pattern_id, compare_pattern_id)) { + return 0; + } + + int ret = 0; + long long start_offset = -1; + long long end_offset = -1; + struct pattern_attribute pat_attr = matched_pat->ref_hs_attr[id]; + switch (pat_attr.match_mode) { + case HS_MATCH_MODE_EXACTLY: + if (0 == from && matched_pat->scan_data_len == to) { + ret = 1; } - } - } else if (match_mode == HS_MATCH_MODE_SUFFIX) { - for (size_t i = 0; i < matched_pat->offset_cnt; i++) { - if (matched_pat->offsets[i].end_offset == data_len - 1) { - return 0; + break; + case HS_MATCH_MODE_SUB: + if (pat_attr.offset.start == -1) { + start_offset = 0; + } else { + start_offset = pat_attr.offset.start; } - } - } else if (match_mode == HS_MATCH_MODE_SUB) { - if (attr_start_offset == -1) { - attr_start_offset = 0; - } - if (attr_end_offset == -1) { - attr_end_offset = (int)data_len - 1; - } - - for (size_t i = 0; i < matched_pat->offset_cnt; i++) { - if (matched_pat->offsets[i].start_offset >= (unsigned long long)attr_start_offset && - matched_pat->offsets[i].end_offset <= (unsigned long long)attr_end_offset) { - return 0; + if (pat_attr.offset.end == -1) { + end_offset = matched_pat->scan_data_len; + } else { + end_offset 
= pat_attr.offset.end; } - } - } else { - assert(0); - } - return -1; -} - -int hs_tag_validate(struct hs_tag *hs_tag, struct matched_pattern_container *matched_pat_container, - size_t data_len) -{ - /* check if real matched pattern, because pattern match_mode is different */ - for (size_t i = 0; i < hs_tag->n_pat_attr; i++) { - struct matched_pattern *matched_pat = NULL; - unsigned long long pattern_id = hs_tag->pat_attr[i].pattern_id; - HASH_FIND(hh, matched_pat_container->pat_hash, &pattern_id, sizeof(unsigned long long), matched_pat); - if (matched_pat) { - int matched_ret = is_real_matched_pattern(matched_pat, hs_tag->pat_attr[i].match_mode, - data_len, hs_tag->pat_attr[i].start_offset, - hs_tag->pat_attr[i].end_offset); - if (matched_ret < 0) { - return -1; + if (start_offset <= (long long)from && + end_offset >= (long long)(to - 1)) { + ret = 1; } - } + break; + case HS_MATCH_MODE_PREFIX: + if (0 == from) { + ret = 1; + } + break; + case HS_MATCH_MODE_SUFFIX: + if (to == matched_pat->scan_data_len) { + ret = 1; + } + break; + default: + break; } - + + if (1 == ret) { + utarray_push_back(matched_pat->pattern_ids, &pattern_id); + utarray_sort(matched_pat->pattern_ids, compare_pattern_id); + } + return 0; } +UT_icd ut_pattern_id_icd = {sizeof(unsigned long long), NULL, NULL, NULL}; struct adapter_hs_stream *adapter_hs_stream_open(struct adapter_hs *hs_instance, int thread_id) { if (NULL == hs_instance || thread_id < 0) { @@ -677,8 +583,12 @@ struct adapter_hs_stream *adapter_hs_stream_open(struct adapter_hs *hs_instance, hs_stream->thread_id = thread_id; hs_stream->n_expr = hs_instance->n_expr; - hs_stream->n_patterns = hs_instance->n_patterns; hs_stream->hs_rt = hs_instance->hs_rt; + hs_stream->matched_pat = ALLOC(struct matched_pattern, 1); + hs_stream->matched_pat->ref_hs_attr = hs_instance->hs_attr; + hs_stream->matched_pat->n_patterns = hs_instance->n_patterns; + utarray_new(hs_stream->matched_pat->pattern_ids, &ut_pattern_id_icd); + 
utarray_reserve(hs_stream->matched_pat->pattern_ids, hs_instance->n_patterns); int err_count = 0; if (hs_instance->hs_rt->literal_db != NULL) { @@ -736,28 +646,14 @@ void adapter_hs_stream_close(struct adapter_hs_stream *hs_stream) } } - if (hs_stream->matched_pat_container.pat_hash != NULL) { - struct matched_pattern *pattern = NULL, *tmp_pattern = NULL; - HASH_ITER(hh, hs_stream->matched_pat_container.pat_hash, pattern, tmp_pattern) { - HASH_DELETE(hh, hs_stream->matched_pat_container.pat_hash, pattern); - FREE(pattern); - } - } - - /* hs_stream->hs_rt point to hs_instance->hs_rt which will call free */ + /* hs_stream->hs_rt point to hs_instance->hs_rt which will call free + same as hs_attr */ hs_stream->hs_rt = NULL; - FREE(hs_stream); -} + hs_stream->matched_pat->ref_hs_attr = NULL; + utarray_free(hs_stream->matched_pat->pattern_ids); -static int cmp_ull_p(const void *p1, const void *p2) -{ - if(* (unsigned long long*) p1 > * (unsigned long long*) p2) { - return 1; - } else if(* (unsigned long long*) p1 < * (unsigned long long*) p2) { - return -1; - } else { - return 0; - } + FREE(hs_stream->matched_pat); + FREE(hs_stream); } int adapter_hs_scan_stream(struct adapter_hs_stream *hs_stream, const char *data, size_t data_len, @@ -782,10 +678,12 @@ int adapter_hs_scan_stream(struct adapter_hs_stream *hs_stream, const char *data int err_count = 0; int thread_id = hs_stream->thread_id; + hs_stream->matched_pat->scan_data_len = data_len; + if (hs_stream->literal_stream != NULL) { err = hs_scan_stream(hs_stream->literal_stream, data, data_len, 0, hs_stream->hs_rt->literal_scratchs[thread_id], - matched_event_cb, &hs_stream->matched_pat_container); + matched_event_cb, hs_stream->matched_pat); if (err != HS_SUCCESS) { err_count++; } @@ -794,7 +692,7 @@ int adapter_hs_scan_stream(struct adapter_hs_stream *hs_stream, const char *data if (hs_stream->regex_stream != NULL) { err = hs_scan_stream(hs_stream->regex_stream, data, data_len, 0, 
hs_stream->hs_rt->regex_scratchs[thread_id], - matched_event_cb, &hs_stream->matched_pat_container); + matched_event_cb, hs_stream->matched_pat); if (err != HS_SUCCESS) { err_count++; } @@ -804,7 +702,7 @@ int adapter_hs_scan_stream(struct adapter_hs_stream *hs_stream, const char *data return -1; } - size_t n_pattern_id = HASH_COUNT(hs_stream->matched_pat_container.pat_hash); + size_t n_pattern_id = utarray_len(hs_stream->matched_pat->pattern_ids); if (0 == n_pattern_id) { *n_hit_result = 0; return 0; @@ -817,19 +715,16 @@ int adapter_hs_scan_stream(struct adapter_hs_stream *hs_stream, const char *data unsigned long long pattern_ids[MAX_SCANNER_HIT_PATTERN_NUM]; memset(pattern_ids, 0, sizeof(unsigned long long) * MAX_SCANNER_HIT_PATTERN_NUM); - int i = 0; - struct matched_pattern *pat = NULL, *tmp_pat = NULL; - HASH_ITER(hh, hs_stream->matched_pat_container.pat_hash, pat, tmp_pat) { + for (size_t i = 0; i < n_pattern_id; i++) { if (i >= MAX_SCANNER_HIT_PATTERN_NUM) { break; } - pattern_ids[i++] = pat->pattern_id; + + unsigned long long pattern_id = *(unsigned long long *)utarray_eltptr(hs_stream->matched_pat->pattern_ids, i); + pattern_ids[i] = pattern_id; } - qsort(pattern_ids, n_pattern_id, sizeof(unsigned long long), cmp_ull_p); int ret = 0; - int real_matched_index = 0; - struct hs_tag *hs_tag = NULL; struct bool_expr_match *bool_matcher_results = ALLOC(struct bool_expr_match, hs_stream->n_expr); int bool_matcher_ret = bool_matcher_match(hs_stream->hs_rt->bm, pattern_ids, n_pattern_id, bool_matcher_results, hs_stream->n_expr); @@ -843,27 +738,13 @@ int adapter_hs_scan_stream(struct adapter_hs_stream *hs_stream, const char *data } for (int index = 0; index < bool_matcher_ret; index++) { - hs_tag = (struct hs_tag *)bool_matcher_results[index].user_tag; - - int tag_ret = hs_tag_validate(hs_tag, &hs_stream->matched_pat_container, data_len); - if (tag_ret < 0) { - //bool_matcher_results[index] is invalid hit, continue - continue; - } - - 
results[real_matched_index].item_id = bool_matcher_results[index].expr_id; - results[real_matched_index].user_tag = hs_tag->user_tag; - real_matched_index++; + results[index].item_id = bool_matcher_results[index].expr_id; + results[index].user_tag = bool_matcher_results[index].user_tag; } - *n_hit_result = real_matched_index; + *n_hit_result = bool_matcher_ret; next: FREE(bool_matcher_results); - - struct matched_pattern *pattern = NULL, *tmp_pattern = NULL; - HASH_ITER(hh, hs_stream->matched_pat_container.pat_hash, pattern, tmp_pattern) { - HASH_DELETE(hh, hs_stream->matched_pat_container.pat_hash, pattern); - FREE(pattern); - } + utarray_clear(hs_stream->matched_pat->pattern_ids); return ret; } |
