diff options
| author | chenzizhan <[email protected]> | 2023-08-01 11:40:38 +0800 |
|---|---|---|
| committer | chenzizhan <[email protected]> | 2023-08-01 11:40:38 +0800 |
| commit | 917c6b627f3100c0a016d7eefed33c292a138ccd (patch) | |
| tree | 99a7bf4760e7112f6b390b1a98bf1e374e9fc204 | |
| parent | c34801109395832ae5169c0374c7a96dac0345ce (diff) | |
more precisely alloc cell id in merge
| -rw-r--r-- | src/fieldstat.c | 3 | ||||
| -rw-r--r-- | src/tags/cell_manager.c | 1 | ||||
| -rw-r--r-- | src/tags/heavy_keeper.c | 82 | ||||
| -rw-r--r-- | test/test_serialize.cpp | 3 | ||||
| -rw-r--r-- | test/unit_test_cell_manager.cpp | 153 |
5 files changed, 218 insertions, 24 deletions
diff --git a/src/fieldstat.c b/src/fieldstat.c index cc983f5..8d8a419 100644 --- a/src/fieldstat.c +++ b/src/fieldstat.c @@ -956,11 +956,10 @@ struct fieldstat *fieldstat_dup(const struct fieldstat *instance) new_instance->valid_cube_arr_length = instance->valid_cube_arr_length; new_instance->max_n_cube = instance->max_n_cube; - new_instance->cube = malloc(sizeof(struct fs_cube *) * new_instance->max_n_cube); + new_instance->cube = calloc(new_instance->max_n_cube, sizeof(struct fs_cube *)); for (size_t i = 0; i < new_instance->valid_cube_arr_length; i++) { struct fs_cube *cube = instance->cube[i]; if (cube == NULL) { - new_instance->cube[i] = NULL; continue; } new_instance->cube[i] = fieldstat_cube_new(cube->shared_tags, cube->n_shared_tags, cube->sampling_mode, cube->max_n_cell); diff --git a/src/tags/cell_manager.c b/src/tags/cell_manager.c index 2b54518..01730bb 100644 --- a/src/tags/cell_manager.c +++ b/src/tags/cell_manager.c @@ -361,7 +361,6 @@ void cell_manager_merge_topk(struct cell_manager *dest, const struct cell_manage heavy_keeper_merge_recording_id_details(dest->topk_tag_id_map, src->topk_tag_id_map, cell_id_popped, n_cell_id_popped, cell_id_old, cell_id_added, n_cell_id_added); - // todo: 重新排列并且更新,heavy_keeper_merge_recording_id_details 应该用更细致的方法来处理id memset(dest->id_tag_array, 0, sizeof(struct tag_hash_key *) * dest->id_tag_array_len); struct heavy_keeper_result *result = heavy_keeper_query(dest->topk_tag_id_map); for (size_t i = 0; i < result->n_key; i++) { diff --git a/src/tags/heavy_keeper.c b/src/tags/heavy_keeper.c index ac83628..dce6940 100644 --- a/src/tags/heavy_keeper.c +++ b/src/tags/heavy_keeper.c @@ -314,6 +314,55 @@ unsigned find_maxv_in_sketch(struct heavy_keeper *hk, const struct tag_hash_key return maxv; } +int find_next_unused_cell_id(const int *const *sorted_pst_cell_id_arr, size_t arr_len, int last_find_result, int *next_idx) +{ + int cur_idx = *next_idx; + + if (arr_len == 0) { + if (last_find_result < 0) { + return 0; + } else { + return last_find_result + 1; + } + } + + if (cur_idx >= arr_len) { + int max_id = *sorted_pst_cell_id_arr[arr_len - 1]; + if (last_find_result <= max_id) { // this first time reach the end of sorted_pst_cell_id_arr + return max_id + 1; + } else { + return last_find_result + 1; + } + } + + int expected_id = last_find_result + 1; + while (cur_idx < arr_len) { + int cur_id = *sorted_pst_cell_id_arr[cur_idx]; + + // check if expected_id id is used + if (cur_id == expected_id) { // used + expected_id++; + cur_idx++; + } else if (cur_id > expected_id) { // not used + *next_idx = cur_idx; + return expected_id; + } else { // still cannot guarantee expected_id is not used + cur_idx++; + } + } + + // all cell_id in sorted_pst_cell_id_arr is used + int max_id = *sorted_pst_cell_id_arr[arr_len - 1]; + *next_idx = arr_len; + return max_id + 1; +} + +int cmp_pst_int(const void *a, const void *b) { + const int *pa = *(int **)a; + const int *pb = *(int **)b; + return *pa - *pb; +} + void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *dest, const struct heavy_keeper *src, int **cell_id_popped_from_dest_when_merge_src_out, int *cell_id_popped_from_dest_when_merge_src_len_out, int **cell_id_in_src_before_merge_out, int **cell_id_in_dest_corresponding_out, int *cell_id_in_src_before_merge_len_out) @@ -329,42 +378,39 @@ void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *des unsigned *count_arr = (unsigned *)malloc(max_size * sizeof(unsigned)); struct tag_hash_key **tag_arr = (struct tag_hash_key **)malloc(max_size * sizeof(struct tag_hash_key *)); - int **user_datas = (int **)malloc(max_size * sizeof(int *)); + int **cell_id_arr_dst = (int **)malloc(size_dest * sizeof(int *)); - sorted_set_dump(ss_dest, count_arr, tag_arr, (void **)user_datas); + sorted_set_dump(ss_dest, count_arr, tag_arr, (void **)cell_id_arr_dst); int cell_id_popped_from_dest_when_merge_src[dest->K]; int cell_id_popped_from_dest_when_merge_src_len = 0; int cell_id_in_src_before_merge[dest->K]; int cell_id_in_src_before_merge_len = 0; int cell_id_in_dest_corresponding[dest->K]; // length == cell_id_in_src_before_merge_len - int new_alloc_id = -1; // used for alloc for new tags from src - /* ------------------------------ merge dest ------------------------------ */ for (int i = 0; i < size_dest; i++) { unsigned maxv = find_maxv_in_sketch(dest, tag_arr[i]); struct tag_hash_key *tag_final = tag_hash_key_copy(tag_arr[i]); - const int *pst_cell_id_dest = user_datas[i]; + const int *pst_cell_id_dest = cell_id_arr_dst[i]; sorted_set_insert(new_rec, tag_final, maxv); int **pst_cell_id_final = (int **)sorted_set_get_pointer_to_user_data(new_rec, tag_final); *pst_cell_id_final = (int *)malloc(sizeof(unsigned int)); **pst_cell_id_final = *pst_cell_id_dest; - - - if (new_alloc_id < *pst_cell_id_dest) { - new_alloc_id = *pst_cell_id_dest; - } } + qsort(cell_id_arr_dst, size_dest, sizeof(int *), cmp_pst_int); + /* ------------------------------ merge source ------------------------------ */ - sorted_set_dump(ss_src, count_arr, tag_arr, (void **)user_datas); - new_alloc_id++; // the new id should be larger than the largest id in dest + int **cell_id_arr_src = (int **)malloc(size_src * sizeof(int *)); + sorted_set_dump(ss_src, count_arr, tag_arr, (void **)cell_id_arr_src); + int last_find_id = -1; + int next_idx = 0; for (int i = 0; i < size_src; i++) { const struct tag_hash_key *tag_src = tag_arr[i]; int **pst_cell_id_dest = (int **)sorted_set_get_pointer_to_user_data(ss_dest, tag_src); - int cell_id_src = *user_datas[i]; + int cell_id_src = *cell_id_arr_src[i]; if (pst_cell_id_dest != NULL) { // the tag is in both dest and src, so has been processed in the previous loop. Just record some info cell_id_in_src_before_merge[cell_id_in_src_before_merge_len] = cell_id_src; @@ -384,13 +430,12 @@ void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *des if (tmp_ret != 0) { // insert success int **pst_cell_id_final = (int **)sorted_set_get_pointer_to_user_data(new_rec, tag_final); *pst_cell_id_final = (int *)malloc(sizeof(unsigned int)); - **pst_cell_id_final = new_alloc_id; + last_find_id = find_next_unused_cell_id((const int *const *)cell_id_arr_dst, size_dest, last_find_id, &next_idx); + **pst_cell_id_final = last_find_id; cell_id_in_src_before_merge[cell_id_in_src_before_merge_len] = cell_id_src; - cell_id_in_dest_corresponding[cell_id_in_src_before_merge_len] = new_alloc_id; + cell_id_in_dest_corresponding[cell_id_in_src_before_merge_len] = last_find_id; cell_id_in_src_before_merge_len++; - new_alloc_id++; - } else { tag_hash_key_free(tag_final); } @@ -398,7 +443,8 @@ void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *des free(count_arr); free(tag_arr); - free(user_datas); + free(cell_id_arr_dst); + free(cell_id_arr_src); sorted_set_free(ss_dest); dest->top_K_heap = new_rec; diff --git a/test/test_serialize.cpp b/test/test_serialize.cpp index 0086a39..23ec9d3 100644 --- a/test/test_serialize.cpp +++ b/test/test_serialize.cpp @@ -34,7 +34,6 @@ TEST(unit_test_serialize, serialize_and_deserialize_fieldstat_instance_comprehen const char *name = fieldstat_get_metric_name(instance2, cube_id, metric_id); EXPECT_STREQ(name, "czz_test counter metric"); - printf("going to get cells\n"); int *ret_cell_ids = NULL; struct fieldstat_tag_list *tag_list = NULL; @@ -84,8 +83,6 @@ TEST(unit_test_serialize, serialize_and_deserialize_fieldstat_instance_topk) const char *name = fieldstat_get_metric_name(instance2, cube_id, metric_id); EXPECT_STREQ(name, "czz_test counter metric"); - printf("going to get cells\n"); - int *ret_cell_ids = NULL; struct fieldstat_tag_list *tag_list = NULL; size_t n_cell = 0; diff --git a/test/unit_test_cell_manager.cpp b/test/unit_test_cell_manager.cpp index c2e1108..cec05fd 100644 --- a/test/unit_test_cell_manager.cpp +++ b/test/unit_test_cell_manager.cpp @@ -407,6 +407,159 @@ TEST(unit_test_cell_manager, add_with_key_length_is_3_of_diff_types_comprehensiv tag_hash_key_free(key); } +extern "C" +{ + int cmp_pst_int(const void *a, const void *b); + int find_next_unused_cell_id(const int *const *sorted_pst_cell_id_arr, size_t arr_len, int last_find_result, int *next_idx); +} +TEST(unit_test_cell_manager, find_next_id_given_all_cell_id_continuous) +{ + int **arr = (int **)malloc(sizeof(int *) * 10); + for (int i = 0; i < 10; i++) + { + arr[i] = (int *)malloc(sizeof(int)); + *arr[i] = i; + } + + printf("arr:\n"); + for (int i = 0; i < 10; i++) + { + printf("%d ", *arr[i]); + } + printf("\n"); + + int last_find_result = -1; + int next_idx = 0; + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 10); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 11); + + for (int i = 0; i < 10; i++) + { + free(arr[i]); + } + free(arr); +} + +TEST(unit_test_cell_manager, find_next_id_given_continuous_hole) +{ + int **arr = (int **)malloc(sizeof(int *) * 10); + for (int i = 0; i < 3; i++) + { + arr[i] = (int *)malloc(sizeof(int)); + *arr[i] = i; + } + arr[3] = (int *)malloc(sizeof(int)); + *arr[3] = 5; // 3, 4 is hole + for (int i = 4; i < 10; i++) + { + arr[i] = (int *)malloc(sizeof(int)); + *arr[i] = i + 2; + } + + printf("arr:\n"); + for (int i = 0; i < 10; i++) + { + printf("%d ", *arr[i]); + } + printf("\n"); + + int last_find_result = -1; + int next_idx = 0; + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 3); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 4); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 12); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 13); + + for (int i = 0; i < 10; i++) + { + free(arr[i]); + } + free(arr); +} + +TEST(unit_test_cell_manager, find_next_id_given_holes) +{ + int **arr = (int **)malloc(sizeof(int *) * 10); + for (int i = 0; i < 10; i++) + { + arr[i] = (int *)malloc(sizeof(int)); + *arr[i] = i * 2; // 0, 2, 4, ... + } + + printf("arr:\n"); + for (int i = 0; i < 10; i++) + { + printf("%d ", *arr[i]); + } + printf("\n"); + + int last_find_result = -1; + int next_idx = 0; + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 1); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 3); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 5); + + for (int i = 0; i < 10; i++) + { + free(arr[i]); + } + free(arr); +} + +TEST(unit_test_cell_manager, find_next_id_given_holes_at_only_start) +{ + int **arr = (int **)malloc(sizeof(int *) * 10); + for (int i = 0; i < 10; i++) + { + arr[i] = (int *)malloc(sizeof(int)); + *arr[i] = i + 2; + } + + printf("arr:\n"); + for (int i = 0; i < 10; i++) + { + printf("%d ", *arr[i]); + } + printf("\n"); + + int last_find_result = -1; + int next_idx = 0; + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 0); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 1); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 12); + + for (int i = 0; i < 10; i++) + { + free(arr[i]); + } + free(arr); +} + +TEST(unit_test_cell_manager, find_next_id_given_empty_arr) +{ + int **arr = (int **)malloc(sizeof(int *) * 0); + + int last_find_result = -1; + int next_idx = 0; + last_find_result = find_next_unused_cell_id((const int *const *)arr, 0, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 0); + last_find_result = find_next_unused_cell_id((const int *const *)arr, 0, last_find_result, &next_idx); + EXPECT_EQ(last_find_result, 1); + + free(arr); +} int main(int argc, char *argv[]) { |
