summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorchenzizhan <[email protected]>2023-08-01 11:40:38 +0800
committerchenzizhan <[email protected]>2023-08-01 11:40:38 +0800
commit917c6b627f3100c0a016d7eefed33c292a138ccd (patch)
tree99a7bf4760e7112f6b390b1a98bf1e374e9fc204
parentc34801109395832ae5169c0374c7a96dac0345ce (diff)
more precisely alloc cell id in merge
-rw-r--r--src/fieldstat.c3
-rw-r--r--src/tags/cell_manager.c1
-rw-r--r--src/tags/heavy_keeper.c82
-rw-r--r--test/test_serialize.cpp3
-rw-r--r--test/unit_test_cell_manager.cpp153
5 files changed, 218 insertions, 24 deletions
diff --git a/src/fieldstat.c b/src/fieldstat.c
index cc983f5..8d8a419 100644
--- a/src/fieldstat.c
+++ b/src/fieldstat.c
@@ -956,11 +956,10 @@ struct fieldstat *fieldstat_dup(const struct fieldstat *instance)
new_instance->valid_cube_arr_length = instance->valid_cube_arr_length;
new_instance->max_n_cube = instance->max_n_cube;
- new_instance->cube = malloc(sizeof(struct fs_cube *) * new_instance->max_n_cube);
+ new_instance->cube = calloc(new_instance->max_n_cube, sizeof(struct fs_cube *));
for (size_t i = 0; i < new_instance->valid_cube_arr_length; i++) {
struct fs_cube *cube = instance->cube[i];
if (cube == NULL) {
- new_instance->cube[i] = NULL;
continue;
}
new_instance->cube[i] = fieldstat_cube_new(cube->shared_tags, cube->n_shared_tags, cube->sampling_mode, cube->max_n_cell);
diff --git a/src/tags/cell_manager.c b/src/tags/cell_manager.c
index 2b54518..01730bb 100644
--- a/src/tags/cell_manager.c
+++ b/src/tags/cell_manager.c
@@ -361,7 +361,6 @@ void cell_manager_merge_topk(struct cell_manager *dest, const struct cell_manage
heavy_keeper_merge_recording_id_details(dest->topk_tag_id_map, src->topk_tag_id_map,
cell_id_popped, n_cell_id_popped, cell_id_old, cell_id_added, n_cell_id_added);
- // todo: 重新排列并且更新,heavy_keeper_merge_recording_id_details 应该用更细致的方法来处理id
memset(dest->id_tag_array, 0, sizeof(struct tag_hash_key *) * dest->id_tag_array_len);
struct heavy_keeper_result *result = heavy_keeper_query(dest->topk_tag_id_map);
for (size_t i = 0; i < result->n_key; i++) {
diff --git a/src/tags/heavy_keeper.c b/src/tags/heavy_keeper.c
index ac83628..dce6940 100644
--- a/src/tags/heavy_keeper.c
+++ b/src/tags/heavy_keeper.c
@@ -314,6 +314,55 @@ unsigned find_maxv_in_sketch(struct heavy_keeper *hk, const struct tag_hash_key
return maxv;
}
+int find_next_unused_cell_id(const int *const *sorted_pst_cell_id_arr, size_t arr_len, int last_find_result, int *next_idx)
+{
+ int cur_idx = *next_idx;
+
+ if (arr_len == 0) {
+ if (last_find_result < 0) {
+ return 0;
+ } else {
+ return last_find_result + 1;
+ }
+ }
+
+ if (cur_idx >= arr_len) {
+ int max_id = *sorted_pst_cell_id_arr[arr_len - 1];
+ if (last_find_result <= max_id) { // this first time reach the end of sorted_pst_cell_id_arr
+ return max_id + 1;
+ } else {
+ return last_find_result + 1;
+ }
+ }
+
+ int expected_id = last_find_result + 1;
+ while (cur_idx < arr_len) {
+ int cur_id = *sorted_pst_cell_id_arr[cur_idx];
+
+ // check if expected_id id is used
+ if (cur_id == expected_id) { // used
+ expected_id++;
+ cur_idx++;
+ } else if (cur_id > expected_id) { // not used
+ *next_idx = cur_idx;
+ return expected_id;
+ } else { // still cannot guarantee expected_id is not used
+ cur_idx++;
+ }
+ }
+
+ // all cell_id in sorted_pst_cell_id_arr is used
+ int max_id = *sorted_pst_cell_id_arr[arr_len - 1];
+ *next_idx = arr_len;
+ return max_id + 1;
+}
+
+int cmp_pst_int(const void *a, const void *b) {
+ const int *pa = *(int **)a;
+ const int *pb = *(int **)b;
+ return *pa - *pb;
+}
+
void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *dest, const struct heavy_keeper *src,
int **cell_id_popped_from_dest_when_merge_src_out, int *cell_id_popped_from_dest_when_merge_src_len_out,
int **cell_id_in_src_before_merge_out, int **cell_id_in_dest_corresponding_out, int *cell_id_in_src_before_merge_len_out)
@@ -329,42 +378,39 @@ void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *des
unsigned *count_arr = (unsigned *)malloc(max_size * sizeof(unsigned));
struct tag_hash_key **tag_arr = (struct tag_hash_key **)malloc(max_size * sizeof(struct tag_hash_key *));
- int **user_datas = (int **)malloc(max_size * sizeof(int *));
+ int **cell_id_arr_dst = (int **)malloc(size_dest * sizeof(int *));
- sorted_set_dump(ss_dest, count_arr, tag_arr, (void **)user_datas);
+ sorted_set_dump(ss_dest, count_arr, tag_arr, (void **)cell_id_arr_dst);
int cell_id_popped_from_dest_when_merge_src[dest->K];
int cell_id_popped_from_dest_when_merge_src_len = 0;
int cell_id_in_src_before_merge[dest->K];
int cell_id_in_src_before_merge_len = 0;
int cell_id_in_dest_corresponding[dest->K]; // length == cell_id_in_src_before_merge_len
- int new_alloc_id = -1; // used for alloc for new tags from src
-
/* ------------------------------ merge dest ------------------------------ */
for (int i = 0; i < size_dest; i++) {
unsigned maxv = find_maxv_in_sketch(dest, tag_arr[i]);
struct tag_hash_key *tag_final = tag_hash_key_copy(tag_arr[i]);
- const int *pst_cell_id_dest = user_datas[i];
+ const int *pst_cell_id_dest = cell_id_arr_dst[i];
sorted_set_insert(new_rec, tag_final, maxv);
int **pst_cell_id_final = (int **)sorted_set_get_pointer_to_user_data(new_rec, tag_final);
*pst_cell_id_final = (int *)malloc(sizeof(unsigned int));
**pst_cell_id_final = *pst_cell_id_dest;
-
-
- if (new_alloc_id < *pst_cell_id_dest) {
- new_alloc_id = *pst_cell_id_dest;
- }
}
+ qsort(cell_id_arr_dst, size_dest, sizeof(int *), cmp_pst_int);
+
/* ------------------------------ merge source ------------------------------ */
- sorted_set_dump(ss_src, count_arr, tag_arr, (void **)user_datas);
- new_alloc_id++; // the new id should be larger than the largest id in dest
+ int **cell_id_arr_src = (int **)malloc(size_src * sizeof(int *));
+ sorted_set_dump(ss_src, count_arr, tag_arr, (void **)cell_id_arr_src);
+ int last_find_id = -1;
+ int next_idx = 0;
for (int i = 0; i < size_src; i++) {
const struct tag_hash_key *tag_src = tag_arr[i];
int **pst_cell_id_dest = (int **)sorted_set_get_pointer_to_user_data(ss_dest, tag_src);
- int cell_id_src = *user_datas[i];
+ int cell_id_src = *cell_id_arr_src[i];
if (pst_cell_id_dest != NULL) { // the tag is in both dest and src, so has been processed in the previous loop. Just record some info
cell_id_in_src_before_merge[cell_id_in_src_before_merge_len] = cell_id_src;
@@ -384,13 +430,12 @@ void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *des
if (tmp_ret != 0) { // insert success
int **pst_cell_id_final = (int **)sorted_set_get_pointer_to_user_data(new_rec, tag_final);
*pst_cell_id_final = (int *)malloc(sizeof(unsigned int));
- **pst_cell_id_final = new_alloc_id;
+ last_find_id = find_next_unused_cell_id((const int *const *)cell_id_arr_dst, size_dest, last_find_id, &next_idx);
+ **pst_cell_id_final = last_find_id;
cell_id_in_src_before_merge[cell_id_in_src_before_merge_len] = cell_id_src;
- cell_id_in_dest_corresponding[cell_id_in_src_before_merge_len] = new_alloc_id;
+ cell_id_in_dest_corresponding[cell_id_in_src_before_merge_len] = last_find_id;
cell_id_in_src_before_merge_len++;
- new_alloc_id++;
-
} else {
tag_hash_key_free(tag_final);
}
@@ -398,7 +443,8 @@ void heavy_keeper_merge_sorted_set_recording_id_details(struct heavy_keeper *des
free(count_arr);
free(tag_arr);
- free(user_datas);
+ free(cell_id_arr_dst);
+ free(cell_id_arr_src);
sorted_set_free(ss_dest);
dest->top_K_heap = new_rec;
diff --git a/test/test_serialize.cpp b/test/test_serialize.cpp
index 0086a39..23ec9d3 100644
--- a/test/test_serialize.cpp
+++ b/test/test_serialize.cpp
@@ -34,7 +34,6 @@ TEST(unit_test_serialize, serialize_and_deserialize_fieldstat_instance_comprehen
const char *name = fieldstat_get_metric_name(instance2, cube_id, metric_id);
EXPECT_STREQ(name, "czz_test counter metric");
- printf("going to get cells\n");
int *ret_cell_ids = NULL;
struct fieldstat_tag_list *tag_list = NULL;
@@ -84,8 +83,6 @@ TEST(unit_test_serialize, serialize_and_deserialize_fieldstat_instance_topk)
const char *name = fieldstat_get_metric_name(instance2, cube_id, metric_id);
EXPECT_STREQ(name, "czz_test counter metric");
- printf("going to get cells\n");
-
int *ret_cell_ids = NULL;
struct fieldstat_tag_list *tag_list = NULL;
size_t n_cell = 0;
diff --git a/test/unit_test_cell_manager.cpp b/test/unit_test_cell_manager.cpp
index c2e1108..cec05fd 100644
--- a/test/unit_test_cell_manager.cpp
+++ b/test/unit_test_cell_manager.cpp
@@ -407,6 +407,159 @@ TEST(unit_test_cell_manager, add_with_key_length_is_3_of_diff_types_comprehensiv
tag_hash_key_free(key);
}
+extern "C"
+{
+ int cmp_pst_int(const void *a, const void *b);
+ int find_next_unused_cell_id(const int *const *sorted_pst_cell_id_arr, size_t arr_len, int last_find_result, int *next_idx);
+}
+TEST(unit_test_cell_manager, find_next_id_given_all_cell_id_continuous)
+{
+ int **arr = (int **)malloc(sizeof(int *) * 10);
+ for (int i = 0; i < 10; i++)
+ {
+ arr[i] = (int *)malloc(sizeof(int));
+ *arr[i] = i;
+ }
+
+ printf("arr:\n");
+ for (int i = 0; i < 10; i++)
+ {
+ printf("%d ", *arr[i]);
+ }
+ printf("\n");
+
+ int last_find_result = -1;
+ int next_idx = 0;
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 10);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 11);
+
+ for (int i = 0; i < 10; i++)
+ {
+ free(arr[i]);
+ }
+ free(arr);
+}
+
+TEST(unit_test_cell_manager, find_next_id_given_continuous_hole)
+{
+ int **arr = (int **)malloc(sizeof(int *) * 10);
+ for (int i = 0; i < 3; i++)
+ {
+ arr[i] = (int *)malloc(sizeof(int));
+ *arr[i] = i;
+ }
+ arr[3] = (int *)malloc(sizeof(int));
+ *arr[3] = 5; // 3, 4 is hole
+ for (int i = 4; i < 10; i++)
+ {
+ arr[i] = (int *)malloc(sizeof(int));
+ *arr[i] = i + 2;
+ }
+
+ printf("arr:\n");
+ for (int i = 0; i < 10; i++)
+ {
+ printf("%d ", *arr[i]);
+ }
+ printf("\n");
+
+ int last_find_result = -1;
+ int next_idx = 0;
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 3);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 4);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 12);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 13);
+
+ for (int i = 0; i < 10; i++)
+ {
+ free(arr[i]);
+ }
+ free(arr);
+}
+
+TEST(unit_test_cell_manager, find_next_id_given_holes)
+{
+ int **arr = (int **)malloc(sizeof(int *) * 10);
+ for (int i = 0; i < 10; i++)
+ {
+ arr[i] = (int *)malloc(sizeof(int));
+ *arr[i] = i * 2; // 0, 2, 4, ...
+ }
+
+ printf("arr:\n");
+ for (int i = 0; i < 10; i++)
+ {
+ printf("%d ", *arr[i]);
+ }
+ printf("\n");
+
+ int last_find_result = -1;
+ int next_idx = 0;
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 1);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 3);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 5);
+
+ for (int i = 0; i < 10; i++)
+ {
+ free(arr[i]);
+ }
+ free(arr);
+}
+
+TEST(unit_test_cell_manager, find_next_id_given_holes_at_only_start)
+{
+ int **arr = (int **)malloc(sizeof(int *) * 10);
+ for (int i = 0; i < 10; i++)
+ {
+ arr[i] = (int *)malloc(sizeof(int));
+ *arr[i] = i + 2;
+ }
+
+ printf("arr:\n");
+ for (int i = 0; i < 10; i++)
+ {
+ printf("%d ", *arr[i]);
+ }
+ printf("\n");
+
+ int last_find_result = -1;
+ int next_idx = 0;
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 0);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 1);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 10, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 12);
+
+ for (int i = 0; i < 10; i++)
+ {
+ free(arr[i]);
+ }
+ free(arr);
+}
+
+TEST(unit_test_cell_manager, find_next_id_given_empty_arr)
+{
+ int **arr = (int **)malloc(sizeof(int *) * 0);
+
+ int last_find_result = -1;
+ int next_idx = 0;
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 0, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 0);
+ last_find_result = find_next_unused_cell_id((const int *const *)arr, 0, last_find_result, &next_idx);
+ EXPECT_EQ(last_find_result, 1);
+
+ free(arr);
+}
int main(int argc, char *argv[])
{