author    doufenghu <[email protected]>  2024-09-10 20:05:06 +0800
committer doufenghu <[email protected]>  2024-09-10 20:05:06 +0800
commit    4bb87b62cd7d3dd12bd19e643aaffda53e35e57a (patch)
tree      90eb508ca3cd69e9a48531237c9e713e704a8a1c
parent    af6b8ab5e619be383b0597a2a8aaa47341d05f2f (diff)
[Feature][docs] Add split operator description.
-rw-r--r--  docs/images/groot_stream_architecture.jpg   bin 5263679 -> 5472871 bytes
-rw-r--r--  docs/processor/aggregate-processor.md        23
-rw-r--r--  docs/processor/split-processor.md            49
-rw-r--r--  groot-examples/end-to-end-example/src/main/resources/examples/grootstream_job_split_test.yaml  97
4 files changed, 158 insertions, 11 deletions
diff --git a/docs/images/groot_stream_architecture.jpg b/docs/images/groot_stream_architecture.jpg
index d8f1d4b..28b553b 100644
--- a/docs/images/groot_stream_architecture.jpg
+++ b/docs/images/groot_stream_architecture.jpg
Binary files differ
diff --git a/docs/processor/aggregate-processor.md b/docs/processor/aggregate-processor.md
index 5ab0ae0..afc26f6 100644
--- a/docs/processor/aggregate-processor.md
+++ b/docs/processor/aggregate-processor.md
@@ -10,17 +10,18 @@ Within the pipeline, events are processed by each Function in order, top->down
## Options
Note: By default, the internal fields `__window_start_timestamp` and `__window_end_timestamp` are output if `output_fields` is not set.
-| name | type | required | default value |
-|--------------------------|--------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| type | String | Yes | The type of the processor, now only support `com.geedgenetworks.core.processor.aggregate.AggregateProcessor` |
-| output_fields | Array | No | Array of String. The list of fields that need to be kept. Fields not in the list will be removed. |
-| remove_fields | Array | No | Array of String. The list of fields that need to be removed. |
-| group_by_fields | Array | yes | Array of String. The list of fields that need to be grouped. |
-| window_type | String | yes | The type of window, now only support `tumbling_processing_time`, `tumbling_event_time`, `sliding_processing_time`, `sliding_event_time`. if window_type is `tumbling/sliding_event_time,` you need to set watermark. |
-| window_size | Long | yes | The duration of the window in seconds. |
-| window_slide | Long | yes | The duration of the window slide in seconds. |
-| window_timestamp_field | String | No | Set the output timestamp field name, with the unit in seconds. It is mapped to the internal field __window_start_timestamp. |
-| functions | Array | No | Array of Object. The list of functions that need to be applied to the data. |
+| name                   | type    | required | description                                                                                                                                                                                                                                        |
+|------------------------|---------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| type                   | String  | Yes      | The type of the processor. Currently only `com.geedgenetworks.core.processor.aggregate.AggregateProcessor` is supported.                                                                                                                            |
+| output_fields          | Array   | No       | Array of String. The list of fields to keep. Fields not in the list will be removed.                                                                                                                                                                |
+| remove_fields          | Array   | No       | Array of String. The list of fields to remove.                                                                                                                                                                                                      |
+| group_by_fields        | Array   | Yes      | Array of String. The list of fields to group by.                                                                                                                                                                                                    |
+| window_type            | String  | Yes      | The window type. Currently only `tumbling_processing_time`, `tumbling_event_time`, `sliding_processing_time`, and `sliding_event_time` are supported. If `window_type` is `tumbling_event_time` or `sliding_event_time`, a watermark must be set.  |
+| window_size            | Long    | Yes      | The duration of the window, in seconds.                                                                                                                                                                                                             |
+| window_slide           | Long    | Yes      | The duration of the window slide, in seconds.                                                                                                                                                                                                       |
+| window_timestamp_field | String  | No       | The output timestamp field name, in seconds. It is mapped to the internal field `__window_start_timestamp`.                                                                                                                                         |
+| mini_batch             | Boolean | No       | Whether to enable local aggregation optimization. Defaults to `false`. Enabling it can significantly reduce state overhead and improve throughput.                                                                                                  |
+| functions              | Array   | No       | Array of Object. The list of functions to apply to the data.                                                                                                                                                                                        |
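+
+A minimal sketch of enabling `mini_batch` (adapted from the end-to-end split example in this commit; the processor name and field values are illustrative):
+
+```yaml
+aggregate_processor:
+  type: aggregate
+  group_by_fields: [decoded_as]
+  window_type: tumbling_processing_time
+  window_size: 5 # seconds
+  mini_batch: true # enable local pre-aggregation; defaults to false
+  functions:
+    - function: NUMBER_SUM
+      lookup_fields: [sessions]
+```
+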
## Usage Example
diff --git a/docs/processor/split-processor.md b/docs/processor/split-processor.md
new file mode 100644
index 0000000..e1a1163
--- /dev/null
+++ b/docs/processor/split-processor.md
@@ -0,0 +1,49 @@
+# Split Processor
+
+> Split the output of a data processing pipeline into multiple streams based on certain conditions.
+
+## Description
+
+Uses Flink side outputs to send data from one stream to multiple downstream consumers. This is useful when you want to separate or filter certain elements of a stream without disrupting the main processing flow. For example, side outputs can be used for error handling, conditional routing, or extracting specific subsets of the data.
+
+## Options
+
+| name              | type   | required | description                                                                                               |
+|-------------------|--------|----------|-----------------------------------------------------------------------------------------------------------|
+| type              | String | Yes      | The type of the processor. Currently only `com.geedgenetworks.core.split.SplitOperator` is supported.     |
+| rules             | Array  | Yes      | Array of Object. The rules that assign events to side output tags.                                         |
+| [rule.]tag        | String | Yes      | The tag name of the side output.                                                                            |
+| [rule.]expression | String | Yes      | A boolean expression evaluated against each event; matching events are emitted to the tag's side output.   |
+
+## Usage Example
+
+This example uses a split processor to split the data into two streams based on the value of the `decoded_as` field.
+
+```yaml
+splits:
+ decoded_as_split:
+ type: split
+ rules:
+ - tag: http_tag
+ expression: event.decoded_as == 'HTTP'
+ - tag: dns_tag
+ expression: event.decoded_as == 'DNS'
+
+topology:
+ - name: inline_source
+ downstream: [decoded_as_split]
+ - name: decoded_as_split
+ tags: [http_tag, dns_tag]
+    downstream: [projection_processor, aggregate_processor]
+  - name: projection_processor
+    downstream: [print_sink]
+  - name: aggregate_processor
+    downstream: [print_sink]
+ - name: print_sink
+ downstream: []
+```
diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/grootstream_job_split_test.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/grootstream_job_split_test.yaml
new file mode 100644
index 0000000..9bb2900
--- /dev/null
+++ b/groot-examples/end-to-end-example/src/main/resources/examples/grootstream_job_split_test.yaml
@@ -0,0 +1,97 @@
+sources:
+ inline_source:
+    type: inline
+    fields: # [array of object] Field List; if not set, all fields (Map<String, Object>) will be output.
+ properties:
+ data: '[{"sessions":1,"mail_attachment_name_charset":"GBK","mail_attachment_name":"aGVsbG8=","packet_capture_file":"test","ssl_sni":"www.google.com","decoded_as":"BASE","ssl_san":"www.google.com","__timestamp":1705568517095,"client_ip":"255.255.255.255","server_ip":"2600:1015:b002::"},{"sessions":1,"decoded_as":"HTTP","log_id": 1, "recv_time":"111", "client_ip":"192.168.0.2","server_ip":"2600:1015:b002::"},{"sessions":1,"decoded_as":"DNS","log_id": 1, "recv_time":"111", "client_ip":"192.168.0.2","server_ip":"2600:1015:b002::"},{"sessions":1,"decoded_as":"DNS","log_id": 1, "recv_time":"111", "client_ip":"192.168.0.2","server_ip":"2600:1015:b002::"}]'
+      interval.per.row: 1s # optional
+      repeat.count: 1 # optional
+ format: json
+ json.ignore.parse.errors: false
+sinks:
+ collect_sink:
+ type: collect
+ properties:
+ format: json
+splits:
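+  # Each rule routes matching events to a side output (see docs/processor/split-processor.md);
+  # here the rule names match the downstream operator names declared in the topology below.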
+ test_split:
+ type: split
+ rules:
+ - name: table_processor
+ expression: event.decoded_as == 'HTTP'
+ - name: pre_etl_processor
+ expression: event.decoded_as == 'DNS'
+
+postprocessing_pipelines:
+ pre_etl_processor: # [object] Processing Pipeline
+ type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
+    remove_fields: [fields, tags]
+ output_fields:
+ functions: # [array of object] Function List
+
+ - function: FLATTEN
+      lookup_fields: [ fields, tags ]
+ output_fields: [ ]
+ parameters:
+ #prefix: ""
+ depth: 3
+ # delimiter: "."
+
+ - function: RENAME
+ lookup_fields: [ '' ]
+ output_fields: [ '' ]
+ filter:
+ parameters:
+ # parent_fields: [tags]
+ # rename_fields:
+ # tags: tags
+        rename_expression: key = string.replace_all(key, 'tags.', ''); key = string.replace_all(key, 'fields.', ''); return key;
+
+
+ - function: UNIX_TIMESTAMP_CONVERTER
+ lookup_fields: [ timestamp_ms ]
+ output_fields: [ recv_time ]
+ parameters:
+ precision: seconds
+ interval: 300
+
+ aggregate_processor:
+ type: aggregate
+ group_by_fields: [decoded_as]
+    window_type: tumbling_processing_time # other options: tumbling_event_time, sliding_processing_time, sliding_event_time
+ window_size: 5
+ window_timestamp_field: test_time
+ functions:
+ - function: NUMBER_SUM
+ lookup_fields: [ sessions ]
+
+ table_processor:
+ type: table
+ functions:
+ - function: JSON_UNROLL
+ lookup_fields: [ encapsulation ]
+ output_fields: [ new_name ]
+
+application: # [object] Application Configuration
+ env: # [object] Environment Variables
+ name: groot-stream-job # [string] Job Name
+ pipeline:
+ object-reuse: true # [boolean] Object Reuse, default is false
+ topology: # [array of object] Node List. It will be used build data flow for job dag graph.
+    - name: inline_source # [string] Node Name, must be unique. It will be used as the name of the corresponding Flink operator, e.g. kafka_source for a processor of type SOURCE.
+      parallelism: 1 # [number] Operator-Level Parallelism.
+      downstream: [test_split, collect_sink]
+ - name: test_split
+ parallelism: 1
+      downstream: [table_processor, pre_etl_processor]
+    - name: pre_etl_processor
+      parallelism: 1
+      downstream: [collect_sink]
+    - name: table_processor
+      parallelism: 1
+      downstream: [collect_sink]
+ - name: collect_sink
+ parallelism: 1