|           |                                                  |                           |
|-----------|--------------------------------------------------|---------------------------|
| author    | doufenghu <[email protected]>                     | 2024-07-13 17:21:53 +0800 |
| committer | doufenghu <[email protected]>                     | 2024-07-13 17:21:53 +0800 |
| commit    | e2196956bdc8a9737a5bbacf8a20823020936b55 (patch)  |                           |
| tree      | 115fec75a114e47f76c84999382a3be44ea49c90          |                           |
| parent    | 321907759e968741690d691f43d1527a2b32fc4b (diff)   |                           |
[Improve][Test] Integration tests (IT) are now optional and are no longer executed by default during the mvn compile and deploy process. In the job templates, processor and filter implementation types are described by identifiers.
40 files changed, 1103 insertions, 789 deletions
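In short, the job templates now reference filter and processor implementations by short identifiers instead of fully qualified class names. Below is a minimal sketch of the new syntax, assembled from the `config/template/grootstream_job_template.yaml` changes in the diff below; the operator names and the Aviator expression are the template's own placeholders.

```yaml
filters:
  filter_operator:                 # AviatorFilter operator name, must be unique
    type: aviator                  # was: com.geedgenetworks.core.filter.AviatorFilter
    properties:
      expression: event.server_ip != '12.12.12.12'   # Aviator expression, returns true or false

processing_pipelines:
  processor:                       # projection processor name, must be unique
    type: projection               # was: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
    remove_fields:
    output_fields:
```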
@@ -59,6 +59,7 @@ Run the following Maven command to build the project modules and Skip Tests: ```shell ./mvnw clean install -DskipTests ``` + ### Deploying #### 1.Download the release package Download the latest release package from the [Releases](https://git.mesalab.cn/galaxy/platform/groot-stream/-/releases). diff --git a/config/grootstream.yaml b/config/grootstream.yaml index 31c2ae2..e01fda3 100644 --- a/config/grootstream.yaml +++ b/config/grootstream.yaml @@ -16,5 +16,3 @@ grootstream: hos.bucket.name.traffic_file: traffic_file_bucket hos.bucket.name.troubleshooting_file: troubleshooting_file_bucket scheduler.knowledge_base.update.interval.minutes: 5 - hadoop.dfs.namenodes: 192.168.44.12 - hadoop.dfs.replication: 1 diff --git a/config/template/grootstream_job_template.yaml b/config/template/grootstream_job_template.yaml index 8318b8b..9f64abe 100644 --- a/config/template/grootstream_job_template.yaml +++ b/config/template/grootstream_job_template.yaml @@ -128,7 +128,7 @@ sources: # [object] Define connector source # filters: # [object] Define filter operator filter_operator: # [object] AviatorFilter operator name, must be unique. - type: com.geedgenetworks.core.filter.AviatorFilter + type: aviator # [string] Filter Type properties: expression: event.server_ip != '12.12.12.12' # [string] Aviator expression, it return true or false. @@ -139,7 +139,7 @@ filters: # [object] Define filter operator # preprocessing_pipelines: # [object] Define Processors for preprocessing pipelines. preprocessor: # [object] Define projection processor name, must be unique. - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection # [string] Processor Type functions: # [array of object] Define UDFs - function: DROP # [string] Define DROP function for filter event lookup_fields: [] @@ -152,7 +152,7 @@ preprocessing_pipelines: # [object] Define Processors for preprocessing pipeline # processing_pipelines: # [object] Define Processors for processing pipelines. processor: # [object] Define projection processor name, must be unique. - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection # [string] Processor Type remove_fields: output_fields: functions: # [array of object] Function List @@ -277,7 +277,7 @@ processing_pipelines: # [object] Define Processors for processing pipelines. output_fields: [ asn_list ] metrics_processor: # [object] metrics processing Pipeline - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection output_fields: properties: key: value @@ -325,7 +325,7 @@ processing_pipelines: # [object] Define Processors for processing pipelines. postprocessing_pipelines: # [object] Define Processors for postprocessing pipelines. postprocessor: # [object] Define projection processor name, must be unique. 
- type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection remove_fields: [log_id, device_tag, dup_traffic_flag] # diff --git a/config/template/mock_schema/session_record_mock_desc.json b/config/template/mock_schema/session_record_mock_desc.json index b8d8205..c8c4acf 100644 --- a/config/template/mock_schema/session_record_mock_desc.json +++ b/config/template/mock_schema/session_record_mock_desc.json @@ -12,8 +12,8 @@ { "name": "sled_ip", "type": "IPv4", - "start": "92.168.40.2", - "end": "92.168.40.100" + "start": "192.168.40.2", + "end": "192.168.40.100" }, { "name": "tcp_handshake_latency_ms", @@ -99,7 +99,7 @@ "type": "Union", "unionFields": [ { - "weight": 10, + "weight": 8, "fields": [ { "name": "direction", @@ -151,7 +151,7 @@ ] }, { - "weight": 5, + "weight": 2, "fields": [ { "name": "direction", diff --git a/docs/connector/config-encryption-decryption.md b/docs/connector/config-encryption-decryption.md index 230e37e..3146569 100644 --- a/docs/connector/config-encryption-decryption.md +++ b/docs/connector/config-encryption-decryption.md @@ -1,11 +1,13 @@ # Config File Encryption And Decryption ## Introduction -In production environments, sensitive configuration items such as passwords are required to be encrypted and cannot be stored in plain text. + +In production environments, sensitive configuration items such as passwords are required to be encrypted and cannot be stored in plain text. ## How to use + Groot Stream default support base64 and AES encryption and decryption. - + Base64 encryption support encrypt the following parameters: - username - password @@ -25,131 +27,136 @@ AES encryption support encrypt the following parameters: Next, I'll show how to quickly use groot-stream's own `aes` encryption: 1. 
Add a new option `shade.identifier` in env block of config file, this option indicate what the encryption method that you want to use, in this example, we should add `shade.identifier = aes` in config as the following shown: - ```yaml - sources: # [object] Define connector source - inline_source: - type: inline - properties: - data: '{"recv_time": 1705565615, "tcp_rtt_ms":128,"decoded_as":"HTTP", "http_version":"http1","http_request_line":"GET / HTTP/1.1","http_host":"www.ct.cn","http_url":"www.ct.cn/","http_user_agent":"curl/8.0.1","http_status_code":200,"http_response_line":"HTTP/1.1 200 OK","http_response_content_type":"text/html; charset=UTF-8","http_response_latency_ms":31,"http_session_duration_ms":5451,"in_src_mac":"ba:bb:a7:3c:67:1c","in_dest_mac":"86:dd:7a:8f:ae:e2","out_src_mac":"86:dd:7a:8f:ae:e2","out_dest_mac":"ba:bb:a7:3c:67:1c","tcp_client_isn":678677906,"tcp_server_isn":1006700307,"address_type":4,"client_ip":"192.11.22.22","server_ip":"8.8.8.8","client_port":42751,"server_port":80,"in_link_id":65535,"out_link_id":65535,"start_timestamp_ms":1703646546127,"end_timestamp_ms":1703646551702,"duration_ms":5575,"sent_pkts":97,"sent_bytes":5892,"received_pkts":250,"received_bytes":333931,"tcp_c2s_ip_fragments":0,"tcp_s2c_ip_fragments":0,"tcp_c2s_rtx_pkts":0,"tcp_c2s_rtx_bytes":0,"tcp_s2c_rtx_pkts":0,"tcp_s2c_rtx_bytes":0,"tcp_c2s_o3_pkts":0,"tcp_s2c_o3_pkts":0,"tcp_c2s_lost_bytes":0,"tcp_s2c_lost_bytes":0,"flags":26418,"flags_identify_info":[100,1,100,60,150,100,1,2],"app_transition":"http.1111.test_1_1","decoded_as":"HTTP","server_fqdn":"www.ct.cn","app":"test_1_1","decoded_path":"ETHERNET.IPv4.TCP.http","fqdn_category_list":[1767],"t_vsys_id":1,"vsys_id":1,"session_id":290538039798223400,"tcp_handshake_latency_ms":41,"client_os_desc":"Windows","server_os_desc":"Linux","data_center":"center-xxg-tsgx","device_group":"group-xxg-tsgx","device_tag":"{\"tags\":[{\"tag\":\"data_center\",\"value\":\"center-xxg-tsgx\"},{\"tag\":\"device_group\",\"value\":\"group-xxg-tsgx\"}]}","device_id":"9800165603247024","sled_ip":"192.168.40.39","dup_traffic_flag":0}' - format: json - json.ignore.parse.errors: false - - sinks: - clickhouse_sink: - type: clickhouse - properties: - host: 192.168.44.12:9001 - table: tsg_galaxy_v3.inline_source_test_local - batch.size: 10 - batch.interval: 1s - connection.user: default - connection.password: 123456 - - application: - env: - name: example-inline-to-clickhouse - parallelism: 3 - shade.identifier: aes - pipeline: - object-reuse: true - topology: - - name: inline_source - downstream: [ clickhouse_sink ] - - name: clickhouse_sink - downstream: [] - ``` + + ```yaml + sources: # [object] Define connector source + inline_source: + type: inline + properties: + data: '{"recv_time": 1705565615, "tcp_rtt_ms":128,"decoded_as":"HTTP", "http_version":"http1","http_request_line":"GET / HTTP/1.1","http_host":"www.ct.cn","http_url":"www.ct.cn/","http_user_agent":"curl/8.0.1","http_status_code":200,"http_response_line":"HTTP/1.1 200 OK","http_response_content_type":"text/html; 
charset=UTF-8","http_response_latency_ms":31,"http_session_duration_ms":5451,"in_src_mac":"ba:bb:a7:3c:67:1c","in_dest_mac":"86:dd:7a:8f:ae:e2","out_src_mac":"86:dd:7a:8f:ae:e2","out_dest_mac":"ba:bb:a7:3c:67:1c","tcp_client_isn":678677906,"tcp_server_isn":1006700307,"address_type":4,"client_ip":"192.11.22.22","server_ip":"8.8.8.8","client_port":42751,"server_port":80,"in_link_id":65535,"out_link_id":65535,"start_timestamp_ms":1703646546127,"end_timestamp_ms":1703646551702,"duration_ms":5575,"sent_pkts":97,"sent_bytes":5892,"received_pkts":250,"received_bytes":333931,"tcp_c2s_ip_fragments":0,"tcp_s2c_ip_fragments":0,"tcp_c2s_rtx_pkts":0,"tcp_c2s_rtx_bytes":0,"tcp_s2c_rtx_pkts":0,"tcp_s2c_rtx_bytes":0,"tcp_c2s_o3_pkts":0,"tcp_s2c_o3_pkts":0,"tcp_c2s_lost_bytes":0,"tcp_s2c_lost_bytes":0,"flags":26418,"flags_identify_info":[100,1,100,60,150,100,1,2],"app_transition":"http.1111.test_1_1","decoded_as":"HTTP","server_fqdn":"www.ct.cn","app":"test_1_1","decoded_path":"ETHERNET.IPv4.TCP.http","fqdn_category_list":[1767],"t_vsys_id":1,"vsys_id":1,"session_id":290538039798223400,"tcp_handshake_latency_ms":41,"client_os_desc":"Windows","server_os_desc":"Linux","data_center":"center-xxg-tsgx","device_group":"group-xxg-tsgx","device_tag":"{\"tags\":[{\"tag\":\"data_center\",\"value\":\"center-xxg-tsgx\"},{\"tag\":\"device_group\",\"value\":\"group-xxg-tsgx\"}]}","device_id":"9800165603247024","sled_ip":"192.168.40.39","dup_traffic_flag":0}' + format: json + json.ignore.parse.errors: false + + sinks: + clickhouse_sink: + type: clickhouse + properties: + host: 192.168.44.12:9001 + table: tsg_galaxy_v3.inline_source_test_local + batch.size: 10 + batch.interval: 1s + connection.user: default + connection.password: 123456 + + application: + env: + name: example-inline-to-clickhouse + parallelism: 3 + shade.identifier: aes + pipeline: + object-reuse: true + topology: + - name: inline_source + downstream: [ clickhouse_sink ] + - name: clickhouse_sink + downstream: [] + ``` 2. Using the `encrypt` command to encrypt the configuration file: - ```shell - ${GROOTSTREAM_HOME}/bin/start.sh -c config/inline_to_clickhouse.yaml --encrypt - ``` + + ```shell + ${GROOTSTREAM_HOME}/bin/start.sh -c config/inline_to_clickhouse.yaml --encrypt + ``` 3. Then you can see the encrypted configuration file in the terminal: -The clickhouse_sink's connection.user and connection.password has been encrypted. We will using encrypted content replace the original content. 
- ```log - 2024-01-30 19:11:55,066 INFO com.geedgenetworks.bootstrap.command.ConfEncryptCommand [main] - Encrypt config: - { - "application" : { - "env" : { - "name" : "example-inline-to-clickhouse", - "parallelism" : 3, - "pipeline" : { - "object-reuse" : true - }, - "shade.identifier" : "aes", - "topology" : [ - { - "downstream" : [ - "clickhouse_sink" - ], - "name" : "inline_source" - }, - { - "downstream" : [], - "name" : "clickhouse_sink" - } - ] - } - }, - "sinks" : { - "clickhouse_sink" : { - "properties" : { - "batch.interval" : "1s", - "batch.size" : 10, - "connection.password" : "d7598fa3b27a65b54940eb5aec5c853f", - "connection.user" : "e54c9568586180eede1506eecf3574e9", - "host" : "192.168.44.12:9001", - "table" : "tsg_galaxy_v3.inline_source_test_local" - }, - "type" : "clickhouse" - } - }, - "sources" : { - "inline_source" : { - "properties" : { - "data" : "{\"recv_time\": 1705565615, \"tcp_rtt_ms\":128,\"decoded_as\":\"HTTP\", \"http_version\":\"http1\",\"http_request_line\":\"GET / HTTP/1.1\",\"http_host\":\"www.ct.cn\",\"http_url\":\"www.ct.cn/\",\"http_user_agent\":\"curl/8.0.1\",\"http_status_code\":200,\"http_response_line\":\"HTTP/1.1 200 OK\",\"http_response_content_type\":\"text/html; charset=UTF-8\",\"http_response_latency_ms\":31,\"http_session_duration_ms\":5451,\"in_src_mac\":\"ba:bb:a7:3c:67:1c\",\"in_dest_mac\":\"86:dd:7a:8f:ae:e2\",\"out_src_mac\":\"86:dd:7a:8f:ae:e2\",\"out_dest_mac\":\"ba:bb:a7:3c:67:1c\",\"tcp_client_isn\":678677906,\"tcp_server_isn\":1006700307,\"address_type\":4,\"client_ip\":\"192.11.22.22\",\"server_ip\":\"8.8.8.8\",\"client_port\":42751,\"server_port\":80,\"in_link_id\":65535,\"out_link_id\":65535,\"start_timestamp_ms\":1703646546127,\"end_timestamp_ms\":1703646551702,\"duration_ms\":5575,\"sent_pkts\":97,\"sent_bytes\":5892,\"received_pkts\":250,\"received_bytes\":333931,\"tcp_c2s_ip_fragments\":0,\"tcp_s2c_ip_fragments\":0,\"tcp_c2s_rtx_pkts\":0,\"tcp_c2s_rtx_bytes\":0,\"tcp_s2c_rtx_pkts\":0,\"tcp_s2c_rtx_bytes\":0,\"tcp_c2s_o3_pkts\":0,\"tcp_s2c_o3_pkts\":0,\"tcp_c2s_lost_bytes\":0,\"tcp_s2c_lost_bytes\":0,\"flags\":26418,\"flags_identify_info\":[100,1,100,60,150,100,1,2],\"app_transition\":\"http.1111.test_1_1\",\"decoded_as\":\"HTTP\",\"server_fqdn\":\"www.ct.cn\",\"app\":\"test_1_1\",\"decoded_path\":\"ETHERNET.IPv4.TCP.http\",\"fqdn_category_list\":[1767],\"t_vsys_id\":1,\"vsys_id\":1,\"session_id\":290538039798223400,\"tcp_handshake_latency_ms\":41,\"client_os_desc\":\"Windows\",\"server_os_desc\":\"Linux\",\"data_center\":\"center-xxg-tsgx\",\"device_group\":\"group-xxg-tsgx\",\"device_tag\":\"{\\\"tags\\\":[{\\\"tag\\\":\\\"data_center\\\",\\\"value\\\":\\\"center-xxg-tsgx\\\"},{\\\"tag\\\":\\\"device_group\\\",\\\"value\\\":\\\"group-xxg-tsgx\\\"}]}\",\"device_id\":\"9800165603247024\",\"sled_ip\":\"192.168.40.39\",\"dup_traffic_flag\":0}", - "format" : "json", - "json.ignore.parse.errors" : false - }, - "type" : "inline" - } - } - } - ``` + The clickhouse_sink's connection.user and connection.password has been encrypted. We will using encrypted content replace the original content. 
+ + ```log + 2024-01-30 19:11:55,066 INFO com.geedgenetworks.bootstrap.command.ConfEncryptCommand [main] - Encrypt config: + { + "application" : { + "env" : { + "name" : "example-inline-to-clickhouse", + "parallelism" : 3, + "pipeline" : { + "object-reuse" : true + }, + "shade.identifier" : "aes", + "topology" : [ + { + "downstream" : [ + "clickhouse_sink" + ], + "name" : "inline_source" + }, + { + "downstream" : [], + "name" : "clickhouse_sink" + } + ] + } + }, + "sinks" : { + "clickhouse_sink" : { + "properties" : { + "batch.interval" : "1s", + "batch.size" : 10, + "connection.password" : "d7598fa3b27a65b54940eb5aec5c853f", + "connection.user" : "e54c9568586180eede1506eecf3574e9", + "host" : "192.168.44.12:9001", + "table" : "tsg_galaxy_v3.inline_source_test_local" + }, + "type" : "clickhouse" + } + }, + "sources" : { + "inline_source" : { + "properties" : { + "data" : "{\"recv_time\": 1705565615, \"tcp_rtt_ms\":128,\"decoded_as\":\"HTTP\", \"http_version\":\"http1\",\"http_request_line\":\"GET / HTTP/1.1\",\"http_host\":\"www.ct.cn\",\"http_url\":\"www.ct.cn/\",\"http_user_agent\":\"curl/8.0.1\",\"http_status_code\":200,\"http_response_line\":\"HTTP/1.1 200 OK\",\"http_response_content_type\":\"text/html; charset=UTF-8\",\"http_response_latency_ms\":31,\"http_session_duration_ms\":5451,\"in_src_mac\":\"ba:bb:a7:3c:67:1c\",\"in_dest_mac\":\"86:dd:7a:8f:ae:e2\",\"out_src_mac\":\"86:dd:7a:8f:ae:e2\",\"out_dest_mac\":\"ba:bb:a7:3c:67:1c\",\"tcp_client_isn\":678677906,\"tcp_server_isn\":1006700307,\"address_type\":4,\"client_ip\":\"192.11.22.22\",\"server_ip\":\"8.8.8.8\",\"client_port\":42751,\"server_port\":80,\"in_link_id\":65535,\"out_link_id\":65535,\"start_timestamp_ms\":1703646546127,\"end_timestamp_ms\":1703646551702,\"duration_ms\":5575,\"sent_pkts\":97,\"sent_bytes\":5892,\"received_pkts\":250,\"received_bytes\":333931,\"tcp_c2s_ip_fragments\":0,\"tcp_s2c_ip_fragments\":0,\"tcp_c2s_rtx_pkts\":0,\"tcp_c2s_rtx_bytes\":0,\"tcp_s2c_rtx_pkts\":0,\"tcp_s2c_rtx_bytes\":0,\"tcp_c2s_o3_pkts\":0,\"tcp_s2c_o3_pkts\":0,\"tcp_c2s_lost_bytes\":0,\"tcp_s2c_lost_bytes\":0,\"flags\":26418,\"flags_identify_info\":[100,1,100,60,150,100,1,2],\"app_transition\":\"http.1111.test_1_1\",\"decoded_as\":\"HTTP\",\"server_fqdn\":\"www.ct.cn\",\"app\":\"test_1_1\",\"decoded_path\":\"ETHERNET.IPv4.TCP.http\",\"fqdn_category_list\":[1767],\"t_vsys_id\":1,\"vsys_id\":1,\"session_id\":290538039798223400,\"tcp_handshake_latency_ms\":41,\"client_os_desc\":\"Windows\",\"server_os_desc\":\"Linux\",\"data_center\":\"center-xxg-tsgx\",\"device_group\":\"group-xxg-tsgx\",\"device_tag\":\"{\\\"tags\\\":[{\\\"tag\\\":\\\"data_center\\\",\\\"value\\\":\\\"center-xxg-tsgx\\\"},{\\\"tag\\\":\\\"device_group\\\",\\\"value\\\":\\\"group-xxg-tsgx\\\"}]}\",\"device_id\":\"9800165603247024\",\"sled_ip\":\"192.168.40.39\",\"dup_traffic_flag\":0}", + "format" : "json", + "json.ignore.parse.errors" : false + }, + "type" : "inline" + } + } + } + ``` 4. Of course, you can also decrypt the encrypted configuration file, just execute the following command: - ```shell - ${GROOTSTREAM_HOME}/bin/start.sh -c config/inline_to_clickhouse.yaml --decrypt - ``` + + ```shell + ${GROOTSTREAM_HOME}/bin/start.sh -c config/inline_to_clickhouse.yaml --decrypt + ``` ## How to implement user-defined encryption and decryption -1. 
Create a new class and implement interface `ConfigShade`, this interface has the following methods: - ```java - public interface ConfigShade { - /** - * The unique identifier of the current interface, used it to select the correct {@link - * ConfigShade} - */ - String getIdentifier(); - - /** - * Encrypt the content - * - * @param content The content to encrypt - */ - String encrypt(String content); - - /** - * Decrypt the content - * - * @param content The content to decrypt - */ - String decrypt(String content); - - /** To expand the options that user want to encrypt */ - default String[] sensitiveOptions() { - return new String[0]; - } - } - ``` -2. Add `com.geedgenetworks.common.config.ConfigShade` in `resources/META-INF/services` +1. Create a new class and implement interface `ConfigShade`, this interface has the following methods: + + ```java + public interface ConfigShade { + /** + * The unique identifier of the current interface, used it to select the correct {@link + * ConfigShade} + */ + String getIdentifier(); + + /** + * Encrypt the content + * + * @param content The content to encrypt + */ + String encrypt(String content); + + /** + * Decrypt the content + * + * @param content The content to decrypt + */ + String decrypt(String content); + + /** To expand the options that user want to encrypt */ + default String[] sensitiveOptions() { + return new String[0]; + } + } + ``` +2. Add `com.geedgenetworks.common.config.ConfigShade` in `resources/META-INF/services` 3. Change the option `shade.identifier` to the value that you defined in `ConfigShade#getIdentifier`of you config file. diff --git a/docs/connector/connector.md b/docs/connector/connector.md index 943b03c..56d3242 100644 --- a/docs/connector/connector.md +++ b/docs/connector/connector.md @@ -1,4 +1,5 @@ # Source Connector + Source Connector contains some common core features, and each source connector supports them to varying degrees. ## Common Source Options @@ -18,21 +19,25 @@ sources: ${prop_key}: ${prop_value} ``` -| Name | Type | Required | Default | Description | -|-------------|---------------|----------|---------|--------------------------------------------------------------------------------------------------------------------------| -| type | String | Yes | (none) | The type of the source connector. The `SourceTableFactory` will use this value as identifier to create source connector. | -| schema | Map | No | (none) | The source table schema, config through fields or local_file or url. | -| properties | Map of String | Yes | (none) | The source connector customize properties, more details see the [Source](source) documentation. | +| Name | Type | Required | Default | Description | +|------------|---------------|----------|---------|--------------------------------------------------------------------------------------------------------------------------| +| type | String | Yes | (none) | The type of the source connector. The `SourceTableFactory` will use this value as identifier to create source connector. | +| schema | Map | No | (none) | The source table schema, config through fields or local_file or url. | +| properties | Map of String | Yes | (none) | The source connector customize properties, more details see the [Source](source) documentation. | ## Schema Field Projection -The source connector supports reading only specified fields from the data source. For example `KafkaSource` will read all content from topic and then use `fields` to filter unnecessary columns. 
+ +The source connector supports reading only specified fields from the data source. For example `KafkaSource` will read all content from topic and then use `fields` to filter unnecessary columns. The Schema Structure refer to [Schema Structure](../user-guide.md#schema-structure). ## Schema Config + Schema can be configured through fields or local_file or url. If not set schema, all fields(Map<String, Object>) will be output. And local_file and url only support Avro schema format. More details see the [Avro Schema](https://avro.apache.org/docs/1.11.1/specification/). ### Fields + It can be configured through array or sql style. It is recommended to use array style, which is more readable. + ```yaml schema: # array style @@ -50,100 +55,118 @@ schema: ``` ### Local File + To retrieve the schema from a local file using its absolute path. + > Ensures that the file path is accessible to all nodes in your Flink cluster. -```yaml -schema: - # by array - fields: - local_file: "/path/to/schema.json" -``` +> +> ```yaml +> schema: +> # by array +> fields: +> local_file: "/path/to/schema.json" +> ``` ### URL + Some connectors support periodically fetching and updating the schema from a URL, such as the `ClickHouse Sink`. + ```yaml schema: # by array fields: url: "https://localhost:8080/schema.json" ``` + ## Mock Data Type + The mock data type is used to define the template of the mock data. -| Mock Type | Parameter | Result Type | Default | Description | +| Mock Type | Parameter | Result Type | Default | Description | |-----------------------------------------|-------------|-----------------------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------| | **[Number](#Number)** | - | **int/bigint/double** | - | **Randomly generate a number.** | | - | min | number | 0 | The minimum value (include). | | - | max | number | int32.max | The maximum value (exclusive). | -| - | options | array of number | (none) | The optional values. If set, the random value will be selected from the options and `start` and `end` will be ignored. | -| - | random | boolean | true | Default is random mode. If set to false, the value will be generated in order. | -| **[Sequence](#Sequence)** | - | **bigint** | - | **Generate a sequence number based on a specific step value .** | +| - | options | array of number | (none) | The optional values. If set, the random value will be selected from the options and `start` and `end` will be ignored. | +| - | random | boolean | true | Default is random mode. If set to false, the value will be generated in order. | +| **[Sequence](#Sequence)** | - | **bigint** | - | **Generate a sequence number based on a specific step value .** | | - | start | bigint | 0 | The first number in the sequence (include). | -| - | step | bigint | 1 | The number to add to each subsequent value. | -| **[UniqueSequence](#UniqueSequence)** | - | **bigint** | - | **Generate a global unique sequence number.** | +| - | step | bigint | 1 | The number to add to each subsequent value. | +| **[UniqueSequence](#UniqueSequence)** | - | **bigint** | - | **Generate a global unique sequence number.** | | - | start | bigint | 0 | The first number in the sequence (include). | -| **[String](#String)** | - | string | - | **Randomly generate a string.** | -| - | regex | string | [a-zA-Z]{0,5} | The regular expression. | -| - | options | array of string | (none) | The optional values. 
If set, the random value will be selected from the options and `regex` will be ignored. | +| **[String](#String)** | - | string | - | **Randomly generate a string.** | +| - | regex | string | [a-zA-Z]{0,5} | The regular expression. | +| - | options | array of string | (none) | The optional values. If set, the random value will be selected from the options and `regex` will be ignored. | | - | random | boolean | true | Default is random mode. If set to false, the options value will be generated in order. | -| **[Timestamp](#Timestamp)** | - | **bigint** | - | **Generate a unix timestamp in milliseconds or seconds.** | -| - | unit | string | second | The unit of the timestamp. The optional values are `second`, `millis`. | -| **[FormatTimestamp](#FormatTimestamp)** | - | **string** | - | **Generate a formatted timestamp.** | -| - | format | string | yyyy-MM-dd HH:mm:ss | The format to output. | -| - | utc | boolean | false | Default is local time. If set to true, the time will be converted to UTC time. | -| **[IPv4](#IPv4)** | - | **string** | - | **Randomly generate a IPv4 address.** | -| - | start | string | 0.0.0.0 | The minimum value of the IPv4 address(include). | -| - | end | string | 255.255.255.255 | The maximum value of the IPv4 address(include). | -| **[Expression](#Expression)** | - | string | - | **Use library [Datafaker](https://www.datafaker.net/documentation/expressions/) expressions to generate fake data.** | -| - | expression | string | (none) | The datafaker expression used #{expression}. | +| **[Timestamp](#Timestamp)** | - | **bigint** | - | **Generate a unix timestamp in milliseconds or seconds.** | +| - | unit | string | second | The unit of the timestamp. The optional values are `second`, `millis`. | +| **[FormatTimestamp](#FormatTimestamp)** | - | **string** | - | **Generate a formatted timestamp.** | +| - | format | string | yyyy-MM-dd HH:mm:ss | The format to output. | +| - | utc | boolean | false | Default is local time. If set to true, the time will be converted to UTC time. | +| **[IPv4](#IPv4)** | - | **string** | - | **Randomly generate a IPv4 address.** | +| - | start | string | 0.0.0.0 | The minimum value of the IPv4 address(include). | +| - | end | string | 255.255.255.255 | The maximum value of the IPv4 address(include). | +| **[Expression](#Expression)** | - | string | - | **Use library [Datafaker](https://www.datafaker.net/documentation/expressions/) expressions to generate fake data.** | +| - | expression | string | (none) | The datafaker expression used #{expression}. | | **[Eval](#Eval)** | - | **string** | - | **Use AviatorScript value expression to generate data.** | -| - | expression | string | (none) | Support basic arithmetic operations and function calls. More details sess [AviatorScript](https://www.yuque.com/boyan-avfmj/aviatorscript). | -| **[Object](#Object)** | - | **struct/object** | - | **Generate a object data structure. It used to define the nested structure of the mock data.** | -| - | fields | array of object | (none) | The fields of the object. | -| **[Union](#Union)** | - | - | - | **Generate a union data structure with multiple mock data type fields.** | -| - | unionFields | array of object | (none) | The fields of the object. | +| - | expression | string | (none) | Support basic arithmetic operations and function calls. More details sess [AviatorScript](https://www.yuque.com/boyan-avfmj/aviatorscript). | +| **[Object](#Object)** | - | **struct/object** | - | **Generate a object data structure. 
It used to define the nested structure of the mock data.** | +| - | fields | array of object | (none) | The fields of the object. | +| **[Union](#Union)** | - | - | - | **Generate a union data structure with multiple mock data type fields.** | +| - | unionFields | array of object | (none) | The fields of the object. | | - | - fields | - array of object | (none) | | -| - | - weight | - int | 0 | The weight of the generated object. | +| - | - weight | - int | 0 | The weight of the generated object. | | | random | boolean | true | Default is random mode. If set to false, the options value will be generated in order. | ### Common Parameters Mock data type supports some common parameters. -| Parameter | Type | Default | Description | +| Parameter | Type | Default | Description | |---------------------|---------|---------|----------------------------------------------------------------------------------------| -| [nullRate](#String) | double | 1 | Null value rate. The value range is [0, 1]. If set to 0.1, the null value rate is 10%. | -| [array](#String) | boolean | false | Array flag. If set to true, the value will be generated as an array. | -| arrayLenMin | int | 0 | The minimum length of the array(include). `array` flag must be set to true. | +| [nullRate](#String) | double | 1 | Null value rate. The value range is [0, 1]. If set to 0.1, the null value rate is 10%. | +| [array](#String) | boolean | false | Array flag. If set to true, the value will be generated as an array. | +| arrayLenMin | int | 0 | The minimum length of the array(include). `array` flag must be set to true. | | arrayLenMax | int | 5 | The maximum length of the array(include). `array` flag must be set to true. | - ### Number + - Randomly generate a integer number between 0 and 10000. + ```json {"name":"int_random","type":"Number","min":0,"max":10000} ``` + - Generate a integer number between 0 and 10000, and the value will be generated in order. + ```json {"name":"int_inc","type":"Number","min":0,"max":10000,"random":false} ``` + - Randomly generate a integer number from 20, 22, 25, 30. + ```json {"name":"int_options","type":"Number","options":[20,22,25,30]} ``` + - randomly generate a double number between 0 and 10000. + ```json {"name":"double_random","type":"Number","min":0.0,"max":10000.0} ``` + ### Sequence - Generate a sequence number starting from 0 and incrementing by 2. + ```json {"name":"bigint_sequence","type":"Sequence","start":0,"step":2} ``` + ### UniqueSequence - Generate a global unique sequence number starting from 0. + ```json {"name":"id","type":"UniqueSequence","start":0} ``` @@ -151,14 +174,19 @@ Mock data type supports some common parameters. ### String - Randomly generate s string with a length between 0 and 5. And set null value rate is 10%. + ```json {"name":"str_regex","type":"String","regex":"[a-z]{5,10}","nullRate":0.1} -``` +``` + - Randomly generate a string from "a", "b", "c", "d". + ```json {"name":"str_options","type":"String","options":["a","b","c","d"]} ``` + - Randomly generate a array of string. The length of the array is between 1 and 3. + ```json {"name":"array_str","type":"String","regex":"[a-z]{5,10}","array":true,"arrayLenMin":1,"arrayLenMax":3} ``` @@ -166,22 +194,29 @@ Mock data type supports some common parameters. ### Timestamp - Generate a current Unix timestamp in milliseconds. + ```json {"name":"timestamp_ms","type":"Timestamp","unit":"millis"} ``` + ### FormatTimestamp - Generate a formatted timestamp string using format `yyyy-MM-dd HH:mm:ss`. 
+ ```json {"name":"timestamp_str","type":"FormatTimestamp","format":"yyyy-MM-dd HH:mm:ss"} ``` + - Generate a formatted timestamp string using format `yyyy-MM-dd HH:mm:ss.SSS`. + ```json {"name":"timestamp_str","type":"FormatTimestamp","format":"yyyy-MM-dd HH:mm:ss.SSS"} ``` ### IPv4 + - Generate a IPv4 address between 192.168.20.1 and 192.168.20.255. + ```json {"name":"ip","type":"IPv4","start":"192.168.20.1","end":"192.168.20.255"} ``` @@ -189,23 +224,33 @@ Mock data type supports some common parameters. ### Expression - Generate a fake email address. + ```json {"name":"emailAddress","type":"Expression","expression":"#{internet.emailAddress}"} ``` + - Generate a fake domain name. + ```json {"name":"domain","type":"Expression","expression":"#{internet.domainName}"} ``` + - Generate a fake IPv6 address. + ```json {"name":"ipv6","type":"Expression","expression":"#{internet.ipV6Address}"} ``` + - Generate a fake phone number. + ```json {"name":"phoneNumber","type":"Expression","expression":"#{phoneNumber.phoneNumber}"} ``` + ### Eval + - Generate a value by using AviatorScript expression. Commonly used for arithmetic operations. + ```json {"name": "bytes", "type": "Eval", "expression": "in_bytes + out_bytes"} ``` @@ -213,16 +258,21 @@ Mock data type supports some common parameters. ### Object - Generate a object data structure. + ```json {"name":"object","type":"Object","fields":[{"name":"str","type":"String","regex":"[a-z]{5,10}","nullRate":0.1},{"name":"cate","type":"String","options":["a","b","c"]}]} ``` + output: + ```json {"object": {"str":"abcde","cate":"a"}} ``` ### Union + - Generate a union mock data type fields. Generate object_id and item_id fields. When object_id is 10, item_id is randomly generated from 1, 2, 3, 4, 5. When object_id is 20, item_id is randomly generated from 6, 7. The first object generates 5/7 of the total, and the second object generates 2/7 of the total. + ```json { "name": "unionFields", @@ -266,6 +316,7 @@ output: ``` # Sink Connector + The Sink Connector contains some common core features, and each sink connector supports these features to varying degrees. ## Common Sink Options @@ -283,9 +334,9 @@ sinks: ${prop_key}: ${prop_value} ``` -| Name | Type | Required | Default | Description | -|--------------|----------------|----------|---------|--------------------------------------------------------------------------------------------------------------------| -| type | String | Yes | (none) | The type of the sink connector. The `SinkTableFactory` will use this value as identifier to create sink connector. | -| schema | Map | No | (none) | The sink table schema, config through fields or local_file or url. | -| properties | Map of String | Yes | (none) | The sink connector customize properties, more details see the [Sink](sink) documentation. | +| Name | Type | Required | Default | Description | +|------------|---------------|----------|---------|--------------------------------------------------------------------------------------------------------------------| +| type | String | Yes | (none) | The type of the sink connector. The `SinkTableFactory` will use this value as identifier to create sink connector. | +| schema | Map | No | (none) | The sink table schema, config through fields or local_file or url. | +| properties | Map of String | Yes | (none) | The sink connector customize properties, more details see the [Sink](sink) documentation. 
| diff --git a/docs/connector/formats/json.md b/docs/connector/formats/json.md index 01aecd7..2402810 100644 --- a/docs/connector/formats/json.md +++ b/docs/connector/formats/json.md @@ -1,21 +1,26 @@ # JSON + > Format JSON -## Description -Event serialization and deserialization format. +> +> ## Description +> +> Event serialization and deserialization format. -| Name | Supported Versions | Maven | +| Name | Supported Versions | Maven | |-------------|--------------------|----------------------------------------------------------------------------------------------------------------------------| | Format JSON | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/format-json/) | ## Format Options -| Name | Type | Required | Default | Description | -|---------------------------|----------|----------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------| -| format | String | Yes | (none) | Specify what format to use, here should be 'json'. | -| json.ignore.parse.errors | Boolean | No | false | Skip the parsing error or throw an exception. If set to true, the parsing error will be ignored. If set to false, the parsing error will be thrown. | +| Name | Type | Required | Default | Description | +|--------------------------|---------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| format | String | Yes | (none) | Specify what format to use, here should be 'json'. | +| json.ignore.parse.errors | Boolean | No | false | Skip the parsing error or throw an exception. If set to true, the parsing error will be ignored. If set to false, the parsing error will be thrown. | # How to use + ## Inline uses example + ```json { "recv_time": 1705565615, @@ -135,8 +140,3 @@ application: ``` - - - - - diff --git a/docs/connector/formats/msgpack.md b/docs/connector/formats/msgpack.md index 1b603a6..997d691 100644 --- a/docs/connector/formats/msgpack.md +++ b/docs/connector/formats/msgpack.md @@ -1,21 +1,27 @@ # MessagePack + > Format MessagePack -## Description -MessagePack is a binary serialization format. If you need a fast and compact alternative of JSON, MessagePack is your friend. For example, a small integer can be encoded in a single byte, and short strings only need a single byte prefix + the original byte array. MessagePack implementation is already available in various languages (See also the list in http://msgpack.org) and works as a universal data format. +> +> ## Description +> +> MessagePack is a binary serialization format. If you need a fast and compact alternative of JSON, MessagePack is your friend. For example, a small integer can be encoded in a single byte, and short strings only need a single byte prefix + the original byte array. MessagePack implementation is already available in various languages (See also the list in http://msgpack.org) and works as a universal data format. 
-| Name | Supported Versions | Maven | -|---------------------|--------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| Format MessagePack | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/format-msgpack/) | +| Name | Supported Versions | Maven | +|--------------------|--------------------|-------------------------------------------------------------------------------------------------------------------------------| +| Format MessagePack | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/format-msgpack/) | ## Format Options -| Name | Type | Required | Default | Description | -|---------------------------|----------|----------|---------|---------------------------------------------------------| -| format | String | Yes | (none) | Specify what format to use, here should be 'msgpack'. | +| Name | Type | Required | Default | Description | +|--------|--------|----------|---------|-------------------------------------------------------| +| format | String | Yes | (none) | Specify what format to use, here should be 'msgpack'. | # How to use + ## Inline uses example + data: + ```json { "log_id": 1, @@ -55,8 +61,3 @@ application: ``` - - - - - diff --git a/docs/connector/formats/protobuf.md b/docs/connector/formats/protobuf.md index 18f86c8..2dfb65e 100644 --- a/docs/connector/formats/protobuf.md +++ b/docs/connector/formats/protobuf.md @@ -1,42 +1,47 @@ # Protobuf + > Format protobuf -## Description -Protocol buffers are Google’s language-neutral, platform-neutral, extensible mechanism for serializing structured data – think XML, but smaller, faster, and simpler. -It is very popular in Streaming Data Pipeline. Now support protobuf format in source connector. +> +> ## Description +> +> Protocol buffers are Google’s language-neutral, platform-neutral, extensible mechanism for serializing structured data – think XML, but smaller, faster, and simpler. +> It is very popular in Streaming Data Pipeline. Now support protobuf format in source connector. -| Name | Supported Versions | Maven | +| Name | Supported Versions | Maven | |-----------------|--------------------|--------------------------------------------------------------------------------------------------------------------------------| | Format Protobuf | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/format-protobuf/) | ## Format Options + > Ensures that the file path is accessible to all nodes in your Flink cluster. -| Name | Type | Required | Default | Description | -|-------------------------------|----------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| format | String | Yes | (none) | Specify what format to use, here should be 'protobuf'. | -| protobuf.descriptor.file.path | String | Yes | (none) | The descriptor file path. | -| protobuf.message.name | String | Yes | (none) | The protobuf messageName to look for in descriptor file. | -| protobuf.ignore.parse.errors | Boolean | No | false | Protobuf ignore parse errors, otherwise will throw exception. 
| -| protobuf.emit.default.values | Boolean | No | false | If true, default values will be emitted for missing fields. It is not recommended, because it will cause performance degradation. About basic data type, it is suggested to use `optional` instead of `required`. | +| Name | Type | Required | Default | Description | +|-------------------------------|---------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| format | String | Yes | (none) | Specify what format to use, here should be 'protobuf'. | +| protobuf.descriptor.file.path | String | Yes | (none) | The descriptor file path. | +| protobuf.message.name | String | Yes | (none) | The protobuf messageName to look for in descriptor file. | +| protobuf.ignore.parse.errors | Boolean | No | false | Protobuf ignore parse errors, otherwise will throw exception. | +| protobuf.emit.default.values | Boolean | No | false | If true, default values will be emitted for missing fields. It is not recommended, because it will cause performance degradation. About basic data type, it is suggested to use `optional` instead of `required`. | ## Data Type Mapping -| Protobuf Data Type | Data Type | Description | -|-------------------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------| -| int32 / uint32 / sint32 / fixed32 / sfixed32 | int | Protobuf data type is recommended to use `int32 / sint32`. The data type also support `int / bigint / float / dobule`. | -| int64 / uint64 / sint64 / fixed64 / sfixed64 | bigint | Protobuf data type is recommended to use `int64 / sint64`. The data type also support `int / bigint / float / dobule`. | -| float | float | Protobuf data type is recommended to use `double`. The data type also support `int / bigint / float / dobule`. | -| double | double | Protobuf data type is recommended to use `double`. The data type also support `int / bigint / float / dobule`. | -| bool | boolean | Protobuf data type is recommended to use `int32`. The data type also support `boolean / int(0:false,1:true) `. | -| enum | int | Protobuf data type is recommended to use `int32`. The data type also support `int`. | -| string | string | In data serialization support all data type converted to `String`. | -| bytes | binary | - | -| message | struct | - | -| repeated | array | - | - +| Protobuf Data Type | Data Type | Description | +|----------------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------| +| int32 / uint32 / sint32 / fixed32 / sfixed32 | int | Protobuf data type is recommended to use `int32 / sint32`. The data type also support `int / bigint / float / dobule`. | +| int64 / uint64 / sint64 / fixed64 / sfixed64 | bigint | Protobuf data type is recommended to use `int64 / sint64`. The data type also support `int / bigint / float / dobule`. | +| float | float | Protobuf data type is recommended to use `double`. The data type also support `int / bigint / float / dobule`. | +| double | double | Protobuf data type is recommended to use `double`. The data type also support `int / bigint / float / dobule`. | +| bool | boolean | Protobuf data type is recommended to use `int32`. The data type also support `boolean / int(0:false,1:true) `. 
| +| enum | int | Protobuf data type is recommended to use `int32`. The data type also support `int`. | +| string | string | In data serialization support all data type converted to `String`. | +| bytes | binary | - | +| message | struct | - | +| repeated | array | - | # How to use + ## protobuf uses example + ```protobuf syntax = "proto3"; @@ -272,7 +277,9 @@ message SessionRecord { string tunnel_endpoint_b_desc = 223; } ``` + Build protobuf file to binary descriptor file. Only support `proto3` syntax. If data type has null value, need add `optional` keyword. We recommend add `optional` keyword for int and double etc. + ```shell protoc --descriptor_set_out=session_record_test.desc session_record_test.proto ``` @@ -307,5 +314,3 @@ application: downstream: [] ``` - - diff --git a/docs/connector/formats/raw.md b/docs/connector/formats/raw.md index ae91fe9..853ac79 100644 --- a/docs/connector/formats/raw.md +++ b/docs/connector/formats/raw.md @@ -1,19 +1,23 @@ # Raw + > Format Raw -## Description -The Raw format allows to read and write raw (byte based) values as a single column. +> +> ## Description +> +> The Raw format allows to read and write raw (byte based) values as a single column. -| Name | Supported Versions | Maven | -|--------------|--------------------|---------------------------------------------------------------------------------------------------------------------------| -| Format Raw | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/format-raw/) | +| Name | Supported Versions | Maven | +|------------|--------------------|---------------------------------------------------------------------------------------------------------------------------| +| Format Raw | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/format-raw/) | ## Format Options -| Name | Type | Required | Default | Description | -|---------------------------|----------|----------|---------|---------------------------------------------------| -| format | String | Yes | (none) | Specify what format to use, here should be 'raw'. | +| Name | Type | Required | Default | Description | +|--------|--------|----------|---------|---------------------------------------------------| +| format | String | Yes | (none) | Specify what format to use, here should be 'raw'. | # How to use + ## Inline uses example ```yaml @@ -46,8 +50,3 @@ application: ``` - - - - - diff --git a/docs/connector/sink/clickhouse.md b/docs/connector/sink/clickhouse.md index 5256fd7..72536bd 100644 --- a/docs/connector/sink/clickhouse.md +++ b/docs/connector/sink/clickhouse.md @@ -1,7 +1,10 @@ # ClickHouse + > ClickHouse sink connector -## Description -Sink connector for ClickHouse, write data to ClickHouse. You need to know following concepts before using ClickHouse connector. +> +> ## Description +> +> Sink connector for ClickHouse, write data to ClickHouse. You need to know following concepts before using ClickHouse connector. > 1. ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP). > 2. The Sink Table must be created before using the ClickHouse connector. > 3. When writing data to sink table, you don't need to specify its schema, because the connector will automatically query current table's schema information using `DESCRIBE TABLE` command. @@ -10,10 +13,9 @@ Sink connector for ClickHouse, write data to ClickHouse. 
You need to know follow - [ ] [Rate Limiting](../../connector/connector.md) - ## Data Type Mapping -| Event Data Type | Clickhouse Data Type | +| Event Data Type | Clickhouse Data Type | |-----------------|------------------------------------------------------------------------------------| | STRING | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | | INT | Int8 / UInt8 / Int16 / UInt16 / Int32 | @@ -27,18 +29,17 @@ Sink connector for ClickHouse, write data to ClickHouse. You need to know follow | MAP | Map | | Byte[] | Binary | - ## Sink Options + In order to use the ClickHouse connector, the following dependencies are required. They can be download by Nexus Maven Repository. -| Datasource | Supported Versions | Maven | +| Datasource | Supported Versions | Maven | |------------|--------------------|-------------------------------------------------------------------------------------------------------------------------------------| | ClickHouse | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/connector-clickhouse/) | - ClickHouse sink custom properties. If properties belongs to ClickHouse JDBC Config, you can use `connection.` prefix to set. -| Name | Type | Required | Default | Description | +| Name | Type | Required | Default | Description | |---------------------|----------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | String | Yes | (none) | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"`. | | connection.database | String | Yes | (none) | The `ClickHouse` database. | @@ -51,6 +52,7 @@ ClickHouse sink custom properties. If properties belongs to ClickHouse JDBC Conf | connection.config | Map | No | (none) | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. | ## Example + This example read data of inline test source and write to ClickHouse table `test`. ```yaml diff --git a/docs/connector/sink/kafka.md b/docs/connector/sink/kafka.md index 3b9f3c4..716a179 100644 --- a/docs/connector/sink/kafka.md +++ b/docs/connector/sink/kafka.md @@ -1,37 +1,43 @@ # Kafka + > Kafka sink connector -## Description -Sink connector for Apache Kafka. Write data to Kafka topic. More details about producer configs can be found [here](https://docs.confluent.io/platform/current/installation/configuration/producer-configs.htmls). +> +> ## Description +> +> Sink connector for Apache Kafka. Write data to Kafka topic. More details about producer configs can be found [here](https://docs.confluent.io/platform/current/installation/configuration/producer-configs.htmls). ## Key Features - [x] [Rate Limiting](../../connector/connector.md) ## Sink Options + In order to use the Kafka connector, the following dependencies are required. They can be download by Nexus Maven Repository. 
-| Datasource | Supported Versions | Maven | +| Datasource | Supported Versions | Maven | |------------|--------------------|--------------------------------------------------------------------------------------------------------------------------------| | Kafka | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/connector-kafka/) | Kafka sink custom properties. if properties belongs to Kafka Producer Config, you can use `kafka.` prefix to set. -| Name | Type | Required | Default | Description | -|--------------------------------------|---------|----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| topic | String | Yes | (none) | Topic name is required. It used to write data to kafka. | -| kafka.bootstrap.servers | String | Yes | (none) | A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. This list should be in the form `host1:port1,host2:port2,...`. | -| log.failures.only | Boolean | No | true | Defines whether the producer should fail on errors, or only log them. If this is set to true, then exceptions will be only logged, if set to false, exceptions will be eventually thrown and cause the streaming program to fail (and enter recovery). | -| format | String | No | json | Data format. The default value is `json`. The Optional values are `json`, `protobuf`. | -| [format].config | / | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | -| rate.limiting.strategy | String | No | (none) | The rate limiting strategy to use. The Optional values are `none`, `sliding_window`. | -| rate.limiting.window.size | Integer | No | 5 | The window size of the rate limiting. For example, limit rate less than 10Mbps in 5 seconds time interval. | -| rate.limiting.limit.rate | String | No | 10Mbps | A maximum rate of traffic that can be transmitted over a network or between networks. The units of the bytes rate are Mbps, Kbps,and bps. For example, 10Mbps, 100Kbps, 1000bps. | -| rate.limiting.block.duration | String | No | 5min | If the rate limit is exceeded, the data will be blocked for the specified duration. The units of the duration are seconds, minutes, and hours. For example, 10s, 1m, 1h. | -| rate.limiting.block.reset.duration | String | No | 30s | The time interval for resetting the rate limit. The units of the duration are seconds, minutes, and hours. For example, 10s, 1m, 1h. | -| kafka.config | / | No | (none) | Kafka producer properties. Please refer to [Kafka Producer Config](https://kafka.apache.org/documentation/#producerconfigs) for details. | +| Name | Type | Required | Default | Description | +|------------------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | String | Yes | (none) | Topic name is required. It used to write data to kafka. | +| kafka.bootstrap.servers | String | Yes | (none) | A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. This list should be in the form `host1:port1,host2:port2,...`. 
| +| log.failures.only | Boolean | No | true | Defines whether the producer should fail on errors, or only log them. If this is set to true, then exceptions will be only logged, if set to false, exceptions will be eventually thrown and cause the streaming program to fail (and enter recovery). | +| format | String | No | json | Data format. The default value is `json`. The Optional values are `json`, `protobuf`. | +| [format].config | / | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | +| rate.limiting.strategy | String | No | (none) | The rate limiting strategy to use. The Optional values are `none`, `sliding_window`. | +| rate.limiting.window.size | Integer | No | 5 | The window size of the rate limiting. For example, limit rate less than 10Mbps in 5 seconds time interval. | +| rate.limiting.limit.rate | String | No | 10Mbps | A maximum rate of traffic that can be transmitted over a network or between networks. The units of the bytes rate are Mbps, Kbps,and bps. For example, 10Mbps, 100Kbps, 1000bps. | +| rate.limiting.block.duration | String | No | 5min | If the rate limit is exceeded, the data will be blocked for the specified duration. The units of the duration are seconds, minutes, and hours. For example, 10s, 1m, 1h. | +| rate.limiting.block.reset.duration | String | No | 30s | The time interval for resetting the rate limit. The units of the duration are seconds, minutes, and hours. For example, 10s, 1m, 1h. | +| kafka.config | / | No | (none) | Kafka producer properties. Please refer to [Kafka Producer Config](https://kafka.apache.org/documentation/#producerconfigs) for details. | ## Example + This example read data of inline test source and write to kafka topic `SESSION-RECORD-TEST`. + ```yaml sources: # [object] Define connector source inline_source: @@ -93,3 +99,4 @@ application: # [object] Define job configuration - name: connector_kafka downstream: [] ``` + diff --git a/docs/connector/sink/print.md b/docs/connector/sink/print.md index b76379d..15eef72 100644 --- a/docs/connector/sink/print.md +++ b/docs/connector/sink/print.md @@ -1,19 +1,25 @@ # Print + > Print sink connector -## Description -Print sink connector is used to print data to console. It is useful for testing. +> +> ## Description +> +> Print sink connector is used to print data to console. It is useful for testing. ## Sink Options + Print sink custom properties. -| Name | Type | Required | Default | Description | -|-----------------|----------|----------|---------|-----------------------------------------------------------------------------------| -| format | String | Yes | (none) | Data format. The Optional values are `json`, `protobuf`. | -| [format].config | Map | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | -| mode | String | No | stdout | Print mode. The Optional values are `stdout`, `log_info`, `log_warn`, `null`. | +| Name | Type | Required | Default | Description | +|-----------------|--------|----------|---------|-----------------------------------------------------------------------------------| +| format | String | Yes | (none) | Data format. The Optional values are `json`, `protobuf`. | +| [format].config | Map | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | +| mode | String | No | stdout | Print mode. The Optional values are `stdout`, `log_info`, `log_warn`, `null`. | ## Example + This example read data of inline test source and print to console. 
+ ```yaml sources: inline_source: @@ -43,4 +49,5 @@ application: downstream: [print_sink] - name: print_sink downstream: [] -```
\ No newline at end of file +``` + diff --git a/docs/connector/source/file.md b/docs/connector/source/file.md index 6858fd6..bdbf74e 100644 --- a/docs/connector/source/file.md +++ b/docs/connector/source/file.md @@ -10,7 +10,7 @@ File source connector is used to generate data from a text file(local file or hd File source custom properties. -| Name | Type | Required | Default | Description | +| Name | Type | Required | Default | Description | |---------------------------|---------|----------|---------|---------------------------------------------------------------------------------------------------| | path | String | Yes | (none) | File path, support local path or hdfs path. Example: ./logs/logs.json, hdfs://ns1/test/logs.json. | | format | String | Yes | (none) | Data format. The Optional values are `json`, `csv`. | @@ -23,6 +23,7 @@ File source custom properties. ## Example This example read data of file test source and print to console. + > Ensures that the file path is accessible to all nodes in your Flink cluster. ```yaml @@ -54,14 +55,3 @@ application: downstream: [ ] ``` -put file to hdfs: -```sh -# maka dir -hadoop fs -mkdir hdfs://ns1/test - -# put local file to hdfs -hadoop fs -put logs.json hdfs://ns1/test - -# list hdfs dir -hadoop fs -ls logs.json hdfs://ns1/test -``` diff --git a/docs/connector/source/inline.md b/docs/connector/source/inline.md index b510c20..0603f88 100644 --- a/docs/connector/source/inline.md +++ b/docs/connector/source/inline.md @@ -1,18 +1,23 @@ # Inline + > Inline source connector -## Description -Inline source connector is used to generate data from inline configuration. It is useful for testing. -## Source Options -Inline source custom properties. The data will be parsed to Map<String, Object> by the specified format. - -| Name | Type | Required | Default | Description | -|------------------|-----------|----------|---------|--------------------------------------------------------------------------------------------------------------------------| -| data | String | Yes | (none) | Testing data, support json and protobuf format. If data is json format , it can be parsed multiple events by json array. | -| format | String | Yes | (none) | Data format. The Optional values are `json`, `protobuf`. | -| [format].config | Map | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | -| interval.per.row | Duration | No | 1s | Interval time in seconds between each row. If it is 0, it will be generated as fast as possible. | -| repeat.count | Integer | No | -1 | Repeat count. If it is -1, it will be generated infinitely. | -| type | String | No | string | Data codec type. The Optional values are `string(UTF-8)`, `hex`, `base64`. | +> +> ## Description +> +> Inline source connector is used to generate data from inline configuration. It is useful for testing. +> +> ## Source Options +> +> Inline source custom properties. The data will be parsed to Map<String, Object> by the specified format. + +| Name | Type | Required | Default | Description | +|------------------|----------|----------|---------|--------------------------------------------------------------------------------------------------------------------------| +| data | String | Yes | (none) | Testing data, support json and protobuf format. If data is json format , it can be parsed multiple events by json array. | +| format | String | Yes | (none) | Data format. The Optional values are `json`, `protobuf`. 
| +| [format].config | Map | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | +| interval.per.row | Duration | No | 1s | Interval time in seconds between each row. If it is 0, it will be generated as fast as possible. | +| repeat.count | Integer | No | -1 | Repeat count. If it is -1, it will be generated infinitely. | +| type | String | No | string | Data codec type. The Optional values are `string(UTF-8)`, `hex`, `base64`. | ## Example @@ -48,4 +53,3 @@ application: downstream: [] ``` - diff --git a/docs/connector/source/ipfix.md b/docs/connector/source/ipfix.md index 3a4fbbf..8a766a8 100644 --- a/docs/connector/source/ipfix.md +++ b/docs/connector/source/ipfix.md @@ -1,17 +1,22 @@ # IPFIX + > IPFIX source collector -## Description -IPFIX source collector is used to collect IPFIX data from network devices. It supports UDP protocol. -## Source Options -In order to use the IPFIX connector, the following dependencies are required. They can be download by Nexus Maven Repository. +> +> ## Description +> +> IPFIX source collector is used to collect IPFIX data from network devices. It supports UDP protocol. +> +> ## Source Options +> +> In order to use the IPFIX connector, the following dependencies are required. They can be download by Nexus Maven Repository. -| Datasource | Supported Versions | Maven | +| Datasource | Supported Versions | Maven | |------------|--------------------|------------------------------------------------------------------------------------------------------------------------------------------| | IPFIX | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/connector-ipfix-collector/) | IPFIX source custom properties. -| Name | Type | Required | Default | Description | +| Name | Type | Required | Default | Description | |-----------------------------------------|---------|----------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| | port.range | String | Yes | (none) | Range of ports such as 3000 -3010 | | max.packet.size | Integer | No | 65535 | The maximum size of packet for UDP | @@ -28,4 +33,5 @@ IPFIX source custom properties. | service.discovery.consul.server.port | Integer | No | (none) | Consul server port. Need specify `ipfix.service.discovery.registry.mode` to `1(consul)` | | service.discovery.consul.token | String | No | (none) | Consul token. Need specify `ipfix.service.discovery.registry.mode` to `1(consul)` | -## Example
\ No newline at end of file +## Example + diff --git a/docs/connector/source/kafka.md b/docs/connector/source/kafka.md index 49871c4..07dff22 100644 --- a/docs/connector/source/kafka.md +++ b/docs/connector/source/kafka.md @@ -1,27 +1,33 @@ # Kafka + > Kafka source connector -## Description -Source connector for Apache Kafka. More details about consumer configs can be found [here](https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html). +> +> ## Description +> +> Source connector for Apache Kafka. More details about consumer configs can be found [here](https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html). ## Source Options + In order to use the Kafka connector, the following dependencies are required. They can be download by Nexus Maven Repository. -| Datasource | Supported Versions | Maven | +| Datasource | Supported Versions | Maven | |------------|--------------------|--------------------------------------------------------------------------------------------------------------------------------| | Kafka | Universal | [Download](http://192.168.40.153:8099/service/local/repositories/platform-release/content/com/geedgenetworks/connector-kafka/) | Kafka source custom properties. if properties belongs to Kafka Consumer Config, you can use `kafka.` prefix to set. -| Name | Type | Required | Default | Description | +| Name | Type | Required | Default | Description | |-------------------------|--------|----------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | String | Yes | (none) | Topic name(s). It also supports topic list for source by separating topic by semicolon like 'topic-1;topic-2'. | | kafka.bootstrap.servers | String | Yes | (none) | A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. This list should be in the form `host1:port1,host2:port2,...`. | | format | String | No | json | Data format. The default value is `json`. The Optional values are `json`, `protobuf`. | -| [format].config | Map | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | +| [format].config | Map | No | (none) | Data format properties. Please refer to [Format Options](../formats) for details. | | kafka.config | Map | No | (none) | Kafka consumer properties. Please refer to [Kafka Consumer Config](https://kafka.apache.org/documentation/#consumerconfigs) for details. | ## Example + This example read data of kafka topic `SESSION-RECORD` and print to console. + ```yaml sources: # [object] Define connector source kafka_source: # [object] Kafka source connector name @@ -60,3 +66,4 @@ application: # [object] Define job configuration - name: print_sink downstream: [] ``` + diff --git a/docs/connector/source/mock.md b/docs/connector/source/mock.md index dfd10d9..76c2808 100644 --- a/docs/connector/source/mock.md +++ b/docs/connector/source/mock.md @@ -4,18 +4,18 @@ ## Description -Mock source connector used to randomly generate the number of rows according to the user-defined schema. This connector helps you test the functionality of your system without relying on real data. +Mock source connector used to randomly generate the number of rows according to the user-defined schema. This connector helps you test the functionality of your system without relying on real data. ## Source Options Mock source custom properties. 
-| Name | Type | Required | Default | Description |
-|---------------------|---------|----------|---------|-----------------------------------------------------------------------------------------------------|
-| mock.desc.file.path | String | Yes | (none) | The path of the mock data structure file. |
-| rows.per.second | Integer | No | 1000 | The number of rows per second that connector generated. |
-| number.of.rows | Long | No | -1 | The total number of rows data generated. By default, the source is unbounded. |
-| millis.per.row | Long | No | 0 | The interval(mills) between each row. If greater than 0, then `rows.per.second` will be ignored. |
+| Name | Type | Required | Default | Description |
+|---------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------|
+| mock.desc.file.path | String | Yes | (none) | The path of the mock data structure file. |
+| rows.per.second | Integer | No | 1000 | The number of rows per second that connector generated. |
+| number.of.rows | Long | No | -1 | The total number of rows data generated. By default, the source is unbounded. |
+| millis.per.row | Long | No | 0 | The interval(mills) between each row. If greater than 0, then `rows.per.second` will be ignored. |

## Example

@@ -48,4 +48,3 @@ application:
 downstream: [ ]
 ```

-
diff --git a/docs/develop-guide.md b/docs/develop-guide.md
index 26d3af0..2742cee 100644
--- a/docs/develop-guide.md
+++ b/docs/develop-guide.md
@@ -29,26 +29,44 @@
 > - Description: The most important part of a commit message is that it should be clear and meaningful.
 > - JIRA Issue ID: Integrating JIRA is used for issue tracking.

-## How to throw an exception in Groot Stream Platform
-When throwing an exception with a hint message and ensure that the exception has a smaller scope. You can create a custom exception class that include error code and message parameter in its constructor. For example, if you encounters a checked exception `ClassNotFoundException` while dynamic class loading,a reasonable approach would be to the following:
-
+## How to write high-quality code
+
+1. When throwing an exception, include a hint message and keep the exception's scope as narrow as possible. You can create a custom exception class that includes an error code and a message parameter in its constructor. For example, if you encounter the checked exception `ClassNotFoundException` during dynamic class loading, a reasonable approach is the following:
+
```
- try {
- // Class.forname
- } catch (ClassNotFoundException e) {
- throw GrootStreamBootstrapException("Missing class or incorrect classpath", e);
- }
+try {
+    // Class.forName(...)
+} catch (ClassNotFoundException e) {
+    throw new GrootStreamBootstrapException("Missing class or incorrect classpath", e);
+}
+```
+
+2. Before you submit a merge request, ensure that the code compiles without errors and is formatted correctly. You can use the following commands to package and check the code:
+
+```shell
+# multi-threaded compile
+./mvnw -T 1C clean verify
+# single-threaded compile
+./mvnw clean verify
+# run the integration tests of all modules; requires a Docker environment and a single-threaded build
+./mvnw clean verify -DskipIT=false
+# check code style
+./mvnw clean compile -B -Dskip.spotless=false -e
```
+3. Before submitting a merge request, run the full unit tests and integration tests locally to ensure that the code is correct.
You can use the examples in the `groot-examples` module to verify that the code behaves correctly.
+
## Design Principles

1. Package structure: `com.geedgenetworks.[module].[sub-module]`. `groot-stream` is the parent module, and other modules are dependent on it.
2. Module naming: `groot-[module]`. e.g. `groot-common`, `groot-core`, `groot-connectors`, `groot-bootstrap`, `groot-examples`, etc.
-3. For unchecked exception (RuntimeException) within the 'groot-common' module, a global exception handling class named 'GrootRuntimeException' is defined. 
+3. For unchecked exceptions (RuntimeException) within the 'groot-common' module, a global exception handling class named 'GrootRuntimeException' is defined.

## Run a job example
+
All examples are in the `end-to-end-examples` module. You can run an example by running or debugging the job in IDEA. For example, using `end-to-end-examples/src/main/java/com/geedgenetworks/example/GrootStreamExample.java`: when the `Inline` source produces some sample data, you will see the result in the console, as follows:
+
```json
{"log_id":155652727148914688,"decoded_as":"BASE","recv_time":111,"fqdn_string":"baidu.com","server_ip":"120.233.20.242","additional_field_subdomain":"baidu.com","client_ip":"192.168.0.1"}
```
diff --git a/docs/env-config.md b/docs/env-config.md
index e29acb0..7a31494 100644
--- a/docs/env-config.md
+++ b/docs/env-config.md
@@ -1,12 +1,16 @@
 # The Job Environment Configuration
+
 The env configuration includes basic parameters and engine parameters.
+
 ## Basic Parameter

 ### name
+
 This parameter is used to define the name of the job. In addition, the job name can also be specified in the Flink cluster by using the `flink run` command. If it is not specified, the default name is `groot-stream-job`. Among these ways to specify the job name, the priority is `flink run` > `name` in the configuration file > the default name.

 ### parallelism
+
 An execution environment defines a default parallelism for all processors, filters, data sources, and data sinks it executes. In addition, the parallelism of a job can be specified on different levels, and the priority is `Operator Level` > `Execution Environment Level` > `Client Level` > `System Level`.

 Note: The parallelism of a job can be overridden by explicitly configuring the parallelism of a processor, filter, data source, or data sink in the configuration file.
@@ -16,35 +20,44 @@ Note: The parallelism of a job can be overridden by explicitly configuring the p
 - System Level: The parallelism of a job can be specified by using the `flink-conf.yaml` file.

 ### execution.buffer-timeout
+
 The maximum time frequency (milliseconds) for flushing the output buffers. If it is not specified, the default value is `100`. You can set Flink's parameter `flink.execution.buffer-timeout` directly to override the value in the configuration file.
 - A positive value triggers flushing periodically by that interval
 - 0 triggers flushing after every record, thus minimizing latency
 - -1 ms triggers flushing only when the output buffer is full, thus maximizing throughput
+
### execution.runtime-mode
+
This parameter is used to define the runtime mode of the job; the default value is `STREAMING`. If you want to run the job in batch mode, you can set `execution.runtime-mode = "BATCH"`.

### shade.identifier
+
Specify the method of encryption. If you do not need to encrypt or decrypt sensitive information in the configuration file, this option can be ignored.
For more details, you can refer to the documentation [config-encryption-decryption](connector/config-encryption-decryption.md) ### pipeline.object-reuse + This parameter is used to enable/disable object reuse for the execution of the job. If it is not specified, the default value is `false`. ### jars + Third-party jars can be loaded via `jars`, by using `jars="file://local/jar1.jar;file://local/jar2.jar"`. ### pipeline.jars + Specify a list of jar URLs via `pipeline.jars`, The jars are separated by `;` and will be uploaded to the flink cluster. ### pipeline.classpaths + Specify a list of classpath URLs via `pipeline.classpaths`, The classpaths are separated by `;` and will be added to the classpath of the flink cluster. ## Engine Parameter + You can directly use the flink parameter by prefixing `flink.`, such as `flink.execution.buffer-timeout`, `flink.object-reuse`, etc. More details can be found in the official [flink documentation](https://flink.apache.org/). Of course, you can use groot stream parameter, here are some parameter names corresponding to the names in Flink. -| Groot Stream | Flink | +| Groot Stream | Flink | |----------------------------------------|---------------------------------------------------------------| | execution.buffer-timeout | flink.execution.buffer-timeout | | pipeline.object-reuse | flink.object-reuse | @@ -57,5 +70,3 @@ Of course, you can use groot stream parameter, here are some parameter names cor | execution.restart.delayInterval | flink.restart-strategy.failure-rate.delay | | ... | ... | - - diff --git a/docs/faq.md b/docs/faq.md index 2af4e2b..761f278 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -1,9 +1,13 @@ ## Maven 3.8.1 blocked mirror for internal repositories + If you are using a Maven Wrapper (mvnw) build project, you may encounter the following error: + ``` maven-default-http-blocker (http://0.0.0.0/): Blocked mirror for repositories: xxx ``` + This is because Maven 3.8.1 has a new feature that blocks all HTTP requests by default. You can disable this feature by adding the following configuration to the Maven settings.xml file: + ```xml <mirrors> <mirror> @@ -16,3 +20,27 @@ This is because Maven 3.8.1 has a new feature that blocks all HTTP requests by d </mirrors> ``` +## Hadoop HDFS commands for beginners + +```sh +# Create a directory in HDFS +hadoop fs -mkdir hdfs://ns1/test_dir +# Put a file into HDFS +hadoop fs -put localfile.txt hdfs://ns1/test_dir +# Copy a file from HDFS to local +hadoop fs -get hdfs://ns1/test_dir/localfile.txt localfile_copy.txt +# Move a file in HDFS +hadoop fs -mv hdfs://ns1/test_dir/localfile.txt hdfs://ns1/test_dir/localfile2.txt +# List files and directories +hadoop fs -ls localfile.txt hdfs://ns1/test_dir +# List files and directories recursively +hadoop fs -ls -R hdfs://ns1/test_dir +# View the content of a file +hadoop fs -cat hdfs://ns1/test_dir/localfile.txt +# Remove a file from HDFS +hadoop fs -rm hdfs://ns1/test_dir/localfile.txt +# Remove a directory from HDFS +hadoop fs -rm -r hdfs://ns1/test_dir + +``` + diff --git a/docs/filter/aviator.md b/docs/filter/aviator.md index 5fe7ed2..54fe24b 100644 --- a/docs/filter/aviator.md +++ b/docs/filter/aviator.md @@ -1,18 +1,24 @@ # Aviator -> Filter data by AviatorFilter. The event whether drop or pass downstream is based on the Aviator expression. -## Description -AviatorScript is a lightweight, high performance scripting language hosted on the JVM. It uses filter operator that uses the Aviator expression to filter data. 
-More details about AviatorScript can be found at [AviatorScript](https://www.yuque.com/boyan-avfmj/aviatorscript). -## Options -| Name | Type | Required | Default | Description | -|-------------|--------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| type | String | Yes | (none) | The type of the filter operator. Now only support `com.geedgenetworks.core.filter.AviatorFilter`. | -| properties | Map | Yes | (none) | Filter operator properties. | -| -expression | String | Yes | (none) | Based on the filter expression, the event will be passed to downstream if the expression is true, otherwise it will be dropped. Build a filter expression need add prefix `event.`, if you want get event field. | +> Filter data by AviatorFilter. The event whether drop or pass downstream is based on the Aviator expression. +> +> ## Description +> +> AviatorScript is a lightweight, high performance scripting language hosted on the JVM. It uses filter operator that uses the Aviator expression to filter data. +> More details about AviatorScript can be found at [AviatorScript](https://www.yuque.com/boyan-avfmj/aviatorscript). +> + ## Options + +| Name | Type | Required | Default | Description | +|-------------|--------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| type | String | Yes | (none) | The type of the filter operator. Now only support `com.geedgenetworks.core.filter.AviatorFilter`. | +| properties | Map | Yes | (none) | Filter operator properties. | +| -expression | String | Yes | (none) | Based on the filter expression, the event will be passed to downstream if the expression is true, otherwise it will be dropped. Build a filter expression need add prefix `event.`, if you want get event field. | ## Example + This example read data from inline source and print to console. It will filter the event that `server_ip` is `8.8.8.8` output to console. + ```yaml sources: inline_source: @@ -50,6 +56,3 @@ This example read data from inline source and print to console. It will filter t downstream: [] ``` - - - diff --git a/docs/grootstream-config.md b/docs/grootstream-config.md index 8ccb055..fb902ae 100644 --- a/docs/grootstream-config.md +++ b/docs/grootstream-config.md @@ -1,4 +1,5 @@ # Groot Stream Config + The purpose of this file is to provide a global configuration for the groot-stream server, such as the default configuration of the job. ## Config file structure @@ -18,15 +19,20 @@ grootstream: scheduler.knowledge_base.update.interval.minutes: ${knowledge_base_update_interval_minutes} # Define the interval of the knowledge base file update. ``` + ### Knowledge Base -The knowledge base is a collection of libraries that can be used in the groot-stream job's UDFs. File system type can be specified `local` or `http` mode. If the value is `http`, must be `KB Repository` URL. The library will be dynamically updated according to the `scheduler.knowledge_base.update.interval.minutes` configuration. 
-| Name | Type | Required | Default | Description | -|:---------|:--------|:---------|:--------|:---------------------------------------------------------------------------| -| name | String | Yes | (none) | The name of the knowledge base, used to [UDF](processor/udf.md) | -| fs_type | String | Yes | (none) | The type of the file system. Enum: local and http. | -| fs_path | String | Yes | (none) | The path of the file system. It can be file directory or http restful api. | -| files | Array | No | (none) | The file list of the knowledge base object. | +The knowledge base is a collection of libraries that can be used in the groot-stream job's UDFs. File system type can be specified `local`, `http` or `hdfs`. +If the value is `http`, must be ` QGW Knowledge Base Repository` URL. The library will be dynamically updated according to the `scheduler.knowledge_base.update.interval.minutes` configuration. +If the value is `local`, the library will be loaded from the local file system. Need to manually upgrade all nodes in the Flink cluster when the library is updated. +If the value is `hdfs`, the library will be loaded from the HDFS file system. More details about hdfs operation can be found in the [HDFS](./faq.md#hadoop-hdfs-commands-for-beginners). + +| Name | Type | Required | Default | Description | +|:--------|:-------|:---------|:--------|:---------------------------------------------------------------------------| +| name | String | Yes | (none) | The name of the knowledge base, used to [UDF](processor/udf.md) | +| fs_type | String | Yes | (none) | The type of the file system. Enum: local, http, hdfs. | +| fs_path | String | Yes | (none) | The path of the file system. It can be file directory or http restful api. | +| files | Array | No | (none) | The file list of the knowledge base object. | ### Define the knowledge base file from a local file @@ -41,9 +47,10 @@ grootstream: files: - asn_builtin.mmdb - asn_user_defined.mmdb -``` +``` ### Define the knowledge base file from a http restful api + Knowledge base(KB) file can be updated dynamically by the Galaxy QGW KB module. Groot Stream Scheduler will periodically fetch the KB file metadata and determine whether UDF needs to be updated. ```yaml @@ -54,4 +61,19 @@ grootstream: fs_path: http://127.0.0.1:9999/v1/knowledge_base flies: - f9f6bc91-2142-4673-8249-e097c00fe1ea -```
\ No newline at end of file +``` + +### Define the knowledge base file from a HDFS file system + +> Ensure that the HDFS file system is accessible to all nodes in your Flink cluster. + +```yaml +grootstream: + knowledge_base: + - name: tsg_ip_asn + fs_type: hdfs + fs_path: hdfs://ns1/knowledge_base/ + files: + - asn_builtin.mmdb + - asn_user_defined.mmdb +``` diff --git a/docs/processor/projection-processor.md b/docs/processor/projection-processor.md index 65c7545..bc4b249 100644 --- a/docs/processor/projection-processor.md +++ b/docs/processor/projection-processor.md @@ -1,20 +1,26 @@ # Projection Processor + > Processing pipelines for projection processor + ## Description + Projection processor is used to project the data from source to sink. It can be used to filter, remove, and transform fields. It is a part of the processing pipeline. It can be used in the pre-processing, processing, and post-processing pipeline. Each processor can assemble UDFs(User-defined functions) into a pipeline. Within the pipeline, events are processed by each Function in order, top‑>down. The UDF usage detail can be found in [UDF](udf.md). + ## Options -| name | type | required | default value | -|----------------|---------|----------|----------------------------------------------------------------------------------------------------------------| -| type | String | Yes | The type of the processor, now only support `com.geedgenetworks.core.processor.projection.ProjectionProcessor` | -| output_fields | Array | No | Array of String. The list of fields that need to be kept. Fields not in the list will be removed. | -| remove_fields | Array | No | Array of String. The list of fields that need to be removed. | -| functions | Array | No | Array of Object. The list of functions that need to be applied to the data. | +| name | type | required | default value | +|---------------|--------|----------|----------------------------------------------------------------------------------------------------------------| +| type | String | Yes | The type of the processor, now only support `com.geedgenetworks.core.processor.projection.ProjectionProcessor` | +| output_fields | Array | No | Array of String. The list of fields that need to be kept. Fields not in the list will be removed. | +| remove_fields | Array | No | Array of String. The list of fields that need to be removed. | +| functions | Array | No | Array of Object. The list of functions that need to be applied to the data. | ## Usage Example + This example use projection processor to remove the fields `http_request_line`, `http_response_line`, `http_response_content_type` and using DROP function filter all event that `server_ip` is `4.4.4.4`. + ```yaml sources: inline_source: @@ -64,4 +70,3 @@ application: downstream: [] ``` - diff --git a/docs/processor/udf.md b/docs/processor/udf.md index 6bd87b0..2a705fd 100644 --- a/docs/processor/udf.md +++ b/docs/processor/udf.md @@ -1,6 +1,8 @@ # UDF + > The functions for projection processors. -## Function of content +> + ## Function of content - [Asn Lookup](#asn-lookup) - [Base64 Decode](#base64-decode) @@ -21,26 +23,30 @@ - [Unix Timestamp Converter](#unix-timestamp-converter) ## Description + UDF(User Defined Function) is used to extend the functions of projection processor. The UDF is a part of the processing pipeline. It can be used in the pre-processing pipeline, processing pipeline, and post-processing pipeline. 
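To make the structure described in the next section ("UDF Definition") more concrete, here is a minimal, hypothetical sketch of such a function in Java. The class name, method signatures, and the way the context is passed in are assumptions for illustration only; the actual groot-stream UDF interface may differ. The sketch only mirrors the parts listed below: a registered name, a context carrying `lookup_fields`, `output_fields`, and `parameters`, and the `open`/`evaluate`/`close` lifecycle.

```java
import java.util.List;
import java.util.Map;

// Hypothetical skeleton of a projection-processor UDF; the real groot-stream
// interface names and signatures may differ from this sketch.
public class UpperCaseFunction {

    // Function name used for registration: uppercase words separated by underscores.
    public static final String NAME = "UPPER_CASE";

    private List<String> lookupFields;       // fields read from the event
    private List<String> outputFields;       // fields written back to the event
    private Map<String, Object> parameters;  // custom parameters from the job config

    // open: initialize resources (read the context, load knowledge base files, ...).
    @SuppressWarnings("unchecked")
    public void open(Map<String, Object> context) {
        this.lookupFields = (List<String>) context.get("lookup_fields");
        this.outputFields = (List<String>) context.get("output_fields");
        this.parameters = (Map<String, Object>) context.get("parameters");
    }

    // evaluate: process one event (Map<String, Object>) and return the updated event.
    // Existing output fields are overwritten, as described in the UDF Definition section.
    public Map<String, Object> evaluate(Map<String, Object> event) {
        for (int i = 0; i < lookupFields.size() && i < outputFields.size(); i++) {
            Object value = event.get(lookupFields.get(i));
            if (value instanceof String) {
                event.put(outputFields.get(i), ((String) value).toUpperCase());
            }
        }
        return event;
    }

    // close: release any resources acquired in open().
    public void close() {
    }
}
```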
+ ## UDF Definition + A UDF includes the following parts: name, event(processing data), context, evaluate function, open function, and close function. - name: Function name, with uppercase words separated by underscores, used for function registration. -- event: The data to be processed. It is organized in a Map<String, Object> structure. +- event: The data to be processed. It is organized in a Map<String, Object> structure. - context: Function context, used to store the state of the function. Including the following parameters: - - `filter`: Filter expression, string type. It is used to filter events that need to processed by the function. The expression is written in Aviator expression language. For example, `event.server_ip == '. - - `lookup_fields`: The fields that need to be used as lookup keys. It is an array of string type. For example, `['server_ip', 'client_ip']`. - - `output_fields`: The fields are used to append the result to the event. It is an array of string type. For example, `['server_ip', 'client_ip']`. If the field already exists in the event, the value will be overwritten. - - `parameters`: Custom parameters. It is a Map<String, Object> type. -- evaluate function: The function to process the event. It is a function that returns a Map<String, Object> type. +- `filter`: Filter expression, string type. It is used to filter events that need to processed by the function. The expression is written in Aviator expression language. For example, `event.server_ip == '. +- `lookup_fields`: The fields that need to be used as lookup keys. It is an array of string type. For example, `['server_ip', 'client_ip']`. +- `output_fields`: The fields are used to append the result to the event. It is an array of string type. For example, `['server_ip', 'client_ip']`. If the field already exists in the event, the value will be overwritten. +- `parameters`: Custom parameters. It is a Map<String, Object> type. +- evaluate function: The function to process the event. It is a function that returns a Map<String, Object> type. - open function: Initialize the resources used by the function. - close function: Release the resources used by the function. - + ### Functions Function define common parameters: `filter`, `lookup_fields`, `output_fields`, `parameters`, and will return a Map<String, Object> value of the event. -``` FUNCTION_NAME(filter, lookup_fields, output_fields[, parameters])``` +``` FUNCTION_NAME(filter, lookup_fields, output_fields[, parameters])``` ### Asn Lookup + Asn lookup function is used to lookup the asn information by ip address. You need to host the `.mmdb` database file from Knowledge Base Repository. ```ASN_LOOKUP(filter, lookup_fields, output_fields[, parameters])``` @@ -48,20 +54,22 @@ Asn lookup function is used to lookup the asn information by ip address. You nee - lookup_fields: required - output_fields: required - parameters: required - - kb_name: required. The name of the knowledge base. - - option: required. Now only support `IP_TO_ASN`. +- kb_name: required. The name of the knowledge base. +- option: required. Now only support `IP_TO_ASN`. Example: + ```yaml - - function: ASN_LOOKUP - lookup_fields: [client_ip] - output_fields: [client_asn] - parameters: - kb_name: tsg_ip_asn - option: IP_TO_ASN +- function: ASN_LOOKUP + lookup_fields: [client_ip] + output_fields: [client_asn] + parameters: + kb_name: tsg_ip_asn + option: IP_TO_ASN ``` ### Base64 Decode + Base64 decode function is used to decode the base64 encoded string. 
```BASE64_DECODE_TO_STRING(filter, output_fields[, parameters])``` @@ -69,19 +77,21 @@ Base64 decode function is used to decode the base64 encoded string. - lookup_fields: not required - output_fields: required - parameters: required - - value_field: `<String>` required. - - charset_field:`<String>` optional. Default is `UTF-8`. +- value_field: `<String>` required. +- charset_field:`<String>` optional. Default is `UTF-8`. Example: + ```yaml - - function: BASE64_DECODE_TO_STRING - output_fields: [mail_attachment_name] - parameters: - value_field: mail_attachment_name - charset_field: mail_attachment_name_charset +- function: BASE64_DECODE_TO_STRING + output_fields: [mail_attachment_name] + parameters: + value_field: mail_attachment_name + charset_field: mail_attachment_name_charset ``` ### Base64 Encode + Base64 encode function is commonly used to encode the binary data to base64 string. Especially when that data need to be stored and transferred over media that are designed to deal with text. This encoding helps to ensure that the data remains intact without modification during transport. ```BASE64_ENCODE_TO_STRING(filter, output_fields[, parameters])``` @@ -89,17 +99,19 @@ Base64 encode function is commonly used to encode the binary data to base64 stri - lookup_fields: not required - output_fields: required - parameters: required - - value_field: `<String>` required. +- value_field: `<String>` required. Example: + ```yaml - - function: BASE64_ENCODE_TO_STRING - output_fields: [packet] - parameters: - value_field: packet +- function: BASE64_ENCODE_TO_STRING + output_fields: [packet] + parameters: + value_field: packet ``` ### Current Unix Timestamp + Current unix timestamp function is used to get the current unix timestamp. ```CURRENT_UNIX_TIMESTAMP(output_fields[, parameters])``` @@ -107,17 +119,19 @@ Current unix timestamp function is used to get the current unix timestamp. - lookup_fields: not required - output_fields: required - parameters: optional - - precision: `<String>` optional. Default is `seconds`. Enum: `milliseconds`, `seconds`. +- precision: `<String>` optional. Default is `seconds`. Enum: `milliseconds`, `seconds`. Example: + ```yaml - - function: CURRENT_UNIX_TIMESTAMP - output_fields: [recv_time] - parameters: - precision: seconds +- function: CURRENT_UNIX_TIMESTAMP + output_fields: [recv_time] + parameters: + precision: seconds ``` ### Domain + Domain function is used to extract the domain from the url. ```DOMAIN(filter, lookup_fields, output_fields[, parameters])``` @@ -125,23 +139,25 @@ Domain function is used to extract the domain from the url. - lookup_fields: required. Support more than one fields. All fields will be processed from left to right, and the result will be overwritten if the field processed value is not null. - output_fields: required - parameters: required - - option: `<String>` required. Enum: `TOP_LEVEL_DOMAIN`, `FIRST_SIGNIFICANT_SUBDOMAIN`. +- option: `<String>` required. Enum: `TOP_LEVEL_DOMAIN`, `FIRST_SIGNIFICANT_SUBDOMAIN`. + +#### Option -#### Option - `TOP_LEVEL_DOMAIN` is used to extract the top level domain from the url. For example, `www.abc.com` will be extracted to `com`. - `FIRST_SIGNIFICANT_SUBDOMAIN` is used to extract the first significant subdomain from the url. For example, `www.abc.com` will be extracted to `abc.com`. 
Example: ```yaml - - function: DOMAIN - lookup_fields: [http_host, ssl_sni, quic_sni] - output_fields: [server_domain] - parameters: - option: FIRST_SIGNIFICANT_SUBDOMAIN +- function: DOMAIN + lookup_fields: [http_host, ssl_sni, quic_sni] + output_fields: [server_domain] + parameters: + option: FIRST_SIGNIFICANT_SUBDOMAIN ``` ### Drop + Drop function is used to filter the event. If the filter expression is true, the event will be dropped. Otherwise, the event will be passed to downstream. ```DROP(filter)``` @@ -151,11 +167,14 @@ Drop function is used to filter the event. If the filter expression is true, the - parameters: not required Example: + ```yaml - - function: DROP - filter: event.server_ip == '4.4.4.4' +- function: DROP + filter: event.server_ip == '4.4.4.4' ``` + ### Eval + Eval function is used to adds or removes fields from events by evaluating an value expression. ```EVAL(filter, output_fields[, parameters])``` @@ -163,25 +182,30 @@ Eval function is used to adds or removes fields from events by evaluating an val - lookup_fields: not required - output_fields: required - parameters: required - - value_expression: `<String>` required. Enter a value expression to set the field’s value – this can be a constant. +- value_expression: `<String>` required. Enter a value expression to set the field’s value – this can be a constant. Example 1: Add a field `ingestion_time` with value `recv_time`: + ```yaml - - function: EVAL - output_fields: [ingestion_time] - parameters: - value_expression: recv_time +- function: EVAL + output_fields: [ingestion_time] + parameters: + value_expression: recv_time ``` + Example 2: If the value of `direction` is `69`, the value of `internal_ip` will be `client_ip`, otherwise the value of `internal_ip` will be `server_ip`. + ```yaml - - function: EVAL - output_fields: [internal_ip] - parameters: - value_expression: 'direction=69 ? client_ip : server_ip' +- function: EVAL + output_fields: [internal_ip] + parameters: + value_expression: 'direction=69 ? client_ip : server_ip' ``` + ### Flatten + Flatten the fields of nested structure to the top level. The new fields name are named using the field name prefixed with the names of the struct fields to reach it, separated by dots as default. ```FLATTEN(filter, lookup_fields, output_fields[, parameters])``` @@ -189,33 +213,36 @@ Flatten the fields of nested structure to the top level. The new fields name are - lookup_fields: optional - output_fields: not required - parameters: optional - - prefix: `<String>` optional. Prefix string for flattened field names. Default is empty. - - depth: `<Integer>` optional. Number representing the nested levels to consider for flattening. Minimum 1. Default is `5`. - - delimiter: `<String>` optional. The string used to join nested keys Default is `.`. - - json_string_keys: `<Array>` optional. The keys of the json string fields. It indicates keys that contain JSON strings and should be parsed and flattened. Default is empty. +- prefix: `<String>` optional. Prefix string for flattened field names. Default is empty. +- depth: `<Integer>` optional. Number representing the nested levels to consider for flattening. Minimum 1. Default is `5`. +- delimiter: `<String>` optional. The string used to join nested keys Default is `.`. +- json_string_keys: `<Array>` optional. The keys of the json string fields. It indicates keys that contain JSON strings and should be parsed and flattened. Default is empty. Example 1: Flatten the nested structure of fields and tags in Metrics. 
If lookup_fields is empty, flatten all nested structures. ```yaml - - function: FLATTEN - lookup_fields: [tags,fields] +- function: FLATTEN + lookup_fields: [tags,fields] ``` Example 2: Flatten the nested structure of the session record field `encapsulation` (JSON String format), add the prefix `tunnels`, specify the nesting depth as `3`, and use an dot "." as the delimiter. + ```yaml - - function: FLATTEN - lookup_fields: [encapsulation] - parameters: - prefix: tunnels - depth: 3 - delimiter: . - json_string_keys: [encapsulation] +- function: FLATTEN + lookup_fields: [encapsulation] + parameters: + prefix: tunnels + depth: 3 + delimiter: . + json_string_keys: [encapsulation] ``` + Output: + ```json { "tunnels.encapsulation.ipv4.client_ip:": "192.168.11.12", @@ -224,6 +251,7 @@ Output: ``` ### From Unix Timestamp + From unix timestamp function is used to convert the unix timestamp to date time string. The default time zone is UTC+0. ```FROM_UNIX_TIMESTAMP(filter, lookup_fields, output_fields[, parameters])``` @@ -231,22 +259,25 @@ From unix timestamp function is used to convert the unix timestamp to date time - lookup_fields: required - output_fields: required - parameters: optional - - precision: `<String>` optional. Default is `seconds`. Enum: `milliseconds`, `seconds`. +- precision: `<String>` optional. Default is `seconds`. Enum: `milliseconds`, `seconds`. + +#### Precision -#### Precision - `milliseconds` is used to convert the unix timestamp to milliseconds date time string. For example, `1619712000` will be converted to `2021-04-30 00:00:00.000`. - `seconds` is used to convert the unix timestamp to seconds date time string. For example, `1619712000` will be converted to `2021-04-30 00:00:00`. Example: + ```yaml - - function: FROM_UNIX_TIMESTAMP - lookup_fields: [recv_time] - output_fields: [recv_time_string] - parameters: - precision: seconds +- function: FROM_UNIX_TIMESTAMP + lookup_fields: [recv_time] + output_fields: [recv_time_string] + parameters: + precision: seconds ``` ### Generate String Array + Generate string array function is used to merge multiple fields into a string array. The merged field may be a string or a string array. ```GENERATE_STRING_ARRAY(filter, lookup_fields, output_fields)``` @@ -256,12 +287,15 @@ Generate string array function is used to merge multiple fields into a string ar - parameters: not required Example: + ```yaml - - function: GENERATE_STRING_ARRAY - lookup_fields: [http_host, ssl_sni, quic_sni] - output_fields: [server_domains] +- function: GENERATE_STRING_ARRAY + lookup_fields: [http_host, ssl_sni, quic_sni] + output_fields: [server_domains] ``` + ### GeoIP Lookup + GeoIP lookup function is used to lookup the geoip information by ip address. You need to host the `.mmdb` database file from Knowledge Base Repository. ```GEOIP_LOOKUP(filter, lookup_fields, output_fields[, parameters])``` @@ -269,29 +303,31 @@ GeoIP lookup function is used to lookup the geoip information by ip address. You - lookup_fields: required - output_fields: optional - parameters: required - - kb_name: `<String>` required. The name of the knowledge base. - - option: `<String>` required. Enum: `IP_TO_COUNTRY`, `IP_TO_PROVINCE`, `IP_TO_CITY`, `IP_TO_SUBDIVISION_ADDR`, `IP_TO_DETAIL`, `IP_TO_LATLNG`, `IP_TO_PROVIDER`, `IP_TO_JSON`, `IP_TO_OBJECT`. - - geolocation_field_mapping : `<Map<String, String>>` optional. The option is required when the option is `IP_TO_OBJECT`. The mapping of the geolocation fields. 
The key is the field name of the knowledge base , and the value is the field name of the event. - - COUNTRY: `<String>` optional. - - PROVINCE: `<String>` optional. - - CITY: `<String>` optional. - - LONGITUDE: `<String>` optional. - - LATITUDE: `<String>` optional. - - ISP: `<String>` optional. - - ORGANIZATION: `<String>` optional. - -#### Option - - `IP_TO_COUNTRY` is used to lookup the country or region information by ip address. - - `IP_TO_PROVINCE` is used to lookup the province or state information by ip address. - - `IP_TO_CITY` is used to lookup the city information by ip address. - - `IP_TO_SUBDIVISION_ADDR` is used to lookup the subdivision address information by ip address. - - `IP_TO_DETAIL` is used to lookup the above four levels of information by ip address. It separated by `.`. - - `IP_TO_LATLNG` is used to lookup the latitude and longitude information by ip address. It separated by `,`. - - `IP_TO_PROVIDER` is used to lookup the provider information by ip address. - - `IP_TO_JSON` is used to lookup the above information by ip address. The result is a json string. - - `IP_TO_OBJECT` is used to lookup the above information by ip address. The result is a `LocationResponse` object. - -#### GeoLocation Field Mapping +- kb_name: `<String>` required. The name of the knowledge base. +- option: `<String>` required. Enum: `IP_TO_COUNTRY`, `IP_TO_PROVINCE`, `IP_TO_CITY`, `IP_TO_SUBDIVISION_ADDR`, `IP_TO_DETAIL`, `IP_TO_LATLNG`, `IP_TO_PROVIDER`, `IP_TO_JSON`, `IP_TO_OBJECT`. +- geolocation_field_mapping : `<Map<String, String>>` optional. The option is required when the option is `IP_TO_OBJECT`. The mapping of the geolocation fields. The key is the field name of the knowledge base , and the value is the field name of the event. +- COUNTRY: `<String>` optional. +- PROVINCE: `<String>` optional. +- CITY: `<String>` optional. +- LONGITUDE: `<String>` optional. +- LATITUDE: `<String>` optional. +- ISP: `<String>` optional. +- ORGANIZATION: `<String>` optional. + +#### Option + +- `IP_TO_COUNTRY` is used to lookup the country or region information by ip address. +- `IP_TO_PROVINCE` is used to lookup the province or state information by ip address. +- `IP_TO_CITY` is used to lookup the city information by ip address. +- `IP_TO_SUBDIVISION_ADDR` is used to lookup the subdivision address information by ip address. +- `IP_TO_DETAIL` is used to lookup the above four levels of information by ip address. It separated by `.`. +- `IP_TO_LATLNG` is used to lookup the latitude and longitude information by ip address. It separated by `,`. +- `IP_TO_PROVIDER` is used to lookup the provider information by ip address. +- `IP_TO_JSON` is used to lookup the above information by ip address. The result is a json string. +- `IP_TO_OBJECT` is used to lookup the above information by ip address. The result is a `LocationResponse` object. + +#### GeoLocation Field Mapping + - `COUNTRY` is used to map the country information to the event field. - `PROVINCE` is used to map the province information to the event field. - `CITY` is used to map the city information to the event field. @@ -303,27 +339,29 @@ GeoIP lookup function is used to lookup the geoip information by ip address. 
You Example: ```yaml - - function: GEOIP_LOOKUP - lookup_fields: [ client_ip ] - output_fields: [ client_geolocation ] - parameters: - kb_name: tsg_ip_location - option: IP_TO_DETAIL +- function: GEOIP_LOOKUP + lookup_fields: [ client_ip ] + output_fields: [ client_geolocation ] + parameters: + kb_name: tsg_ip_location + option: IP_TO_DETAIL ``` + ```yaml - - function: GEOIP_LOOKUP - lookup_fields: [ server_ip ] - output_fields: [] - parameters: - kb_name: tsg_ip_location - option: IP_TO_OBJECT - geolocation_field_mapping: - COUNTRY: server_country - PROVINCE: server_super_administrative_area - CITY: server_administrative_area +- function: GEOIP_LOOKUP + lookup_fields: [ server_ip ] + output_fields: [] + parameters: + kb_name: tsg_ip_location + option: IP_TO_OBJECT + geolocation_field_mapping: + COUNTRY: server_country + PROVINCE: server_super_administrative_area + CITY: server_administrative_area ``` ### JSON Extract + JSON extract function is used to extract the value from json string. ```JSON_EXTRACT(filter, lookup_fields, output_fields[, parameters])``` @@ -331,16 +369,16 @@ JSON extract function is used to extract the value from json string. - lookup_fields: required - output_fields: required - parameters: required - - value_expression: `<String>` required. The json path expression. +- value_expression: `<String>` required. The json path expression. Example: ```yaml - - function: JSON_EXTRACT - lookup_fields: [ device_tag ] - output_fields: [ device_group ] - parameters: - value_expression: $.tags[?(@.tag=='device_group')][0].value +- function: JSON_EXTRACT + lookup_fields: [ device_tag ] + output_fields: [ device_group ] + parameters: + value_expression: $.tags[?(@.tag=='device_group')][0].value ``` ### Path Combine @@ -352,19 +390,20 @@ Path combine function is used to combine the file path. The path value can be co - lookup_fields: required - output_fields: required - parameters: required - - path: `<Array>` required. +- path: `<Array>` required. Example: - + ```yaml - - function: PATH_COMBINE - lookup_fields: [ packet_capture_file ] - output_fields: [ packet_capture_file ] - parameters: - path: [ props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file ] +- function: PATH_COMBINE + lookup_fields: [ packet_capture_file ] + output_fields: [ packet_capture_file ] + parameters: + path: [ props.hos.path, props.hos.bucket.name.traffic_file, packet_capture_file ] ``` ### Rename + Rename function is used to rename or reformat(e.g. by replacing character underscores with dots) the field name. ```RENAME(filter, lookup_fields, output_fields, parameters)``` @@ -372,26 +411,27 @@ Rename function is used to rename or reformat(e.g. by replacing character unders - lookup_fields: not required - output_fields: not required - parameters: required - - parent_fields: `<Array>` optional. Specify fields whose children will inherit the Rename fields and Rename expression operations. - - rename_fields: `Map<String, String>` required. The key is the original field name, and the value is the new field name. - - current_field_name: `<String>` required. The original field name. - - new_field_name: `<String>` required. The new field name. - - rename_expression: `<String>` optional. AviatorScript expression whose returned value will be used to rename fields. +- parent_fields: `<Array>` optional. Specify fields whose children will inherit the Rename fields and Rename expression operations. +- rename_fields: `Map<String, String>` required. 
The key is the original field name, and the value is the new field name. +- current_field_name: `<String>` required. The original field name. +- new_field_name: `<String>` required. The new field name. +- rename_expression: `<String>` optional. AviatorScript expression whose returned value will be used to rename fields. ``` A single Function can include both rename_fields (to rename specified field names) and rename_expression (to globally rename fields). However, the Rename fields strategy will execute first. ``` + Example 1: Remove the prefix "tags_" from the field names and rename the field "timestamp_ms" to "recv_time_ms". ```yaml - - function: RENAME - - parameters: - rename_fields: - - timestamp_ms: recv_time_ms - rename_expression: key=string.replace_all(key,'tags_',''); return key; - +- function: RENAME +- parameters: + rename_fields: + - timestamp_ms: recv_time_ms + rename_expression: key=string.replace_all(key,'tags_',''); return key; + ``` Example 2: @@ -399,15 +439,15 @@ Example 2: Rename the field `client_ip` to `source_ip`, including the fields under the `encapsulation.ipv4` tunnel. ```yaml - - function: RENAME - - parameters: - parent_fields: [encapsulation.ipv4] - rename_fields: - - client_ip: source_ip - +- function: RENAME +- parameters: + parent_fields: [encapsulation.ipv4] + rename_fields: + - client_ip: source_ip + ``` -Output: `source_ip:192.168.4.1, encapsulation.ipv4.source_ip:192.168.12.12` +Output: `source_ip:192.168.4.1, encapsulation.ipv4.source_ip:192.168.12.12` ### Snowflake ID @@ -422,14 +462,15 @@ Snowflake ID function is used to generate the snowflake id. The snowflake id is - lookup_fields: not required - output_fields: required - parameters: optional - - data_center_id_num: `<Integer>` optional. Default is `0`, range is `0-31`. +- data_center_id_num: `<Integer>` optional. Default is `0`, range is `0-31`. Example: + ```yaml - - function: SNOWFLAKE_ID - output_fields: [log_id] - parameters: - data_center_id_num: 1 +- function: SNOWFLAKE_ID + output_fields: [log_id] + parameters: + data_center_id_num: 1 ``` ### String Joiner @@ -441,41 +482,40 @@ String joiner function joins multiple string fields using a delimiter, prefix, a - lookup_fields: required. Support more than one fields. - output_fields: required - parameters: optional - - delimiter: `<String>` optional. Default is `,`. - - prefix: `<String>` optional. Default is empty string. - - suffix: `<String>` optional. Default is empty string. +- delimiter: `<String>` optional. Default is `,`. +- prefix: `<String>` optional. Default is empty string. +- suffix: `<String>` optional. Default is empty string. Example: + ```yaml - - function: STRING_JOINER - lookup_fields: [http_host, ssl_sni, quic_sni] - output_fields: [server_domains] - parameters: - delimiter: ',' - prefix: '[' - suffix: ']' +- function: STRING_JOINER + lookup_fields: [http_host, ssl_sni, quic_sni] + output_fields: [server_domains] + parameters: + delimiter: ',' + prefix: '[' + suffix: ']' ``` ### Unix Timestamp Converter -Unix timestamp converter function is used to convert the unix timestamp precision. +Unix timestamp converter function is used to convert the unix timestamp precision. ```UNIX_TIMESTAMP_CONVERTER(filter, lookup_fields, output_fields[, parameters])``` - filter: optional - lookup_fields: required - output_fields: required - parameters: required - - precision: `<String>` required. Enum: `milliseconds`, `seconds`, `minutes`. 
The minutes precision is used to generate Unix timestamp, round it to the minute level, and output it in seconds format. - - Example: +- precision: `<String>` required. Enum: `milliseconds`, `seconds`, `minutes`. The minutes precision is used to generate Unix timestamp, round it to the minute level, and output it in seconds format. +- Example: _`__timestamp` Internal field, from source ingestion time or current unix timestamp. + ```yaml - - function: UNIX_TIMESTAMP_CONVERTER - lookup_fields: [__timestamp] - output_fields: [recv_time] - parameters: - precision: seconds +- function: UNIX_TIMESTAMP_CONVERTER + lookup_fields: [__timestamp] + output_fields: [recv_time] + parameters: + precision: seconds ``` - - - diff --git a/docs/user-guide.md b/docs/user-guide.md index 8e8b00f..9d5b1c7 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -1,9 +1,12 @@ # Introduction - GrootStream is a real-time ETL platform based on the concepts of flow-based programing. Groot provides a template to quickly build a data flow job. It includes sources, filters, processing pipelines and sinks etc. +GrootStream is a real-time ETL platform based on the concepts of flow-based programing. Groot provides a template to quickly build a data flow job. It includes sources, filters, processing pipelines and sinks etc. The main format of the config template file is `yaml`, for more details of this format type you can refer to [YAML-GUIDE](https://yaml.org/spec/1.2/spec.html). + # Job Config + ## Config file structure + ```yaml sources: inline_source: @@ -88,7 +91,9 @@ application: ``` ## Schema Structure + Some sources are not strongly limited schema, so you need use `fields` to define the field name and type. The source can customize the schema. Like `Kafka` `Inline` source etc. + ```yaml Schema: fields: @@ -105,6 +110,7 @@ Schema: - name: decoded_as type: string ``` + `name` The name of the field. `type` The data type of the field. | Data type | Value type in Java | Description | @@ -119,29 +125,35 @@ Schema: | struct | `java.util.Map<String, Object>` | A Map is an object that maps keys to values. The value type includes all types. example: struct<id:int, client_ip:string, data:struct<id:int, name:string>>. | | array | `List<Object>` | A array is a data type that represents a collection of elements. The element type includes all types. example: array<int>, array<struct<id:int, client_ip:string>>. | - ## Sources + Source is used to define where GrootStream needs to ingest data. Multiple sources can be defined in a job. The supported sources are listed in [Source Connectors](connector/source). Each source has its own specific parameters to define how to fetch data, and GrootStream also extracts the properties that each source will use, such as the `topic` and `kafka.bootstrap.servers` of the `Kafka` source. ## Filters + Filter operator is used to define the conditions for filtering data. Multiple filters can be defined in a job. The supported filters are listed in [Filters](filter). Each filter has its own specific parameters to define how to filter data, and GrootStream also extracts the properties that each filter will use, such as the `expression` of the `Aviator` filter. Based on the filter expression, the event will be passed to downstream if the expression is true, otherwise it will be dropped. ## Processing Pipelines + Processing pipelines are used to define the event processing logic of the job. It can be categorized by functionality into stateless and stateful processors. 
Based on processing order, it can be categorized into pre-processing pipeline, processing pipeline and post-processing pipeline. Each processor can assemble `UDFs` (user-defined functions) into a pipeline. The details of each processor are listed in [Processor](processor). ## Sinks + Sink is used to define where GrootStream needs to output data. Multiple sinks can be defined in a job. The supported sinks are listed in [Sink Connectors](connector/sink). Each sink has its own specific parameters to define how to output data, and GrootStream also extracts the properties that each sink will use, such as the `topic` and `kafka.bootstrap.servers` of the `Kafka` sink. ## Application + Used to define common parameters of the job and the topology of the job, such as the name of the job, the parallelism of the job, etc. The following configuration parameters are supported. ### ENV -Used to define job environment configuration information. For more details, you can refer to the documentation [JobEnvConfig](./env-config.md). +Used to define job environment configuration information. For more details, you can refer to the documentation [JobEnvConfig](./env-config.md). # Command + ## Run a job by CLI + ```bash Usage: start.sh [options] Options: @@ -162,3 +174,5 @@ Options: +``` + diff --git a/groot-common/pom.xml b/groot-common/pom.xml index 8e1f1c8..10e9ed4 100644 --- a/groot-common/pom.xml +++ b/groot-common/pom.xml @@ -9,7 +9,7 @@ </parent> <artifactId>groot-common</artifactId> - <name>Groot : Common </name> + <name>Groot : Common</name> <properties> @@ -67,8 +67,6 @@ <artifactId>guava-retrying</artifactId> </dependency> - - <dependency> <groupId>com.hazelcast</groupId> <artifactId>hazelcast</artifactId> @@ -78,7 +76,6 @@ <artifactId>disruptor</artifactId> </dependency> - <!-- flink --> <dependency> <groupId>org.apache.flink</groupId> diff --git a/groot-common/src/test/java/com/geedgenetworks/common/config/YamlGrootStreamConfigParserTest.java b/groot-common/src/test/java/com/geedgenetworks/common/config/YamlGrootStreamConfigParserTest.java index 80c3690..967ad63 100644 --- a/groot-common/src/test/java/com/geedgenetworks/common/config/YamlGrootStreamConfigParserTest.java +++ b/groot-common/src/test/java/com/geedgenetworks/common/config/YamlGrootStreamConfigParserTest.java @@ -20,9 +20,13 @@ public class YamlGrootStreamConfigParserTest { Assertions.assertNotNull(config); Assertions.assertNotNull(config.getCommonConfig().getKnowledgeBaseConfig()); Assertions.assertTrue(config.getCommonConfig().getKnowledgeBaseConfig().size() > 0); - Assertions.assertTrue(Arrays.asList("http", "hdfs","local").contains(config.getCommonConfig().getKnowledgeBaseConfig().get(0).getFsType())); + Assertions.assertTrue( + Arrays.asList("http", "hdfs", "local") + .contains( + config.getCommonConfig() + .getKnowledgeBaseConfig() + .get(0) + .getFsType())); Assertions.assertTrue(config.getUDFPluginConfig().size() > 0); - } - } diff --git a/groot-examples/end-to-end-example/src/main/java/com/geedgenetworks/example/GrootStreamExample.java b/groot-examples/end-to-end-example/src/main/java/com/geedgenetworks/example/GrootStreamExample.java index d313c9f..2e21e49 100644 --- a/groot-examples/end-to-end-example/src/main/java/com/geedgenetworks/example/GrootStreamExample.java +++ b/groot-examples/end-to-end-example/src/main/java/com/geedgenetworks/example/GrootStreamExample.java @@ -13,7 +13,7 @@ import java.nio.file.Paths; public class GrootStreamExample { public static void main(String[] args) throws FileNotFoundException, URISyntaxException
{ - String configPath = args.length > 0 ? args[0] : "/examples/object_statistics_mock_to_print.yaml"; + String configPath = args.length > 0 ? args[0] : "/examples/session_record_mock_to_print.yaml"; String configFile = getTestConfigFile(configPath); ExecuteCommandArgs executeCommandArgs = new ExecuteCommandArgs(); executeCommandArgs.setConfigFile(configFile); diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_clickhouse.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_clickhouse.yaml index fba50a2..2241268 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_clickhouse.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_clickhouse.yaml @@ -33,7 +33,7 @@ sources: # [object] Define connector source processing_pipelines: processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection functions: - function: SNOWFLAKE_ID lookup_fields: [ ] diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print.yaml index daf6e32..00f2a7d 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print.yaml @@ -8,13 +8,13 @@ sources: filters: filter_operator: - type: com.geedgenetworks.core.filter.AviatorFilter + type: aviator properties: expression: event.server_ip != '12.12.12.12' processing_pipelines: projection_processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection remove_fields: [http_request_line, http_response_line, http_response_content_type] functions: - function: DROP diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_avro_schema.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_avro_schema.yaml index a88e36c..704baee 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_avro_schema.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_avro_schema.yaml @@ -8,13 +8,13 @@ sources: filters: filter_operator: - type: com.geedgenetworks.core.filter.AviatorFilter + type: aviator properties: expression: event.server_ip != '12.12.12.12' processing_pipelines: projection_processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection remove_fields: [http_request_line, http_response_line, http_response_content_type] functions: - function: DROP diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml index cfd3917..1e1e13e 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml @@ -30,18 +30,18 @@ sources: filters: filter: - type: com.geedgenetworks.core.filter.AviatorFilter + type: aviator properties: expression: event.decoded_as == 'HTTP' preprocessing_pipelines: transform_processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection remove_fields: [client_ip] processing_pipelines: session_record_processor: - type: 
com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection remove_fields: [device_tag] output_fields: [log_id, client_ip, client_geolocation, client_asn, server_domain, server_ip, server_geolocation, server_asn] functions: diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/object_statistics_mock_to_print.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/object_statistics_mock_to_print.yaml index 02d18a1..b787b01 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/object_statistics_mock_to_print.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/object_statistics_mock_to_print.yaml @@ -7,7 +7,7 @@ sources: # [object] Define connector source processing_pipelines: etl_processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection functions: - function: UNIX_TIMESTAMP_CONVERTER lookup_fields: [ __timestamp ] diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/session_record_mock_to_print.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/session_record_mock_to_print.yaml index 1c079a7..f115643 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/session_record_mock_to_print.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/session_record_mock_to_print.yaml @@ -7,7 +7,7 @@ sources: # [object] Define connector source processing_pipelines: etl_processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection functions: - function: SNOWFLAKE_ID lookup_fields: [''] @@ -115,7 +115,7 @@ application: # [object] Define job configuration object-reuse: true topology: - name: mock_source - downstream: [ etl_processor ] + downstream: [ print_sink ] - name: etl_processor downstream: [ print_sink ] - name: print_sink diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/statistics_rule_mock_to_print.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/statistics_rule_mock_to_print.yaml index d38a855..e3e1dbc 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/statistics_rule_mock_to_print.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/statistics_rule_mock_to_print.yaml @@ -7,7 +7,7 @@ sources: # [object] Define connector source processing_pipelines: etl_processor: - type: com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl + type: projection functions: - function: UNIX_TIMESTAMP_CONVERTER lookup_fields: [ __timestamp ] @@ -49,7 +49,7 @@ sinks: application: # [object] Define job configuration env: - name: mock_to_print + name: statistics_rule_mock_data_to_kafka parallelism: 3 shade.identifier: aes pipeline: diff --git a/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/EnvParameterTest.java b/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/EnvParameterIT.java index 02ce012..aa00d8d 100644 --- a/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/EnvParameterTest.java +++ b/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/EnvParameterIT.java @@ -28,7 +28,7 @@ import static org.awaitility.Awaitility.await; value = {TestContainerId.FLINK_1_17}, type = {}, disabledReason = "only flink adjusts the parameter configuration rules") -public class EnvParameterTest extends TestSuiteBase { +public class EnvParameterIT extends TestSuiteBase { 
@TestContainerExtension protected final ContainerExtendedFactory extendedFactory = container -> { diff --git a/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/InlineToPrintTest.java b/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/InlineToPrintIT.java index 1e94cdc..d4cdcbd 100644 --- a/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/InlineToPrintTest.java +++ b/groot-tests/test-e2e-base/src/test/java/com/geedgenetworks/test/e2e/base/InlineToPrintIT.java @@ -24,7 +24,7 @@ import static org.awaitility.Awaitility.await; value = {TestContainerId.FLINK_1_17}, type = {}, disabledReason = "only flink adjusts the parameter configuration rules") -public class InlineToPrintTest extends TestSuiteBase { +public class InlineToPrintIT extends TestSuiteBase { @TestTemplate public void testInlineToPrint(AbstractTestFlinkContainer container) throws IOException, InterruptedException { diff --git a/groot-tests/test-e2e-kafka/src/test/java/com/geedgenetworks/test/e2e/kafka/KafkaConnectorTest.java b/groot-tests/test-e2e-kafka/src/test/java/com/geedgenetworks/test/e2e/kafka/KafkaConnectorIT.java index 09f8aca..d01a514 100644 --- a/groot-tests/test-e2e-kafka/src/test/java/com/geedgenetworks/test/e2e/kafka/KafkaConnectorTest.java +++ b/groot-tests/test-e2e-kafka/src/test/java/com/geedgenetworks/test/e2e/kafka/KafkaConnectorIT.java @@ -44,7 +44,7 @@ import static org.awaitility.Awaitility.await; @DisabledOnContainer( value = {TestContainerId.FLINK_1_17}, disabledReason = "Override TestSuiteBase @DisabledOnContainer") -public class KafkaConnectorTest extends TestSuiteBase implements TestResource { +public class KafkaConnectorIT extends TestSuiteBase implements TestResource { private KafkaContainer kafkaContainer; @@ -29,13 +29,16 @@ <maven.compiler.source>${java.version}</maven.compiler.source> <maven.compiler.target>${java.version}</maven.compiler.target> <maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version> + <maven-shade-plugin.version>3.3.0</maven-shade-plugin.version> <maven-compiler-plugin.version>3.6.1</maven-compiler-plugin.version> + <maven-source-plugin.version>3.0.1</maven-source-plugin.version> <maven-helper-plugin.version>3.2.0</maven-helper-plugin.version> <maven-dependency-plugin.version>3.1.1</maven-dependency-plugin.version> <flatten-maven-plugin.version>1.3.0</flatten-maven-plugin.version> + <maven-git-commit-id-plugin.version>4.0.4</maven-git-commit-id-plugin.version> <testcontainer.version>1.19.6</testcontainer.version> <awaitility.version>4.2.0</awaitility.version> - <spotless.version>2.29.0</spotless.version> + <spotless.version>2.40.0</spotless.version> <slf4j.version>1.7.25</slf4j.version> <log4j2.version>2.17.1</log4j2.version> <log4j2-disruptor.version>3.4.4</log4j2-disruptor.version> @@ -66,9 +69,15 @@ <hazelcast.version>5.1</hazelcast.version> <quartz.version>2.3.2</quartz.version> <hadoop.version>2.7.1</hadoop.version> - <!--Option config--> + <!--Option config --> + <skipUT>false</skipUT> + <!-- skip unit test --> + <skipIT>true</skipIT> + <!-- skip integration test --> <test.dependency.skip>true</test.dependency.skip> <skip.spotless>true</skip.spotless> + <skip.checkstyle>true</skip.checkstyle> + <skip.spotbugs>true</skip.spotbugs> <flink.scope>provided</flink.scope> <grootstream.shaded.package>com.geedgenetworks.shaded</grootstream.shaded.package> <auto-service.version>1.0.1</auto-service.version> @@ -119,7 +128,6 @@ <version>${log4j2.version}</version> </dependency> - <!-- Exclude the logging 
bridges via provided scope --> <!-- log4j1.x bridge to slf4j Use of the SLF4J adapter (log4j-over-slf4j) together with the SLF4J bridge (slf4j-log4j12) should never be attempted as it will cause events to endlessly be routed between SLF4J and Log4j 1 @@ -262,7 +270,6 @@ <scope>import</scope> </dependency> - <dependency> <groupId>org.antlr</groupId> <artifactId>antlr4</artifactId> @@ -379,7 +386,6 @@ <version>${aviator.version}</version> </dependency> - <dependency> <groupId>org.apache.hbase</groupId> <artifactId>hbase-client</artifactId> @@ -491,21 +497,226 @@ </dependency> <!-- ***************** slf4j & provider & bridges end ***************** --> - </dependencies> <build> <finalName>${project.artifactId}-${project.version}</finalName> - <plugins> + <pluginManagement> + <plugins> + <!-- java compiler --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>${maven-compiler-plugin.version}</version> + <configuration> + <source>${maven.compiler.source}</source> + <target>${maven.compiler.target}</target> + <forceJavacCompilerUse>true</forceJavacCompilerUse> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>3.1.2</version> + <configuration> + <skip>${skipUT}</skip> + <systemPropertyVariables> + <jacoco-agent.destfile>${project.build.directory}/jacoco.exec</jacoco-agent.destfile> + </systemPropertyVariables> + <excludes> + <exclude>**/*IT.java</exclude> + </excludes> + <classpathDependencyExcludes> + <!-- + The logger provider & bridges declared under 'provided' scope should be explicitly excluded from testing as below. + --> + <classpathDependencyExclude>org.slf4j:slf4j-jdk14</classpathDependencyExclude> + <classpathDependencyExclude>org.slf4j:slf4j-jcl</classpathDependencyExclude> + <classpathDependencyExclude>org.slf4j:slf4j-nop</classpathDependencyExclude> + <classpathDependencyExclude>org.slf4j:slf4j-simple</classpathDependencyExclude> + <classpathDependencyExclude>org.slf4j:slf4j-reload4j</classpathDependencyExclude> + <classpathDependencyExclude>org.slf4j:slf4j-log4j12</classpathDependencyExclude> + <classpathDependencyExclude>org.slf4j:log4j-over-slf4j</classpathDependencyExclude> + <classpathDependencyExclude>commons-logging:commons-logging</classpathDependencyExclude> + <classpathDependencyExclude>log4j:log4j</classpathDependencyExclude> + <classpathDependencyExclude>ch.qos.logback:logback-classic</classpathDependencyExclude> + <classpathDependencyExclude>ch.qos.logback:logback-core</classpathDependencyExclude> + <classpathDependencyExclude>org.apache.logging.log4j:log4j-to-slf4j</classpathDependencyExclude> + </classpathDependencyExcludes> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>3.1.2</version> + <configuration> + <skip>${skipIT}</skip> + </configuration> + <executions> + <execution> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-source-plugin</artifactId> + <version>${maven-source-plugin.version}</version> + <executions> + <execution> + <id>attach-sources</id> + <goals> + <goal>jar-no-fork</goal> + </goals> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>pl.project13.maven</groupId> + <artifactId>git-commit-id-plugin</artifactId> + 
<version>${maven-git-commit-id-plugin.version}</version> + </plugin> + + <!-- make sure that flatten runs after shaded --> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>flatten-maven-plugin</artifactId> + <version>${flatten-maven-plugin.version}</version> + <configuration> + <updatePomFile>true</updatePomFile> + <flattenMode>resolveCiFriendliesOnly</flattenMode> + </configuration> + <executions> + <execution> + <id>flatten</id> + <goals> + <goal>flatten</goal> + </goals> + <phase>process-resources</phase> + </execution> + <execution> + <id>flatten.clean</id> + <goals> + <goal>clean</goal> + </goals> + <phase>clean</phase> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-enforcer-plugin</artifactId> + <version>3.0.0-M3</version> + <executions> + <execution> + <id>release-version-check</id> + <goals> + <goal>enforce</goal> + </goals> + <configuration> + <rules> + <requireReleaseVersion> + <message>SNAPSHOT versions ${project.version} are not allowed.</message> + </requireReleaseVersion> + </rules> + </configuration> + </execution> + <execution> + <id>snapshot-version-check</id> + <goals> + <goal>enforce</goal> + </goals> + <configuration> + <rules> + <requireSnapshotVersion> + <message>Non-SNAPSHOT versions ${project.version} are not allowed.</message> + </requireSnapshotVersion> + </rules> + </configuration> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.sonarsource.scanner.maven</groupId> + <artifactId>sonar-maven-plugin</artifactId> + <version>3.9.0.2155</version> + </plugin> + <plugin> + <groupId>org.commonjava.maven.plugins</groupId> + <artifactId>directory-maven-plugin</artifactId> + <version>1.0</version> + <executions> + <execution> + <id>directories</id> + <goals> + <goal>directory-of</goal> + </goals> + <phase>initialize</phase> + <configuration> + <property>rootDir</property> + <project> + <groupId>com.geedgenetworks</groupId> + <artifactId>groot-stream</artifactId> + </project> + </configuration> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-checkstyle-plugin</artifactId> + <version>3.1.2</version> + <configuration> + <skip>${skip.checkstyle}</skip> + <configLocation>${rootDir}/develop/checkstyle.xml</configLocation> + <suppressionsLocation>${rootDir}/develop/suppressions.xml</suppressionsLocation> + <includeTestSourceDirectory>true</includeTestSourceDirectory> + <logViolationsToConsole>true</logViolationsToConsole> + <failOnViolation>true</failOnViolation> + </configuration> + <dependencies> + <dependency> + <groupId>com.puppycrawl.tools</groupId> + <artifactId>checkstyle</artifactId> + <version>8.40</version> + </dependency> + </dependencies> + </plugin> + <plugin> + <groupId>com.github.spotbugs</groupId> + <artifactId>spotbugs-maven-plugin</artifactId> + <version>4.4.2.2</version> + <configuration> + <skip>${skip.spotbugs}</skip> + <threshold>Low</threshold> + <effort>default</effort> + <failOnError>true</failOnError> + <xmlOutput>true</xmlOutput> + <excludeFilterFile>${rootDir}/develop/spotbugs-exclude.xml</excludeFilterFile> + <spotbugsXmlOutputDirectory>${project.build.directory}/spotbugs</spotbugsXmlOutputDirectory> + </configuration> + </plugin> + + </plugins> + </pluginManagement> + + <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> - <version>${maven-compiler-plugin.version}</version> <configuration> - 
<source>${maven.compiler.source}</source> - <target>${maven.compiler.target}</target> - <forceJavacCompilerUse>true</forceJavacCompilerUse> + <encoding>UTF-8</encoding> </configuration> </plugin> @@ -513,31 +724,19 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.1</version> + <version>${maven-shade-plugin.version}</version> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> <createDependencyReducedPom>true</createDependencyReducedPom> <promoteTransitiveDependencies>true</promoteTransitiveDependencies> <artifactSet> <excludes> - <exclude>com.google.code.findbugs:jsr305</exclude> - <exclude>org.slf4j:slf4j-api</exclude> - <exclude>org.slf4j:slf4j-jdk14</exclude> - <exclude>org.slf4j:slf4j-jcl</exclude> - <exclude>org.slf4j:slf4j-nop</exclude> - <exclude>org.slf4j:slf4j-simple</exclude> - <exclude>org.slf4j:slf4j-reload4j</exclude> - <exclude>org.slf4j:slf4j-log4j12</exclude> - <exclude>org.slf4j:log4j-over-slf4j</exclude> - <exclude>org.slf4j:jcl-over-slf4j</exclude> + <exclude>com.google.code.findbugs:*</exclude> + <exclude>org.slf4j:*</exclude> + <exclude>ch.qos.logback:*</exclude> <exclude>log4j:*</exclude> <exclude>commons-logging:*</exclude> - <exclude>ch.qos.logback:*</exclude> - <exclude>org.apache.logging.log4j:log4j-api</exclude> - <exclude>org.apache.logging.log4j:log4j-core</exclude> - <exclude>org.apache.logging.log4j:log4j-slf4j-impl</exclude> - <exclude>org.apache.logging.log4j:log4j-1.2-api</exclude> - <exclude>org.apache.logging.log4j:log4j-to-slf4j</exclude> + <exclude>org.apache.logging.log4j:*</exclude> </excludes> </artifactSet> <filters> @@ -561,8 +760,7 @@ <phase>package</phase> <configuration> <transformers> - <transformer - implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/> + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> <!-- <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> <mainClass>com.geedgenetworks.bootstrap.main.GrootStreamServer</mainClass> </transformer>--> @@ -585,36 +783,103 @@ <version>${maven-helper-plugin.version}</version> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <version>${maven-dependency-plugin.version}</version> + </plugin> <plugin> - <groupId>org.codehaus.mojo</groupId> - <artifactId>flatten-maven-plugin</artifactId> - <version>${flatten-maven-plugin.version}</version> - <configuration> - <updatePomFile>true</updatePomFile> - <flattenMode>resolveCiFriendliesOnly</flattenMode> - </configuration> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + </plugin> + + <plugin> + <groupId>org.commonjava.maven.plugins</groupId> + <artifactId>directory-maven-plugin</artifactId> + </plugin> + + <plugin> + <groupId>com.github.spotbugs</groupId> + <artifactId>spotbugs-maven-plugin</artifactId> <executions> <execution> - <id>flatten</id> + <id>findbugs-main</id> <goals> - <goal>flatten</goal> + <goal>check</goal> </goals> - <phase>process-resources</phase> + <phase>compile</phase> </execution> <execution> - <id>flatten.clean</id> + <id>findbugs-test</id> <goals> - <goal>clean</goal> + <goal>check</goal> </goals> - <phase>clean</phase> + <phase>test-compile</phase> + <configuration> + 
<includeTests>true</includeTests> + </configuration> </execution> </executions> </plugin> + <plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - <version>${maven-dependency-plugin.version}</version> + <artifactId>maven-checkstyle-plugin</artifactId> + <executions> + <execution> + <id>java-style-check</id> + <goals> + <goal>check</goal> + </goals> + <phase>compile</phase> + <configuration> + <sourceDirectories>src/main/java</sourceDirectories> + </configuration> + </execution> + <execution> + <id>java-test-style-check</id> + <goals> + <goal>check</goal> + </goals> + <phase>test-compile</phase> + <configuration> + <testSourceDirectories>src/test/java</testSourceDirectories> + <includeTestSourceDirectory>true</includeTestSourceDirectory> + </configuration> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> + <version>0.8.7</version> + <executions> + <execution> + <id>default-agent</id> + <goals> + <goal>prepare-agent</goal> + </goals> + <phase>initialize</phase> + </execution> + <execution> + <id>default-report</id> + <goals> + <goal>report</goal> + </goals> + <phase>verify</phase> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>flatten-maven-plugin</artifactId> </plugin> <plugin> @@ -628,11 +893,11 @@ <exclude>src/main/antlr4/*.*</exclude> </excludes> <googleJavaFormat> - <version>1.7</version> + <version>1.17.0</version> <style>AOSP</style> </googleJavaFormat> - <removeUnusedImports/> - <formatAnnotations/> + <removeUnusedImports /> + <formatAnnotations /> <importOrder> <order>com.geedgenetworks,org.apache,org,,javax,java,\#</order> </importOrder> @@ -680,7 +945,7 @@ <exclude>**/.github/**/*.md</exclude> <exclude>**/*.json</exclude> </excludes> - <flexmark/> + <flexmark /> </markdown> <upToDateChecking> <enabled>true</enabled> @@ -697,185 +962,8 @@ </executions> </plugin> - <plugin> - <groupId>org.commonjava.maven.plugins</groupId> - <artifactId>directory-maven-plugin</artifactId> - <executions> - <execution> - <id>directories</id> - <goals> - <goal>directory-of</goal> - </goals> - <phase>initialize</phase> - <configuration> - <property>rootDir</property> - <project> - <groupId>com.geedgenetworks</groupId> - <artifactId>groot-stream</artifactId> - </project> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-checkstyle-plugin</artifactId> - <executions> - <execution> - <id>java-style-check</id> - <phase>compile</phase> - <goals> - <goal>check</goal> - </goals> - <configuration> - <sourceDirectories>src/main/java</sourceDirectories> - </configuration> - </execution> - <execution> - <id>java-test-style-check</id> - <phase>test-compile</phase> - <goals> - <goal>check</goal> - </goals> - <configuration> - <testSourceDirectories>src/test/java</testSourceDirectories> - <includeTestSourceDirectory>true</includeTestSourceDirectory> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>com.github.spotbugs</groupId> - <artifactId>spotbugs-maven-plugin</artifactId> - <executions> - <execution> - <id>findbugs-main</id> - <phase>compile</phase> - <goals> - <goal>check</goal> - </goals> - </execution> - <execution> - <id>findbugs-test</id> - <phase>test-compile</phase> - <goals> - <goal>check</goal> - </goals> - <configuration> - <includeTests>true</includeTests> - </configuration> - 
</execution> - </executions> - </plugin> - <plugin> - <groupId>org.jacoco</groupId> - <artifactId>jacoco-maven-plugin</artifactId> - <version>0.8.7</version> - <executions> - <execution> - <id>default-agent</id> - <goals> - <goal>prepare-agent</goal> - </goals> - <phase>initialize</phase> - </execution> - <execution> - <id>default-report</id> - <phase>verify</phase> - <goals> - <goal>report</goal> - </goals> - </execution> - </executions> - </plugin> </plugins> - <pluginManagement> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-surefire-plugin</artifactId> - <version>3.1.2</version> - </plugin> - <plugin> - <groupId>org.commonjava.maven.plugins</groupId> - <artifactId>directory-maven-plugin</artifactId> - <version>1.0</version> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-checkstyle-plugin</artifactId> - <version>3.1.2</version> - <configuration> - <skip>true</skip> - <configLocation>${rootDir}/develop/checkstyle.xml</configLocation> - <suppressionsLocation>${rootDir}/develop/suppressions.xml</suppressionsLocation> - <includeTestSourceDirectory>true</includeTestSourceDirectory> - <logViolationsToConsole>true</logViolationsToConsole> - <failOnViolation>true</failOnViolation> - </configuration> - <dependencies> - <dependency> - <groupId>com.puppycrawl.tools</groupId> - <artifactId>checkstyle</artifactId> - <version>8.40</version> - </dependency> - </dependencies> - </plugin> - <plugin> - <groupId>com.github.spotbugs</groupId> - <artifactId>spotbugs-maven-plugin</artifactId> - <version>4.4.2.2</version> - <configuration> - <skip>true</skip> - <threshold>Low</threshold> - <effort>default</effort> - <failOnError>true</failOnError> - <xmlOutput>true</xmlOutput> - <excludeFilterFile>${rootDir}/develop/spotbugs-exclude.xml</excludeFilterFile> - <spotbugsXmlOutputDirectory>${project.build.directory}/spotbugs</spotbugsXmlOutputDirectory> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-enforcer-plugin</artifactId> - <version>3.0.0-M3</version> - <executions> - <execution> - <id>release-version-check</id> - <goals> - <goal>enforce</goal> - </goals> - <configuration> - <rules> - <requireReleaseVersion> - <message>SNAPSHOT versions ${project.version} are not allowed.</message> - </requireReleaseVersion> - </rules> - </configuration> - </execution> - <execution> - <id>snapshot-version-check</id> - <goals> - <goal>enforce</goal> - </goals> - <configuration> - <rules> - <requireSnapshotVersion> - <message>Non-SNAPSHOT versions ${project.version} are not allowed.</message> - </requireSnapshotVersion> - </rules> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.sonarsource.scanner.maven</groupId> - <artifactId>sonar-maven-plugin</artifactId> - <version>3.9.0.2155</version> - </plugin> - </plugins> - </pluginManagement> </build> <repositories> @@ -892,9 +980,9 @@ <distributionManagement> <repository> + <uniqueVersion>true</uniqueVersion> <id>platform-releases</id> <url>http://192.168.40.153:8081/repository/platform-release/</url> - <uniqueVersion>true</uniqueVersion> </repository> <snapshotRepository> <id>platform-snapshots</id> |
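With the surefire/failsafe configuration added to the root pom above, integration tests (the `*IT` classes) are skipped by default through the `skipIT` property, while unit tests still run unless `skipUT` is set. A sketch of the resulting commands, assuming the repository's Maven wrapper:

```shell
# Default build: unit tests run, *IT integration tests are skipped (skipIT defaults to true)
./mvnw clean verify

# Opt in to the integration tests bound to maven-failsafe-plugin
./mvnw clean verify -DskipIT=false

# Run only the integration tests, skipping unit tests as well
./mvnw clean verify -DskipIT=false -DskipUT=true
```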
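For reference, a minimal sketch of the identifier-based `type` convention used in the job templates above, where the short identifiers `aviator` and `projection` stand in for the fully qualified implementation class names; the operator names and property values here are illustrative only:

```yaml
filters:
  filter_operator:        # filter name, must be unique
    type: aviator         # identifier replacing com.geedgenetworks.core.filter.AviatorFilter
    properties:
      expression: event.server_ip != '12.12.12.12'

processing_pipelines:
  processor:              # processor name, must be unique
    type: projection      # identifier replacing com.geedgenetworks.core.processor.projection.ProjectionProcessorImpl
    remove_fields: [device_tag]
```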
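The RENAME documentation earlier in this diff notes that a single function may carry both `rename_fields` and `rename_expression`, with the explicit `rename_fields` mappings applied first. A hypothetical sketch combining the two, following the nested parameter style of the other function examples (field names are illustrative):

```yaml
- function: RENAME
  parameters:
    rename_fields:                   # applied first: explicit one-to-one renames
      - timestamp_ms: recv_time_ms
    rename_expression: key=string.replace_all(key,'tags_',''); return key;   # then applied to every remaining field name
```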
