diff options
| -rw-r--r-- | config/udf.plugins | 8 | ||||
| -rw-r--r-- | docs/develop-guide.md | 22 | ||||
| -rw-r--r-- | groot-common/src/main/resources/udf.plugins | 11 | ||||
| -rw-r--r-- | groot-examples/cn-udf-example/src/main/resources/udf.plugins | 48 | ||||
| -rw-r--r-- | groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml | 2 |
5 files changed, 62 insertions, 29 deletions
diff --git a/config/udf.plugins b/config/udf.plugins index 25ce168..45abeea 100644 --- a/config/udf.plugins +++ b/config/udf.plugins @@ -15,6 +15,9 @@ com.geedgenetworks.core.udf.Rename com.geedgenetworks.core.udf.SnowflakeId com.geedgenetworks.core.udf.StringJoiner com.geedgenetworks.core.udf.UnixTimestampConverter +com.geedgenetworks.core.udf.uuid.UUID +com.geedgenetworks.core.udf.uuid.UUIDv5 +com.geedgenetworks.core.udf.uuid.UUIDv7 com.geedgenetworks.core.udf.udaf.NumberSum com.geedgenetworks.core.udf.udaf.CollectList com.geedgenetworks.core.udf.udaf.CollectSet @@ -29,7 +32,4 @@ com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantile com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantiles com.geedgenetworks.core.udf.udtf.JsonUnroll com.geedgenetworks.core.udf.udtf.Unroll -com.geedgenetworks.core.udf.udtf.PathUnroll -com.geedgenetworks.core.udf.uuid.UUID -com.geedgenetworks.core.udf.uuid.UUIDv5 -com.geedgenetworks.core.udf.uuid.UUIDv7 +com.geedgenetworks.core.udf.udtf.PathUnroll
\ No newline at end of file diff --git a/docs/develop-guide.md b/docs/develop-guide.md index 2742cee..75e8803 100644 --- a/docs/develop-guide.md +++ b/docs/develop-guide.md @@ -15,6 +15,28 @@ | groot-docs | Docs module of groot-stream, which is responsible for providing documents. | | groot-release | Release module of groot-stream, which is responsible for providing release scripts. | +## Event Model +Groot Stream based all stream processing on data records common known as events. A event is a collection of key-value pairs(fields). As follows: + +```json +{ + "__timestamp": "<Timestamp in UNIX epoch format (milliseconds)>", + "__input_id": "ID/Name of the source that delivered the event", + "__window_start_timestamp" : "<Timestamp in UNIX epoch format (milliseconds)>", + "__window_end_timestamp" : "<Timestamp in UNIX epoch format (milliseconds)>", + "key1": "<value1>", + "key2": "<value2>", + "keyN": "<valueN>" +} +``` +Groot Stream add internal fields during pipeline processing. A few notes about internal fields: +- Internal fields start with a double underscore `__`. +- Each source can add one or many internal fields to the each event. For example, the Kafka source adds both a `__timestamp` and a `__input_id` field. +- Treat internal fields as read-only. Modifying them can result in unintended consequences to your data flows. +- Internal fields only exist for the duration of the event processing pipeline. They are not documented under sources or sinks. +- If you do not configure a timestamp for extraction, the Pipeline process assigns the current time (in UNIX epoch format) to the __timestamp field. +- If you have multiple sources, you can determine which source the event came form by looking at the `__input_id` field. For example, the Kafka source adds the topic name to the `__input_id` field. + ## How to write a high quality Git commit message > [purpose] [module name] [sub-module name] Description (JIRA Issue ID) diff --git a/groot-common/src/main/resources/udf.plugins b/groot-common/src/main/resources/udf.plugins index 26a6754..45abeea 100644 --- a/groot-common/src/main/resources/udf.plugins +++ b/groot-common/src/main/resources/udf.plugins @@ -3,7 +3,9 @@ com.geedgenetworks.core.udf.CurrentUnixTimestamp com.geedgenetworks.core.udf.DecodeBase64 com.geedgenetworks.core.udf.Domain com.geedgenetworks.core.udf.Drop +com.geedgenetworks.core.udf.EncodeBase64 com.geedgenetworks.core.udf.Eval +com.geedgenetworks.core.udf.Flatten com.geedgenetworks.core.udf.FromUnixTimestamp com.geedgenetworks.core.udf.GenerateStringArray com.geedgenetworks.core.udf.GeoIpLookup @@ -13,7 +15,9 @@ com.geedgenetworks.core.udf.Rename com.geedgenetworks.core.udf.SnowflakeId com.geedgenetworks.core.udf.StringJoiner com.geedgenetworks.core.udf.UnixTimestampConverter -com.geedgenetworks.core.udf.Flatten +com.geedgenetworks.core.udf.uuid.UUID +com.geedgenetworks.core.udf.uuid.UUIDv5 +com.geedgenetworks.core.udf.uuid.UUIDv7 com.geedgenetworks.core.udf.udaf.NumberSum com.geedgenetworks.core.udf.udaf.CollectList com.geedgenetworks.core.udf.udaf.CollectSet @@ -28,7 +32,4 @@ com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantile com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantiles com.geedgenetworks.core.udf.udtf.JsonUnroll com.geedgenetworks.core.udf.udtf.Unroll -com.geedgenetworks.core.udf.udtf.PathUnroll -com.geedgenetworks.core.udf.uuid.UUID -com.geedgenetworks.core.udf.uuid.UUIDv5 -com.geedgenetworks.core.udf.uuid.UUIDv7
\ No newline at end of file +com.geedgenetworks.core.udf.udtf.PathUnroll
\ No newline at end of file diff --git a/groot-examples/cn-udf-example/src/main/resources/udf.plugins b/groot-examples/cn-udf-example/src/main/resources/udf.plugins index 6df805d..45abeea 100644 --- a/groot-examples/cn-udf-example/src/main/resources/udf.plugins +++ b/groot-examples/cn-udf-example/src/main/resources/udf.plugins @@ -1,25 +1,35 @@ -com.geedgenetworks.core.udf.SnowflakeId -com.geedgenetworks.core.udf.UnixTimestampConverter com.geedgenetworks.core.udf.AsnLookup +com.geedgenetworks.core.udf.CurrentUnixTimestamp +com.geedgenetworks.core.udf.DecodeBase64 +com.geedgenetworks.core.udf.Domain +com.geedgenetworks.core.udf.Drop +com.geedgenetworks.core.udf.EncodeBase64 com.geedgenetworks.core.udf.Eval +com.geedgenetworks.core.udf.Flatten +com.geedgenetworks.core.udf.FromUnixTimestamp com.geedgenetworks.core.udf.GenerateStringArray com.geedgenetworks.core.udf.GeoIpLookup -com.geedgenetworks.core.udf.cn.L7ProtocolAndAppExtract -com.geedgenetworks.core.udf.cn.IdcRenterLookup -com.geedgenetworks.core.udf.cn.LinkDirectionLookup -com.geedgenetworks.core.udf.cn.FqdnCategoryLookup -com.geedgenetworks.core.udf.cn.IcpLookup -com.geedgenetworks.core.udf.cn.FqdnWhoisLookup -com.geedgenetworks.core.udf.cn.DnsServerInfoLookup -com.geedgenetworks.core.udf.cn.AppCategoryLookup -com.geedgenetworks.core.udf.cn.IpZoneLookup -com.geedgenetworks.core.udf.cn.VpnLookup -com.geedgenetworks.core.udf.cn.AnonymityLookup -com.geedgenetworks.core.udf.cn.IocLookup -com.geedgenetworks.core.udf.cn.UserDefineTagLookup -com.geedgenetworks.core.udf.cn.FieldsMerge -com.geedgenetworks.core.udf.cn.ArrayElementsPrepend -com.geedgenetworks.core.udf.cn.IntelligenceIndicatorLookup +com.geedgenetworks.core.udf.JsonExtract +com.geedgenetworks.core.udf.PathCombine +com.geedgenetworks.core.udf.Rename +com.geedgenetworks.core.udf.SnowflakeId +com.geedgenetworks.core.udf.StringJoiner +com.geedgenetworks.core.udf.UnixTimestampConverter com.geedgenetworks.core.udf.uuid.UUID com.geedgenetworks.core.udf.uuid.UUIDv5 -com.geedgenetworks.core.udf.uuid.UUIDv7
\ No newline at end of file +com.geedgenetworks.core.udf.uuid.UUIDv7 +com.geedgenetworks.core.udf.udaf.NumberSum +com.geedgenetworks.core.udf.udaf.CollectList +com.geedgenetworks.core.udf.udaf.CollectSet +com.geedgenetworks.core.udf.udaf.LongCount +com.geedgenetworks.core.udf.udaf.Mean +com.geedgenetworks.core.udf.udaf.LastValue +com.geedgenetworks.core.udf.udaf.FirstValue +com.geedgenetworks.core.udf.udaf.hlld.Hlld +com.geedgenetworks.core.udf.udaf.hlld.HlldApproxCountDistinct +com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogram +com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantile +com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantiles +com.geedgenetworks.core.udf.udtf.JsonUnroll +com.geedgenetworks.core.udf.udtf.Unroll +com.geedgenetworks.core.udf.udtf.PathUnroll
\ No newline at end of file diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml index bf122b8..8736aee 100644 --- a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml +++ b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml @@ -43,7 +43,7 @@ processing_pipelines: session_record_processor: type: projection remove_fields: [device_tag] - output_fields: [log_id, client_ip, client_geolocation, client_asn, server_domain, server_ip, server_geolocation, server_asn, log_uuid, log_uuid_v7, ip_uuid] + #output_fields: [log_id, device_tag, client_ip, client_geolocation, client_asn, server_domain, server_ip, server_geolocation, server_asn, log_uuid, log_uuid_v7, ip_uuid] functions: - function: DROP lookup_fields: [] |
