summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordoufenghu <[email protected]>2024-10-18 16:41:28 +0800
committerdoufenghu <[email protected]>2024-10-18 16:41:28 +0800
commitfd54e003f5e852ad6735e400d3feca024dc5e5f3 (patch)
tree433e69c4f324939e323842745a241731f887d8eb
parent1c8baf9c355db3df000278a5e1d9860c5baf4635 (diff)
[Improve][docs] Add event model description.
-rw-r--r--config/udf.plugins8
-rw-r--r--docs/develop-guide.md22
-rw-r--r--groot-common/src/main/resources/udf.plugins11
-rw-r--r--groot-examples/cn-udf-example/src/main/resources/udf.plugins48
-rw-r--r--groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml2
5 files changed, 62 insertions, 29 deletions
diff --git a/config/udf.plugins b/config/udf.plugins
index 25ce168..45abeea 100644
--- a/config/udf.plugins
+++ b/config/udf.plugins
@@ -15,6 +15,9 @@ com.geedgenetworks.core.udf.Rename
com.geedgenetworks.core.udf.SnowflakeId
com.geedgenetworks.core.udf.StringJoiner
com.geedgenetworks.core.udf.UnixTimestampConverter
+com.geedgenetworks.core.udf.uuid.UUID
+com.geedgenetworks.core.udf.uuid.UUIDv5
+com.geedgenetworks.core.udf.uuid.UUIDv7
com.geedgenetworks.core.udf.udaf.NumberSum
com.geedgenetworks.core.udf.udaf.CollectList
com.geedgenetworks.core.udf.udaf.CollectSet
@@ -29,7 +32,4 @@ com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantile
com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantiles
com.geedgenetworks.core.udf.udtf.JsonUnroll
com.geedgenetworks.core.udf.udtf.Unroll
-com.geedgenetworks.core.udf.udtf.PathUnroll
-com.geedgenetworks.core.udf.uuid.UUID
-com.geedgenetworks.core.udf.uuid.UUIDv5
-com.geedgenetworks.core.udf.uuid.UUIDv7
+com.geedgenetworks.core.udf.udtf.PathUnroll \ No newline at end of file
diff --git a/docs/develop-guide.md b/docs/develop-guide.md
index 2742cee..75e8803 100644
--- a/docs/develop-guide.md
+++ b/docs/develop-guide.md
@@ -15,6 +15,28 @@
| groot-docs | Docs module of groot-stream, which is responsible for providing documents. |
| groot-release | Release module of groot-stream, which is responsible for providing release scripts. |
+## Event Model
+Groot Stream based all stream processing on data records common known as events. A event is a collection of key-value pairs(fields). As follows:
+
+```json
+{
+ "__timestamp": "<Timestamp in UNIX epoch format (milliseconds)>",
+ "__input_id": "ID/Name of the source that delivered the event",
+ "__window_start_timestamp" : "<Timestamp in UNIX epoch format (milliseconds)>",
+ "__window_end_timestamp" : "<Timestamp in UNIX epoch format (milliseconds)>",
+ "key1": "<value1>",
+ "key2": "<value2>",
+ "keyN": "<valueN>"
+}
+```
+Groot Stream add internal fields during pipeline processing. A few notes about internal fields:
+- Internal fields start with a double underscore `__`.
+- Each source can add one or many internal fields to the each event. For example, the Kafka source adds both a `__timestamp` and a `__input_id` field.
+- Treat internal fields as read-only. Modifying them can result in unintended consequences to your data flows.
+- Internal fields only exist for the duration of the event processing pipeline. They are not documented under sources or sinks.
+- If you do not configure a timestamp for extraction, the Pipeline process assigns the current time (in UNIX epoch format) to the __timestamp field.
+- If you have multiple sources, you can determine which source the event came form by looking at the `__input_id` field. For example, the Kafka source adds the topic name to the `__input_id` field.
+
## How to write a high quality Git commit message
> [purpose] [module name] [sub-module name] Description (JIRA Issue ID)
diff --git a/groot-common/src/main/resources/udf.plugins b/groot-common/src/main/resources/udf.plugins
index 26a6754..45abeea 100644
--- a/groot-common/src/main/resources/udf.plugins
+++ b/groot-common/src/main/resources/udf.plugins
@@ -3,7 +3,9 @@ com.geedgenetworks.core.udf.CurrentUnixTimestamp
com.geedgenetworks.core.udf.DecodeBase64
com.geedgenetworks.core.udf.Domain
com.geedgenetworks.core.udf.Drop
+com.geedgenetworks.core.udf.EncodeBase64
com.geedgenetworks.core.udf.Eval
+com.geedgenetworks.core.udf.Flatten
com.geedgenetworks.core.udf.FromUnixTimestamp
com.geedgenetworks.core.udf.GenerateStringArray
com.geedgenetworks.core.udf.GeoIpLookup
@@ -13,7 +15,9 @@ com.geedgenetworks.core.udf.Rename
com.geedgenetworks.core.udf.SnowflakeId
com.geedgenetworks.core.udf.StringJoiner
com.geedgenetworks.core.udf.UnixTimestampConverter
-com.geedgenetworks.core.udf.Flatten
+com.geedgenetworks.core.udf.uuid.UUID
+com.geedgenetworks.core.udf.uuid.UUIDv5
+com.geedgenetworks.core.udf.uuid.UUIDv7
com.geedgenetworks.core.udf.udaf.NumberSum
com.geedgenetworks.core.udf.udaf.CollectList
com.geedgenetworks.core.udf.udaf.CollectSet
@@ -28,7 +32,4 @@ com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantile
com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantiles
com.geedgenetworks.core.udf.udtf.JsonUnroll
com.geedgenetworks.core.udf.udtf.Unroll
-com.geedgenetworks.core.udf.udtf.PathUnroll
-com.geedgenetworks.core.udf.uuid.UUID
-com.geedgenetworks.core.udf.uuid.UUIDv5
-com.geedgenetworks.core.udf.uuid.UUIDv7 \ No newline at end of file
+com.geedgenetworks.core.udf.udtf.PathUnroll \ No newline at end of file
diff --git a/groot-examples/cn-udf-example/src/main/resources/udf.plugins b/groot-examples/cn-udf-example/src/main/resources/udf.plugins
index 6df805d..45abeea 100644
--- a/groot-examples/cn-udf-example/src/main/resources/udf.plugins
+++ b/groot-examples/cn-udf-example/src/main/resources/udf.plugins
@@ -1,25 +1,35 @@
-com.geedgenetworks.core.udf.SnowflakeId
-com.geedgenetworks.core.udf.UnixTimestampConverter
com.geedgenetworks.core.udf.AsnLookup
+com.geedgenetworks.core.udf.CurrentUnixTimestamp
+com.geedgenetworks.core.udf.DecodeBase64
+com.geedgenetworks.core.udf.Domain
+com.geedgenetworks.core.udf.Drop
+com.geedgenetworks.core.udf.EncodeBase64
com.geedgenetworks.core.udf.Eval
+com.geedgenetworks.core.udf.Flatten
+com.geedgenetworks.core.udf.FromUnixTimestamp
com.geedgenetworks.core.udf.GenerateStringArray
com.geedgenetworks.core.udf.GeoIpLookup
-com.geedgenetworks.core.udf.cn.L7ProtocolAndAppExtract
-com.geedgenetworks.core.udf.cn.IdcRenterLookup
-com.geedgenetworks.core.udf.cn.LinkDirectionLookup
-com.geedgenetworks.core.udf.cn.FqdnCategoryLookup
-com.geedgenetworks.core.udf.cn.IcpLookup
-com.geedgenetworks.core.udf.cn.FqdnWhoisLookup
-com.geedgenetworks.core.udf.cn.DnsServerInfoLookup
-com.geedgenetworks.core.udf.cn.AppCategoryLookup
-com.geedgenetworks.core.udf.cn.IpZoneLookup
-com.geedgenetworks.core.udf.cn.VpnLookup
-com.geedgenetworks.core.udf.cn.AnonymityLookup
-com.geedgenetworks.core.udf.cn.IocLookup
-com.geedgenetworks.core.udf.cn.UserDefineTagLookup
-com.geedgenetworks.core.udf.cn.FieldsMerge
-com.geedgenetworks.core.udf.cn.ArrayElementsPrepend
-com.geedgenetworks.core.udf.cn.IntelligenceIndicatorLookup
+com.geedgenetworks.core.udf.JsonExtract
+com.geedgenetworks.core.udf.PathCombine
+com.geedgenetworks.core.udf.Rename
+com.geedgenetworks.core.udf.SnowflakeId
+com.geedgenetworks.core.udf.StringJoiner
+com.geedgenetworks.core.udf.UnixTimestampConverter
com.geedgenetworks.core.udf.uuid.UUID
com.geedgenetworks.core.udf.uuid.UUIDv5
-com.geedgenetworks.core.udf.uuid.UUIDv7 \ No newline at end of file
+com.geedgenetworks.core.udf.uuid.UUIDv7
+com.geedgenetworks.core.udf.udaf.NumberSum
+com.geedgenetworks.core.udf.udaf.CollectList
+com.geedgenetworks.core.udf.udaf.CollectSet
+com.geedgenetworks.core.udf.udaf.LongCount
+com.geedgenetworks.core.udf.udaf.Mean
+com.geedgenetworks.core.udf.udaf.LastValue
+com.geedgenetworks.core.udf.udaf.FirstValue
+com.geedgenetworks.core.udf.udaf.hlld.Hlld
+com.geedgenetworks.core.udf.udaf.hlld.HlldApproxCountDistinct
+com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogram
+com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantile
+com.geedgenetworks.core.udf.udaf.HdrHistogram.HdrHistogramQuantiles
+com.geedgenetworks.core.udf.udtf.JsonUnroll
+com.geedgenetworks.core.udf.udtf.Unroll
+com.geedgenetworks.core.udf.udtf.PathUnroll \ No newline at end of file
diff --git a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml
index bf122b8..8736aee 100644
--- a/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml
+++ b/groot-examples/end-to-end-example/src/main/resources/examples/inline_to_print_test.yaml
@@ -43,7 +43,7 @@ processing_pipelines:
session_record_processor:
type: projection
remove_fields: [device_tag]
- output_fields: [log_id, client_ip, client_geolocation, client_asn, server_domain, server_ip, server_geolocation, server_asn, log_uuid, log_uuid_v7, ip_uuid]
+ #output_fields: [log_id, device_tag, client_ip, client_geolocation, client_asn, server_domain, server_ip, server_geolocation, server_asn, log_uuid, log_uuid_v7, ip_uuid]
functions:
- function: DROP
lookup_fields: []