From 30c0a0c7b930585cd283dc44b833c8c597a03a6d Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Fri, 6 Feb 2026 10:46:41 +0100 Subject: [PATCH 01/20] Vendor `opentelemetry-proto` with the necessary files The proto files are ported from opentelemetry-proto 1.10.0[^1] and reduced only to the necessary files (all other files are discareded). [^1]: https://github.com/open-telemetry/opentelemetry-proto/tree/v1.10.0 --- third-party/opentelemetry-proto/LICENSE | 201 +++++ .../metrics/v1/metrics_service.proto | 77 ++ .../proto/common/v1/common.proto | 154 ++++ .../proto/metrics/v1/metrics.proto | 735 ++++++++++++++++++ .../proto/resource/v1/resource.proto | 45 ++ 5 files changed, 1212 insertions(+) create mode 100644 third-party/opentelemetry-proto/LICENSE create mode 100644 third-party/opentelemetry-proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto create mode 100644 third-party/opentelemetry-proto/opentelemetry/proto/common/v1/common.proto create mode 100644 third-party/opentelemetry-proto/opentelemetry/proto/metrics/v1/metrics.proto create mode 100644 third-party/opentelemetry-proto/opentelemetry/proto/resource/v1/resource.proto diff --git a/third-party/opentelemetry-proto/LICENSE b/third-party/opentelemetry-proto/LICENSE new file mode 100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/third-party/opentelemetry-proto/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third-party/opentelemetry-proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto b/third-party/opentelemetry-proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto new file mode 100644 index 00000000000..bc024284415 --- /dev/null +++ b/third-party/opentelemetry-proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto @@ -0,0 +1,77 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package opentelemetry.proto.collector.metrics.v1; + +import "opentelemetry/proto/metrics/v1/metrics.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.metrics.v1"; +option java_outer_classname = "MetricsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/metrics/v1"; + +// Service that can be used to push metrics between one Application +// instrumented with OpenTelemetry and a collector, or between a collector and a +// central collector. +service MetricsService { + rpc Export(ExportMetricsServiceRequest) returns (ExportMetricsServiceResponse) {} +} + +message ExportMetricsServiceRequest { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain one + // element. Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. + repeated opentelemetry.proto.metrics.v1.ResourceMetrics resource_metrics = 1; +} + +message ExportMetricsServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_` MUST have a value of `0` and + // the `error_message` MUST be non-empty. + // + // A `partial_success` message with an empty value (rejected_ = 0 and + // `error_message` = "") is equivalent to it not being set/present. 
Senders + // SHOULD interpret it the same way as in the full success case. + ExportMetricsPartialSuccess partial_success = 1; +} + +message ExportMetricsPartialSuccess { + // The number of rejected data points. + // + // A `rejected_` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_data_points = 1; + + // A developer-facing human-readable message in English. It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set. + string error_message = 2; +} diff --git a/third-party/opentelemetry-proto/opentelemetry/proto/common/v1/common.proto b/third-party/opentelemetry-proto/opentelemetry/proto/common/v1/common.proto new file mode 100644 index 00000000000..c7e3c4687de --- /dev/null +++ b/third-party/opentelemetry-proto/opentelemetry/proto/common/v1/common.proto @@ -0,0 +1,154 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package opentelemetry.proto.common.v1; + +option csharp_namespace = "OpenTelemetry.Proto.Common.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.common.v1"; +option java_outer_classname = "CommonProto"; +option go_package = "go.opentelemetry.io/proto/otlp/common/v1"; + +// Represents any type of attribute value. AnyValue may contain a +// primitive value such as a string or integer or it may contain an arbitrary nested +// object containing arrays, key-value lists and primitives. +message AnyValue { + // The value is one of the listed fields. It is valid for all values to be unspecified + // in which case this AnyValue is considered to be "empty". + oneof value { + string string_value = 1; + bool bool_value = 2; + int64 int_value = 3; + double double_value = 4; + ArrayValue array_value = 5; + KeyValueList kvlist_value = 6; + bytes bytes_value = 7; + // Reference to the string value in ProfilesDictionary.string_table. + // + // Note: This is currently used exclusively in the Profiling signal. + // Implementers of OTLP receivers for signals other than Profiling should + // treat the presence of this value as a non-fatal issue. + // Log an error or warning indicating an unexpected field intended for the + // Profiling signal and process the data as if this value were absent or + // empty, ignoring its semantic content for the non-Profiling signal. + // + // Status: [Development] + int32 string_value_strindex = 8; + } +} + +// ArrayValue is a list of AnyValue messages. We need ArrayValue as a message +// since oneof in AnyValue does not allow repeated fields. +message ArrayValue { + // Array of values. The array may be empty (contain 0 elements). + repeated AnyValue values = 1; +} + +// KeyValueList is a list of KeyValue messages. We need KeyValueList as a message +// since `oneof` in AnyValue does not allow repeated fields. Everywhere else where we need +// a list of KeyValue messages (e.g. 
in Span) we use `repeated KeyValue` directly to +// avoid unnecessary extra wrapping (which slows down the protocol). The 2 approaches +// are semantically equivalent. +message KeyValueList { + // A collection of key/value pairs of key-value pairs. The list may be empty (may + // contain 0 elements). + // + // The keys MUST be unique (it is not allowed to have more than one + // value with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated KeyValue values = 1; +} + +// Represents a key-value pair that is used to store Span attributes, Link +// attributes, etc. +message KeyValue { + // The key name of the pair. + // key_ref MUST NOT be set if key is used. + string key = 1; + + // The value of the pair. + AnyValue value = 2; + + // Reference to the string key in ProfilesDictionary.string_table. + // key MUST NOT be set if key_strindex is used. + // + // Note: This is currently used exclusively in the Profiling signal. + // Implementers of OTLP receivers for signals other than Profiling should + // treat the presence of this key as a non-fatal issue. + // Log an error or warning indicating an unexpected field intended for the + // Profiling signal and process the data as if this value were absent or + // empty, ignoring its semantic content for the non-Profiling signal. + // + // Status: [Development] + int32 key_strindex = 3; +} + +// InstrumentationScope is a message representing the instrumentation scope information +// such as the fully qualified name and version. +message InstrumentationScope { + // A name denoting the Instrumentation scope. + // An empty instrumentation scope name means the name is unknown. + string name = 1; + + // Defines the version of the instrumentation scope. + // An empty instrumentation scope version means the version is unknown. + string version = 2; + + // Additional attributes that describe the scope. [Optional]. 
+ // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated KeyValue attributes = 3; + + // The number of attributes that were discarded. Attributes + // can be discarded because their keys are too long or because there are too many + // attributes. If this value is 0, then no attributes were dropped. + uint32 dropped_attributes_count = 4; +} + +// A reference to an Entity. +// Entity represents an object of interest associated with produced telemetry: e.g spans, metrics, profiles, or logs. +// +// Status: [Development] +message EntityRef { + // The Schema URL, if known. This is the identifier of the Schema that the entity data + // is recorded in. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // + // This schema_url applies to the data in this message and to the Resource attributes + // referenced by id_keys and description_keys. + // TODO: discuss if we are happy with this somewhat complicated definition of what + // the schema_url applies to. + // + // This field obsoletes the schema_url field in ResourceMetrics/ResourceSpans/ResourceLogs. + string schema_url = 1; + + // Defines the type of the entity. MUST not change during the lifetime of the entity. + // For example: "service" or "host". This field is required and MUST not be empty + // for valid entities. + string type = 2; + + // Attribute Keys that identify the entity. + // MUST not change during the lifetime of the entity. The Id must contain at least one attribute. + // These keys MUST exist in the containing {message}.attributes. + repeated string id_keys = 3; + + // Descriptive (non-identifying) attribute keys of the entity. + // MAY change over the lifetime of the entity. MAY be empty. + // These attribute keys are not part of entity's identity. 
+ // These keys MUST exist in the containing {message}.attributes. + repeated string description_keys = 4; +} \ No newline at end of file diff --git a/third-party/opentelemetry-proto/opentelemetry/proto/metrics/v1/metrics.proto b/third-party/opentelemetry-proto/opentelemetry/proto/metrics/v1/metrics.proto new file mode 100644 index 00000000000..a6fab4ee750 --- /dev/null +++ b/third-party/opentelemetry-proto/opentelemetry/proto/metrics/v1/metrics.proto @@ -0,0 +1,735 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.metrics.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.metrics.v1"; +option java_outer_classname = "MetricsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/metrics/v1"; + +// MetricsData represents the metrics data that can be stored in a persistent +// storage, OR can be embedded by other protocols that transfer OTLP metrics +// data but do not implement the OTLP protocol. 
+// +// MetricsData +// └─── ResourceMetrics +// ├── Resource +// ├── SchemaURL +// └── ScopeMetrics +// ├── Scope +// ├── SchemaURL +// └── Metric +// ├── Name +// ├── Description +// ├── Unit +// └── data +// ├── Gauge +// ├── Sum +// ├── Histogram +// ├── ExponentialHistogram +// └── Summary +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message MetricsData { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceMetrics resource_metrics = 1; +} + +// A collection of ScopeMetrics from a Resource. +message ResourceMetrics { + reserved 1000; + + // The resource for the metrics in this message. + // If this field is not set then no resource info is known. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of metrics that originate from a resource. + repeated ScopeMetrics scope_metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_metrics" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Metrics produced by an Scope. 
+message ScopeMetrics { + // The instrumentation scope information for the metrics in this message. + // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of metrics that originate from an instrumentation library. + repeated Metric metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the metric data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "scope" field and all metrics in the + // "metrics" field. + string schema_url = 3; +} + +// Defines a Metric which has one or more timeseries. The following is a +// brief summary of the Metric data model. For more details, see: +// +// https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md +// +// The data model and relation between entities is shown in the +// diagram below. Here, "DataPoint" is the term used to refer to any +// one of the specific data point value types, and "points" is the term used +// to refer to any one of the lists of points contained in the Metric. +// +// - Metric is composed of a metadata and data. +// - Metadata part contains a name, description, unit. +// - Data is one of the possible types (Sum, Gauge, Histogram, Summary). +// - DataPoint contains timestamps, attributes, and one of the possible value type +// fields. +// +// Metric +// +------------+ +// |name | +// |description | +// |unit | +------------------------------------+ +// |data |---> |Gauge, Sum, Histogram, Summary, ... | +// +------------+ +------------------------------------+ +// +// Data [One of Gauge, Sum, Histogram, Summary, ...] +// +-----------+ +// |... 
| // Metadata about the Data. +// |points |--+ +// +-----------+ | +// | +---------------------------+ +// | |DataPoint 1 | +// v |+------+------+ +------+ | +// +-----+ ||label |label |...|label | | +// | 1 |-->||value1|value2|...|valueN| | +// +-----+ |+------+------+ +------+ | +// | . | |+-----+ | +// | . | ||value| | +// | . | |+-----+ | +// | . | +---------------------------+ +// | . | . +// | . | . +// | . | . +// | . | +---------------------------+ +// | . | |DataPoint M | +// +-----+ |+------+------+ +------+ | +// | M |-->||label |label |...|label | | +// +-----+ ||value1|value2|...|valueN| | +// |+------+------+ +------+ | +// |+-----+ | +// ||value| | +// |+-----+ | +// +---------------------------+ +// +// Each distinct type of DataPoint represents the output of a specific +// aggregation function, the result of applying the DataPoint's +// associated function of to one or more measurements. +// +// All DataPoint types have three common fields: +// - Attributes includes key-value pairs associated with the data point +// - TimeUnixNano is required, set to the end time of the aggregation +// - StartTimeUnixNano is optional, but strongly encouraged for DataPoints +// having an AggregationTemporality field, as discussed below. +// +// Both TimeUnixNano and StartTimeUnixNano values are expressed as +// UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. +// +// # TimeUnixNano +// +// This field is required, having consistent interpretation across +// DataPoint types. TimeUnixNano is the moment corresponding to when +// the data point's aggregate value was captured. +// +// Data points with the 0 value for TimeUnixNano SHOULD be rejected +// by consumers. +// +// # StartTimeUnixNano +// +// StartTimeUnixNano in general allows detecting when a sequence of +// observations is unbroken. 
This field indicates to consumers the +// start time for points with cumulative and delta +// AggregationTemporality, and it should be included whenever possible +// to support correct rate calculation. Although it may be omitted +// when the start time is truly unknown, setting StartTimeUnixNano is +// strongly encouraged. +message Metric { + reserved 4, 6, 8; + + // The name of the metric. + string name = 1; + + // A description of the metric, which can be used in documentation. + string description = 2; + + // The unit in which the metric value is reported. Follows the format + // described by https://unitsofmeasure.org/ucum.html. + string unit = 3; + + // Data determines the aggregation type (if any) of the metric, what is the + // reported value type for the data points, as well as the relatationship to + // the time interval over which they are reported. + oneof data { + Gauge gauge = 5; + Sum sum = 7; + Histogram histogram = 9; + ExponentialHistogram exponential_histogram = 10; + Summary summary = 11; + } + + // Additional metadata attributes that describe the metric. [Optional]. + // Attributes are non-identifying. + // Consumers SHOULD NOT need to be aware of these attributes. + // These attributes MAY be used to encode information allowing + // for lossless roundtrip translation to / from another data model. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated opentelemetry.proto.common.v1.KeyValue metadata = 12; +} + +// Gauge represents the type of a scalar metric that always exports the +// "current value" for every data point. It should be used for an "unknown" +// aggregation. +// +// A Gauge does not support different aggregation temporalities. Given the +// aggregation is unknown, points cannot be combined using the same +// aggregation, regardless of aggregation temporalities. 
Therefore, +// AggregationTemporality is not included. Consequently, this also means +// "StartTimeUnixNano" is ignored for all data points. +message Gauge { + // The time series data points. + // Note: Multiple time series may be included (same timestamp, different attributes). + repeated NumberDataPoint data_points = 1; +} + +// Sum represents the type of a scalar metric that is calculated as a sum of all +// reported measurements over a time interval. +message Sum { + // The time series data points. + // Note: Multiple time series may be included (same timestamp, different attributes). + repeated NumberDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; + + // Represents whether the sum is monotonic. + bool is_monotonic = 3; +} + +// Histogram represents the type of a metric that is calculated by aggregating +// as a Histogram of all reported measurements over a time interval. +message Histogram { + // The time series data points. + // Note: Multiple time series may be included (same timestamp, different attributes). + repeated HistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// ExponentialHistogram represents the type of a metric that is calculated by aggregating +// as a ExponentialHistogram of all reported double measurements over a time interval. +message ExponentialHistogram { + // The time series data points. + // Note: Multiple time series may be included (same timestamp, different attributes). 
+ repeated ExponentialHistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// Summary metric data are used to convey quantile summaries, +// a Prometheus (see: https://prometheus.io/docs/concepts/metric_types/#summary) +// and OpenMetrics (see: https://github.com/prometheus/OpenMetrics/blob/4dbf6075567ab43296eed941037c12951faafb92/protos/prometheus.proto#L45) +// data type. These data points cannot always be merged in a meaningful way. +// While they can be useful in some applications, histogram data points are +// recommended for new applications. +// Summary metrics do not have an aggregation temporality field. This is +// because the count and sum fields of a SummaryDataPoint are assumed to be +// cumulative values. +message Summary { + // The time series data points. + // Note: Multiple time series may be included (same timestamp, different attributes). + repeated SummaryDataPoint data_points = 1; +} + +// AggregationTemporality defines how a metric aggregator reports aggregated +// values. It describes how those values relate to the time interval over +// which they are aggregated. +enum AggregationTemporality { + // UNSPECIFIED is the default AggregationTemporality, it MUST not be used. + AGGREGATION_TEMPORALITY_UNSPECIFIED = 0; + + // DELTA is an AggregationTemporality for a metric aggregator which reports + // changes since last report time. Successive metrics contain aggregation of + // values from continuous and non-overlapping intervals. + // + // The values for a DELTA metric are based only on the time interval + // associated with one measurement cycle. There is no dependency on + // previous measurements like is the case for CUMULATIVE metrics. 
+ // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // DELTA metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0+1 to + // t_0+2 with a value of 2. + AGGREGATION_TEMPORALITY_DELTA = 1; + + // CUMULATIVE is an AggregationTemporality for a metric aggregator which + // reports changes since a fixed start time. This means that current values + // of a CUMULATIVE metric depend on all previous measurements since the + // start time. Because of this, the sender is required to retain this state + // in some form. If this state is lost or invalidated, the CUMULATIVE metric + // values MUST be reset and a new fixed start time following the last + // reported measurement time sent MUST be used. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // CUMULATIVE metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. 
A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+2 with a value of 5. + // 9. The system experiences a fault and loses state. + // 10. The system recovers and resumes receiving at time=t_1. + // 11. A request is received, the system measures 1 request. + // 12. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_1 to + // t_0+1 with a value of 1. + // + // Note: Even though, when reporting changes since last report time, using + // CUMULATIVE is valid, it is not recommended. This may cause problems for + // systems that do not use start_time to determine when the aggregation + // value was reset (e.g. Prometheus). + AGGREGATION_TEMPORALITY_CUMULATIVE = 2; +} + +// DataPointFlags is defined as a protobuf 'uint32' type and is to be used as a +// bit-field representing 32 distinct boolean flags. Each flag defined in this +// enum is a bit-mask. To test the presence of a single flag in the flags of +// a data point, for example, use an expression like: +// +// (point.flags & DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK) == DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK +// +enum DataPointFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + DATA_POINT_FLAGS_DO_NOT_USE = 0; + + // This DataPoint is valid but has no recorded value. This value + // SHOULD be used to reflect explicitly missing data in a series, as + // for an equivalent to the Prometheus "staleness marker". + DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK = 1; + + // Bits 2-31 are reserved for future use. 
+} + +// NumberDataPoint is a single data point in a timeseries that describes the +// time-varying scalar value of a metric. +message NumberDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // The value itself. A point is considered invalid when one of the recognized + // value fields is not present inside this oneof. + oneof value { + double as_double = 4; + sfixed64 as_int = 6; + } + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 5; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// HistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Histogram. A Histogram contains summary statistics +// for a population of values, it may optionally contain the distribution of +// those values across a set of buckets. +// +// If the histogram contains the distribution of values, then both +// "explicit_bounds" and "bucket counts" fields must be defined. 
+// If the histogram does not contain the distribution of values, then both +// "explicit_bounds" and "bucket_counts" must be omitted and only "count" and +// "sum" are known. +message HistogramDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated opentelemetry.proto.common.v1.KeyValue attributes = 9; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. This + // value must be equal to the sum of the "count" fields in buckets if a + // histogram is provided. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // bucket_counts is an optional field contains the count values of histogram + // for each bucket. 
+ // + // The sum of the bucket_counts must equal the value in the count field. + // + // The number of elements in bucket_counts array must be by one greater than + // the number of elements in explicit_bounds array. The exception to this rule + // is when the length of bucket_counts is 0, then the length of explicit_bounds + // must also be 0. + repeated fixed64 bucket_counts = 6; + + // explicit_bounds specifies buckets with explicitly defined bounds for values. + // + // The boundaries for bucket at index i are: + // + // (-infinity, explicit_bounds[i]] for i == 0 + // (explicit_bounds[i-1], explicit_bounds[i]] for 0 < i < size(explicit_bounds) + // (explicit_bounds[i-1], +infinity) for i == size(explicit_bounds) + // + // The values in the explicit_bounds array must be strictly increasing. + // + // Histogram buckets are inclusive of their upper boundary, except the last + // bucket where the boundary is at infinity. This format is intentionally + // compatible with the OpenMetrics histogram definition. + // + // If bucket_counts length is 0 then explicit_bounds length must also be 0, + // otherwise the data point is invalid. + repeated double explicit_bounds = 7; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 8; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 10; + + // min is the minimum value over (start_time, end_time]. + optional double min = 11; + + // max is the maximum value over (start_time, end_time]. + optional double max = 12; +} + +// ExponentialHistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a ExponentialHistogram of double values. A ExponentialHistogram contains +// summary statistics for a population of values, it may optionally contain the +// distribution of those values across a set of buckets. 
+// +message ExponentialHistogramDataPoint { + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // The number of values in the population. Must be + // non-negative. This value must be equal to the sum of the "bucket_counts" + // values in the positive and negative Buckets plus the "zero_count" field. + fixed64 count = 4; + + // The sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // scale describes the resolution of the histogram. Boundaries are + // located at powers of the base, where: + // + // base = (2^(2^-scale)) + // + // The histogram bucket identified by `index`, a signed integer, + // contains values that are greater than (base^index) and + // less than or equal to (base^(index+1)). 
+ // + // The positive and negative ranges of the histogram are expressed + // separately. Negative values are mapped by their absolute value + // into the negative range using the same scale as the positive range. + // + // scale is not restricted by the protocol, as the permissible + // values depend on the range of the data. + sint32 scale = 6; + + // The count of values that are either exactly zero or + // within the region considered zero by the instrumentation at the + // tolerated degree of precision. This bucket stores values that + // cannot be expressed using the standard exponential formula as + // well as values that have been rounded to zero. + // + // Implementations MAY consider the zero bucket to have probability + // mass equal to (zero_count / count). + fixed64 zero_count = 7; + + // positive carries the positive range of exponential bucket counts. + Buckets positive = 8; + + // negative carries the negative range of exponential bucket counts. + Buckets negative = 9; + + // Buckets are a set of bucket counts, encoded in a contiguous array + // of counts. + message Buckets { + // The bucket index of the first entry in the bucket_counts array. + // + // Note: This uses a varint encoding as a simple form of compression. + sint32 offset = 1; + + // An array of count values, where bucket_counts[i] carries + // the count of the bucket at index (offset+i). bucket_counts[i] is the count + // of values greater than base^(offset+i) and less than or equal to + // base^(offset+i+1). + // + // Note: By contrast, the explicit HistogramDataPoint uses + // fixed64. This field is expected to have many buckets, + // especially zeros, so uint64 has been selected to ensure + // varint encoding. + repeated uint64 bucket_counts = 2; + } + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. 
+ uint32 flags = 10; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 11; + + // The minimum value over (start_time, end_time]. + optional double min = 12; + + // The maximum value over (start_time, end_time]. + optional double max = 13; + + // ZeroThreshold may be optionally set to convey the width of the zero + // region. Where the zero region is defined as the closed interval + // [-ZeroThreshold, ZeroThreshold]. + // When ZeroThreshold is 0, zero count bucket stores values that cannot be + // expressed using the standard exponential formula as well as values that + // have been rounded to zero. + double zero_threshold = 14; +} + +// SummaryDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Summary metric. The count and sum fields represent +// cumulative values. +message SummaryDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. + fixed64 count = 4; + + // sum of the values in the population. 
If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#summary + double sum = 5; + + // Represents the value at a given quantile of a distribution. + // + // To record Min and Max values following conventions are used: + // - The 1.0 quantile is equivalent to the maximum value observed. + // - The 0.0 quantile is equivalent to the minimum value observed. + // + // See the following issue for more context: + // https://github.com/open-telemetry/opentelemetry-proto/issues/125 + message ValueAtQuantile { + // The quantile of a distribution. Must be in the interval + // [0.0, 1.0]. + double quantile = 1; + + // The value at the given quantile of a distribution. + // + // Quantile values must NOT be negative. + double value = 2; + } + + // (Optional) list of values at different quantiles of the distribution calculated + // from the current snapshot. The quantiles must be strictly increasing. + repeated ValueAtQuantile quantile_values = 6; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// A representation of an exemplar, which is a sample input measurement. +// Exemplars also hold information about the environment when the measurement +// was recorded, for example the span and trace ID of the active span when the +// exemplar was recorded. +message Exemplar { + reserved 1; + + // The set of key/value pairs that were filtered out by the aggregator, but + // recorded alongside the original measurement. 
Only key/value pairs that were + // filtered out by the aggregator should be included + repeated opentelemetry.proto.common.v1.KeyValue filtered_attributes = 7; + + // time_unix_nano is the exact time when this exemplar was recorded + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 2; + + // The value of the measurement that was recorded. An exemplar is + // considered invalid when one of the recognized value fields is not present + // inside this oneof. + oneof value { + double as_double = 3; + sfixed64 as_int = 6; + } + + // (Optional) Span ID of the exemplar trace. + // span_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes span_id = 4; + + // (Optional) Trace ID of the exemplar trace. + // trace_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes trace_id = 5; +} diff --git a/third-party/opentelemetry-proto/opentelemetry/proto/resource/v1/resource.proto b/third-party/opentelemetry-proto/opentelemetry/proto/resource/v1/resource.proto new file mode 100644 index 00000000000..42c5913cfae --- /dev/null +++ b/third-party/opentelemetry-proto/opentelemetry/proto/resource/v1/resource.proto @@ -0,0 +1,45 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package opentelemetry.proto.resource.v1; + +import "opentelemetry/proto/common/v1/common.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Resource.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.resource.v1"; +option java_outer_classname = "ResourceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/resource/v1"; + +// Resource information. +message Resource { + // Set of attributes that describe the resource. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + // The behavior of software that receives duplicated keys can be unpredictable. + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // The number of dropped attributes. If the value is 0, then + // no attributes were dropped. + uint32 dropped_attributes_count = 2; + + // Set of entities that participate in this Resource. + // + // Note: keys in the references MUST exist in attributes of this message. + // + // Status: [Development] + repeated opentelemetry.proto.common.v1.EntityRef entity_refs = 3; +} From 374cc6e282d9fe3830ad2e8cee7576143d140fd4 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 24 Mar 2026 12:03:22 +0100 Subject: [PATCH 02/20] Cache Icinga DB env_id in `Application` class as well So that other components can use it without having to import any Icinga DB related header files, but only the base library. 
--- lib/base/application-environment.cpp | 28 ++++++++++++++++++++++++++++ lib/base/application.hpp | 4 ++++ lib/icingadb/icingadb.cpp | 1 + 3 files changed, 33 insertions(+) diff --git a/lib/base/application-environment.cpp b/lib/base/application-environment.cpp index b310d7247aa..245dc936f27 100644 --- a/lib/base/application-environment.cpp +++ b/lib/base/application-environment.cpp @@ -6,6 +6,8 @@ using namespace icinga; +AtomicOrLocked Application::m_EnvironmentId; + String Application::GetAppEnvironment() { Value defaultValue = Empty; @@ -16,3 +18,29 @@ void Application::SetAppEnvironment(const String& name) { ScriptGlobal::Set("Environment", name); } + +/** + * Get the cluster environment ID set by IcingaDB. + * + * This method returns the cluster environment ID generated by the IcingaDB component (if enabled). + * The environment ID is a unique identifier used to distinguish between different Icinga 2 clusters + * in a multi-cluster setup. It is typically set by IcingaDB when it starts up and can be used by other + * components (e.g., for telemetry) to correlate data across clusters. If IcingaDB is not enabled or has + * not yet set the environment ID, this method will return an empty string. + * + * @return The cluster environment ID set by IcingaDB, or an empty string if not set. + */ +String Application::GetEnvironmentId() +{ + return m_EnvironmentId.load(); +} + +/** + * Set the cluster environment ID. + * + * @param envID The cluster environment ID to set, typically generated by IcingaDB. 
+ */ +void Application::SetEnvironmentId(const String& envID) +{ + m_EnvironmentId.store(envID); +} diff --git a/lib/base/application.hpp b/lib/base/application.hpp index f9fdecc3c14..feec84488c4 100644 --- a/lib/base/application.hpp +++ b/lib/base/application.hpp @@ -96,6 +96,8 @@ class Application : public ObjectImpl { static String GetAppEnvironment(); static void SetAppEnvironment(const String& name); + static String GetEnvironmentId(); + static void SetEnvironmentId(const String& envID); static double GetStartTime(); static void SetStartTime(double ts); @@ -130,6 +132,8 @@ class Application : public ObjectImpl { static pid_t m_ReloadProcess; /**< The PID of a subprocess doing a reload, only valid when l_Restarting==true */ static bool m_RequestReopenLogs; /**< Whether we should re-open log files. */ + static AtomicOrLocked m_EnvironmentId; /**< The cluster environment ID set by IcingaDB. */ + #ifndef _WIN32 static pid_t m_UmbrellaProcess; /**< The PID of the Icinga umbrella process */ #endif /* _WIN32 */ diff --git a/lib/icingadb/icingadb.cpp b/lib/icingadb/icingadb.cpp index aa63ee27d5b..dcd0c524b00 100644 --- a/lib/icingadb/icingadb.cpp +++ b/lib/icingadb/icingadb.cpp @@ -56,6 +56,7 @@ void IcingaDB::Validate(int types, const ValidationUtils& utils) try { InitEnvironmentId(); + Application::SetEnvironmentId(m_EnvironmentId); } catch (const std::exception& e) { BOOST_THROW_EXCEPTION(ValidationError(this, std::vector(), e.what())); } From 415140bc3671326dd817407dc4c16a4f972c2f2a Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 13 Jan 2026 14:04:01 +0100 Subject: [PATCH 03/20] Add common `OTel` type/lib --- CMakeLists.txt | 12 + icinga-app/CMakeLists.txt | 4 + lib/CMakeLists.txt | 4 + lib/otel/CMakeLists.txt | 43 +++ lib/otel/otel.cpp | 649 +++++++++++++++++++++++++++++++++++++ lib/otel/otel.hpp | 190 +++++++++++ lib/remote/httpmessage.cpp | 8 + lib/remote/httpmessage.hpp | 15 + test/CMakeLists.txt | 4 + 9 files changed, 929 insertions(+) create mode 
100644 lib/otel/CMakeLists.txt create mode 100644 lib/otel/otel.cpp create mode 100644 lib/otel/otel.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ecdba05413..ed657d615bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,7 @@ option(ICINGA2_WITH_LIVESTATUS "Build the Livestatus module" ${ICINGA2_MASTER}) option(ICINGA2_WITH_NOTIFICATION "Build the notification module" ON) option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ${ICINGA2_MASTER}) option(ICINGA2_WITH_ICINGADB "Build the IcingaDB module" ${ICINGA2_MASTER}) +option(ICINGA2_WITH_OPENTELEMETRY "Build the OpenTelemetry integration module" ${ICINGA2_MASTER}) option (USE_SYSTEMD "Configure icinga as native systemd service instead of a SysV initscript" OFF) @@ -207,6 +208,17 @@ set(HAVE_EDITLINE "${EDITLINE_FOUND}") find_package(Termcap) set(HAVE_TERMCAP "${TERMCAP_FOUND}") +if(ICINGA2_WITH_OPENTELEMETRY) + # Newer Protobuf versions provide a CMake config package that we should prefer, since it implicitly + # links against all its dependencies (like absl, etc.) that would otherwise need to be linked manually. + # Thus, first try to find Protobuf in config mode and only fall back to module mode if that fails. 
+	find_package(Protobuf CONFIG)
+	if(NOT Protobuf_FOUND)
+		find_package(Protobuf REQUIRED)
+	endif()
+	list(APPEND base_DEPS protobuf::libprotobuf-lite)
+endif()
+
 include_directories(
   ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/lib
   ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/lib
diff --git a/icinga-app/CMakeLists.txt b/icinga-app/CMakeLists.txt
index a9358939548..88ad8bfdc2e 100644
--- a/icinga-app/CMakeLists.txt
+++ b/icinga-app/CMakeLists.txt
@@ -50,6 +50,10 @@ if(ICINGA2_WITH_NOTIFICATION)
 	list(APPEND icinga_app_SOURCES $<TARGET_OBJECTS:notification>)
 endif()
 
+if(ICINGA2_WITH_OPENTELEMETRY)
+	list(APPEND icinga_app_SOURCES $<TARGET_OBJECTS:otel>)
+endif()
+
 if(ICINGA2_WITH_PERFDATA)
 	list(APPEND icinga_app_SOURCES $<TARGET_OBJECTS:perfdata>)
 endif()
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 2eb3d18324a..ebb0ce404a5 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -50,6 +50,10 @@ if(ICINGA2_WITH_NOTIFICATION)
 	add_subdirectory(notification)
 endif()
 
+if(ICINGA2_WITH_OPENTELEMETRY)
+	add_subdirectory(otel)
+endif()
+
 if(ICINGA2_WITH_PERFDATA)
 	add_subdirectory(perfdata)
 endif()
diff --git a/lib/otel/CMakeLists.txt b/lib/otel/CMakeLists.txt
new file mode 100644
index 00000000000..dca82a91d86
--- /dev/null
+++ b/lib/otel/CMakeLists.txt
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: 2026 Icinga GmbH
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+set(ICINGA2_OPENTELEMETRY_PROTOS_DIR "${icinga2_SOURCE_DIR}/third-party/opentelemetry-proto")
+protobuf_generate(
+	LANGUAGE cpp
+	# According to the Protobuf docs[^1], the Protobuf compiler generates with the "LITE_RUNTIME" option much
+	# smaller code than the default optimize_for=SPEED option, which includes code for reflection, descriptors,
+	# and other features not needed by any part of the Icinga 2 OpenTelemetry integration. Thus, we use the "lite"
+	# option to generate code that only depends on the libprotobuf-lite instead of the full libprotobuf library.
+ # + # The only downside of using the lite runtime is that we won't be able to use any debugging capabilities + # provided by the full Protobuf runtime (like the DebugString() method on messages for easy printing, + # which heavily relies on reflection). + # + # [^1]: https://protobuf.dev/programming-guides/proto3/#options + PLUGIN_OPTIONS lite + OUT_VAR otel_PROTO_SRCS + IMPORT_DIRS "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}" + PROTOS + "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/collector/metrics/v1/metrics_service.proto" + "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/common/v1/common.proto" + "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/metrics/v1/metrics.proto" + "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/resource/v1/resource.proto" +) + +set(otel_SOURCES + otel.cpp otel.hpp + ${otel_PROTO_SRCS} +) + +add_library(otel OBJECT ${otel_SOURCES}) +add_dependencies(otel base remote) +target_include_directories(otel + SYSTEM PUBLIC + ${Protobuf_INCLUDE_DIRS} + ${CMAKE_CURRENT_BINARY_DIR} +) + +set_target_properties( + otel PROPERTIES + FOLDER Lib +) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp new file mode 100644 index 00000000000..7d9c55765e0 --- /dev/null +++ b/lib/otel/otel.cpp @@ -0,0 +1,649 @@ +// SPDX-FileCopyrightText: 2026 Icinga GmbH +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "otel/otel.hpp" +#include "base/application.hpp" +#include "base/defer.hpp" +#include "base/tcpsocket.hpp" +#include "base/tlsutility.hpp" +#include +#include +#include +#include + +using namespace icinga; + +namespace http = boost::beast::http; +namespace v1_metrics = opentelemetry::proto::metrics::v1; + +// The max buffer size used to batch Protobuf writes to Asio streams. +static constexpr std::size_t l_BufferSize = 64UL * 1024; +// The OpenTelemetry schema convention URL used in the exported metrics. 
+// See https://opentelemetry.io/docs/specs/semconv/ +static constexpr std::string_view l_OTelSchemaConv = "https://opentelemetry.io/schemas/1.39.0"; + +template std::size_t OTel::Record(Gauge&, int64_t, double, double, AttrsMap); +template std::size_t OTel::Record(Gauge&, double, double, double, AttrsMap); +template void OTel::SetAttribute(Attribute&, std::string_view&&, String&&); +template void OTel::SetAttribute(Attribute&, String&&, Value&); + +/** + * Calculate the exponential backoff duration for retrying failed exports or reconnections. + * + * This method calculates the backoff duration based on the number of retry attempts using an exponential + * backoff strategy as per OTel specifications. The backoff duration starts at a minimum value and doubles + * with each attempt, up to a maximum cap (30s). This helps to avoid overwhelming the OpenTelemetry backend + * with rapid retry attempts in case of transient errors. + * + * @param attempt The current retry attempt number (starting from 1). + * + * @return The calculated backoff duration in milliseconds. + */ +static constexpr std::chrono::milliseconds Backoff(uint64_t attempt) +{ + using namespace std::chrono; + + constexpr milliseconds MaxBackoffMs = seconds(30); + constexpr milliseconds MinBackoffMs = milliseconds(100); + + // 2^attempt may overflow, so we cap it to a safe value within the 64-bit range, + // which is sufficient to reach MaxBackoffMs from MinBackoffMs. 
+ constexpr uint64_t maxSafeAttempt = 16; // 2^16 * 100ms = 6553.6s > 30s + auto exponential = MinBackoffMs * (1ULL << std::min(attempt, maxSafeAttempt)); + if (exponential >= MaxBackoffMs) { + return MaxBackoffMs; + } + return duration_cast(exponential); +} + +OTel::OTel(OTelConnInfo& connInfo): OTel{connInfo, IoEngine::Get().GetIoContext()} +{ +} + +OTel::OTel(OTelConnInfo& connInfo, boost::asio::io_context& io) + : m_ConnInfo{std::move(connInfo)}, + m_Strand{io}, + m_Export{io}, + m_RetryExportAndConnTimer{io}, + m_Exporting{false}, + m_Stopped{false} +{ + if (m_ConnInfo.EnableTls) { + m_TlsContext = MakeAsioSslContext(m_ConnInfo.TlsCrt, m_ConnInfo.TlsKey, m_ConnInfo.TlsCaCrt); + } +} + +void OTel::Start() +{ + if (m_Stopped.exchange(false)) { + ResetExporting(true); + } + + IoEngine::SpawnCoroutine(m_Strand, [this, keepAlive = ConstPtr(this)](boost::asio::yield_context yc) { + ExportLoop(yc); + }); +} + +/** + * Stop the OTel exporter and disconnect from the OpenTelemetry backend. + * + * This method blocks until the exporter has fully stopped and disconnected from the backend. + * It cancels any ongoing export operations and clears all its internal state, so that it can be + * safely restarted later if needed. 
+ */ +void OTel::Stop() +{ + if (m_Stopped.exchange(true)) { + return; + } + + std::promise promise; + IoEngine::SpawnCoroutine(m_Strand, [this, &promise, keepAlive = ConstPtr(this)](boost::asio::yield_context& yc) { + m_Export.Set(); + m_RetryExportAndConnTimer.cancel(); + + if (!m_Stream) { + promise.set_value(); + return; + } + + std::visit([this, &yc](auto& stream) { + { + Timeout writerTimeout(m_Strand, boost::posix_time::seconds(5), [&stream] { + boost::system::error_code ec; + stream->lowest_layer().cancel(ec); + }); + m_Export.WaitForClear(yc); + } + + using StreamType = std::decay_t; + if constexpr (std::is_same_v::Ptr>) { + stream->GracefulDisconnect(m_Strand, yc); + } else { + static_assert(std::is_same_v::Ptr>, "Unknown stream type"); + boost::system::error_code ec; + stream->lowest_layer().shutdown(AsioTcpStream::lowest_layer_type::shutdown_both, ec); + stream->lowest_layer().close(ec); + } + }, *m_Stream); + + Log(LogInformation, "OTelExporter") + << "Disconnected from OpenTelemetry backend."; + + m_Stream.reset(); + m_Request.reset(); + promise.set_value(); + }); + promise.get_future().wait(); +} + +/** + * Export the given OTel metrics request to the OpenTelemetry backend. + * + * This method initiates the export of the provided OTel metrics request to the configured + * OpenTelemetry backend. If an export is already in progress, it waits for the previous + * export to complete before proceeding with the new export request (blocking the caller). + * + * @param request The OTel metrics request to export. + */ +void OTel::Export(std::unique_ptr&& request) +{ + std::unique_lock lock(m_Mutex); + if (m_Exporting) { + Log(LogWarning, "OTelExporter") + << "Received export request while previous export is still in progress. 
Waiting for it to complete."; + + m_ExportCV.wait(lock, [this] { return m_Stopped || !m_Exporting; }); + if (m_Stopped) { + return; + } + } + m_Exporting = true; + lock.unlock(); + + // Access to m_Request is serialized via m_Strand, so we must post the actual export operation to it. + boost::asio::post(m_Strand, [this, keepAlive = ConstPtr(this), request = std::move(request)]() mutable { + m_Request = std::move(request); + m_Export.Set(); + }); +} + +/** + * Populate the standard OTel resource attributes in the given ResourceMetrics Protobuf object. + * + * This method populates the standard OTel resource attributes as per OTel specifications[^1][^2] + * into the provided ResourceMetrics Protobuf object. It sets attributes such as service name, + * instance ID, version, and telemetry SDK information. + * + * @param rm The ResourceMetrics Protobuf object to populate. + * + * [^1]: https://opentelemetry.io/docs/specs/semconv/resource/#telemetry-sdk + * [^2]: https://opentelemetry.io/docs/specs/semconv/resource/service/ + */ +void OTel::PopulateResourceAttrs(const std::unique_ptr& rm) +{ + using namespace std::string_view_literals; + + rm->set_schema_url(l_OTelSchemaConv.data()); + auto* resource = rm->mutable_resource(); + + auto* attr = resource->add_attributes(); + SetAttribute(*attr, "service.name"sv, "Icinga 2"sv); + + auto instanceID = Application::GetEnvironmentId(); + if (instanceID.IsEmpty()) { + instanceID = "unknown"; + } + attr = resource->add_attributes(); + SetAttribute(*attr, "service.instance.id"sv, std::move(instanceID)); + + attr = resource->add_attributes(); + SetAttribute(*attr, "service.version"sv, Application::GetAppVersion()); + + attr = resource->add_attributes(); + // We don't actually use OTel SDKs here, but to comply with OTel specs, we need to provide these attributes anyway. 
+ SetAttribute(*attr, "telemetry.sdk.language"sv, "cpp"sv); + + attr = resource->add_attributes(); + SetAttribute(*attr, "telemetry.sdk.name"sv, "Icinga 2 OTel Integration"sv); + + attr = resource->add_attributes(); + SetAttribute(*attr, "telemetry.sdk.version"sv, Application::GetAppVersion()); + + auto* ism = rm->add_scope_metrics(); + ism->set_schema_url(l_OTelSchemaConv.data()); + ism->mutable_scope()->set_name("icinga2"); + ism->mutable_scope()->set_version(Application::GetAppVersion()); +} + +/** + * Establish a connection to the OpenTelemetry backend endpoint. + * + * In case of connection failures, it retries as per OTel spec[^1] with exponential backoff until a successful + * connection is established or the exporter is stopped. Therefore, @c m_Stream is not guaranteed to be valid + * after this method returns, so the caller must check it before using it. + * + * @param yc The Boost.Asio yield context for asynchronous operations. + * + * [^1]: https://opentelemetry.io/docs/specs/otlp/#otlphttp-connection + */ +void OTel::Connect(boost::asio::yield_context& yc) +{ + Log(LogInformation, "OTelExporter") + << "Connecting to OpenTelemetry backend on host '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port << "'."; + + for (uint64_t attempt = 1; !m_Stopped; ++attempt) { + try { + boost::asio::ip::tcp::socket socket{m_Strand.context()}; + icinga::Connect(socket, m_ConnInfo.Host, std::to_string(m_ConnInfo.Port), yc); + + if (m_ConnInfo.EnableTls) { + auto tlsStream = Shared::Make(m_Strand.context(), *m_TlsContext, m_ConnInfo.Host); + tlsStream->lowest_layer() = std::move(socket); + tlsStream->next_layer().async_handshake(AsioTlsStream::next_layer_type::client, yc); + + if (m_ConnInfo.VerifyPeerCertificate && !tlsStream->next_layer().IsVerifyOK()) { + BOOST_THROW_EXCEPTION(std::runtime_error( + "TLS certificate validation failed: " + tlsStream->next_layer().GetVerifyError() + )); + } + m_Stream = std::move(tlsStream); + } else { + auto tcpStream = 
Shared::Make(m_Strand.context()); + tcpStream->lowest_layer() = std::move(socket); + m_Stream = std::move(tcpStream); + } + + Log(LogInformation, "OTelExporter") + << "Successfully connected to OpenTelemetry backend."; + return; + } catch (const std::exception& ex) { + Log(m_Stopped ? LogDebug : LogCritical, "OTelExporter") + << "Cannot connect to OpenTelemetry backend '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port + << "' (attempt #" << attempt << "): " << ex.what(); + + if (!m_Stopped) { + boost::system::error_code ec; + m_RetryExportAndConnTimer.expires_after(Backoff(attempt)); + m_RetryExportAndConnTimer.async_wait(yc[ec]); + } + } + } +} + +/** + * Main export loop for exporting OTel metrics to the configured backend. + * + * This method runs in a loop, waiting for new metrics to be available for export. In case of export failures, + * it retries the export as per OTel spec[^1] with exponential backoff until the export succeeds or the exporter + * is stopped. After a successful export, it clears the exported metrics from @c m_Request to make room for new metrics. + * + * @param yc The Asio yield context for asynchronous operations. 
+ * + * [^1]: https://opentelemetry.io/docs/specs/otlp/#retryable-response-codes + */ +void OTel::ExportLoop(boost::asio::yield_context& yc) +{ + Defer cleanup{[this] { + m_Export.Clear(); + ResetExporting(true /* notify all */); + }}; + + namespace ch = std::chrono; + + while (!m_Stopped) { + m_Export.WaitForSet(yc); + if (!m_Stream) { + Connect(yc); + } + + for (uint64_t attempt = 1; m_Stream && !m_Stopped; ++attempt) { + try { + ExportImpl(yc); + m_Request.reset(); + m_Export.Clear(); + ResetExporting(false /* notify one */); + break; + } catch (const RetryableExportError& ex) { + ch::milliseconds retryAfter; + if (auto throttle = ex.Throttle(); throttle > 0ms) { + retryAfter = throttle; + } else { + retryAfter = Backoff(attempt); + } + + Log(LogWarning, "OTelExporter") + << "Failed to export metrics to OpenTelemetry backend (attempt #" << attempt << "). Retrying in " + << retryAfter.count() << "ms."; + + boost::system::error_code ec; + m_RetryExportAndConnTimer.expires_after(retryAfter); + m_RetryExportAndConnTimer.async_wait(yc[ec]); + } catch (const std::exception& ex) { + LogSeverity severity = LogCritical; + const auto* ser{dynamic_cast(&ex)}; + // Since we don't have a proper connection health check mechanism, we assume that certain errors + // indicate a broken connection and force a reconnect in those cases. For the `end_of_stream` case, + // we downgrade the log severity to debug level since this is a normal occurrence when using an OTEL + // collector compatible backend that don't honor keep-alive connections (e.g., OpenSearch Data Prepper). + if (m_Stopped || (ser && ser->code() == http::error::end_of_stream)) { + severity = LogDebug; + } + Log{severity, "OTelExporter", DiagnosticInformation(ex, false)}; + m_Stream.reset(); // Force reconnect on next export attempt. 
+ } + } + } +} + +void OTel::ExportImpl(boost::asio::yield_context& yc) const +{ + AsioProtobufOutStream outputS{*m_Stream, m_ConnInfo, yc}; + [[maybe_unused]] auto serialized = m_Request->SerializeToZeroCopyStream(&outputS); + ASSERT(serialized); + // Must have completed chunk writing successfully, otherwise reading the response will hang forever. + VERIFY(outputS.WriterDone()); + + IncomingHttpResponse responseMsg{*m_Stream}; + responseMsg.Parse(yc); + + if (auto ct = responseMsg[http::field::content_type]; ct != "application/x-protobuf") { + if (responseMsg.result() == http::status::ok) { + // Some OpenTelemetry Collector compatible backends (e.g., Prometheus OTLP Receiver) respond with 200 OK + // but without the expected Protobuf content type. So, don't do anything here since the request succeeded. + return; + } + Log(LogWarning, "OTelExporter") + << "Unexpected Content-Type from OpenTelemetry backend '" << ct << "' (" << responseMsg.reason() << "):\n" + << responseMsg.body(); + } else if (responseMsg.result_int() >= 200 && responseMsg.result_int() <= 299) { + // We've got a valid Protobuf response, so we've to deserialize the body to check for partial success. + // See https://opentelemetry.io/docs/specs/otlp/#partial-success-1. + google::protobuf::Arena arena; + auto* response = MetricsResponse::default_instance().New(&arena); + [[maybe_unused]] auto deserialized = response->ParseFromString(responseMsg.body()); + ASSERT(deserialized); + + if (response->has_partial_success()) { + const auto& ps = response->partial_success(); + const auto& msg = ps.error_message(); + if (ps.rejected_data_points() > 0 || !msg.empty()) { + Log(LogWarning, "OTelExporter") + << "OpenTelemetry backend reported partial success: " << (msg.empty() ? 
"" : msg) + << " (" << ps.rejected_data_points() << " metric data points rejected)."; + } + } + } else if (IsRetryableExportError(responseMsg.result())) { + uint64_t throttleSeconds = 0; + if (auto throttle = responseMsg[http::field::retry_after]; !throttle.empty()) { + try { + throttleSeconds = boost::lexical_cast(throttle); + } catch (const std::exception& ex) { + Log(LogWarning, "OTelExporter") + << "Failed to parse 'Retry-After' header from OpenTelemetry backend response: " << ex.what(); + } + } + BOOST_THROW_EXCEPTION(RetryableExportError{throttleSeconds}); + } else { + Log(LogWarning, "OTelExporter") + << "OpenTelemetry backend responded with non-success and non-retryable status code " + << responseMsg.result_int() << " (" << responseMsg.reason() << ").\n" << responseMsg.body(); + } +} + +/** + * Reset the exporting state and notify waiters. + * + * This method resets the internal exporting state to indicate that no export is currently + * in progress. It then notifies either one or all waiters waiting for the export to complete, + * based on the @c notifyAll parameter. + * + * @param notifyAll If true, notifies all waiters; otherwise, notifies only one waiter. + */ +void OTel::ResetExporting(bool notifyAll) +{ + { + std::lock_guard lock(m_Mutex); + m_Exporting = false; + } + if (notifyAll) { + m_ExportCV.notify_all(); + } else { + m_ExportCV.notify_one(); + } +} + +/** + * Validate the given OTel metric name according to OTel naming conventions[^1]. + * Here's the ABNF definition for reference: + * @verbatim + * instrument-name = ALPHA 0*254 ("_" / "." / "-" / "/" / ALPHA / DIGIT) + * ALPHA = %x41-5A / %x61-7A; A-Z / a-z + * DIGIT = %x30-39 ; 0-9 + * @endverbatim + * + * @param name The metric name to validate. + * + * @throws std::invalid_argument if the metric name is invalid. 
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/otel/metrics/api/#instrument-name-syntax
+ */
+void OTel::ValidateName(std::string_view name)
+{
+	if (name.empty() || name.size() > 255) {
+		BOOST_THROW_EXCEPTION(std::invalid_argument("OTel instrument name must be between 1 and 255 characters long."));
+	}
+
+	auto isAlpha = [](char c) { return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); };
+	auto isDigit = [](char c) { return '0' <= c && c <= '9'; };
+	for (std::size_t i = 0; i < name.size(); ++i) {
+		auto c = name[i];
+		if (i == 0 && !isAlpha(c)) {
+			BOOST_THROW_EXCEPTION(std::invalid_argument("OTel instrument name must start with an alphabetic character."));
+		}
+		if (!isAlpha(c) && !isDigit(c) && c != '_' && c != '.' && c != '-' && c != '/') {
+			BOOST_THROW_EXCEPTION(std::invalid_argument(
+				"OTel instrument name contains invalid character '" + std::string(1, c) + "'."
+			));
+		}
+	}
+}
+
+/**
+ * Set the given OTel attribute key-value pair in the provided @c Attribute Protobuf object.
+ *
+ * This method sets the given key-value pair in the provided KeyValue Protobuf object according to
+ * OTel specifications[^1]. While the OTel specs[^2] allow a wider range of attribute value types, we
+ * only support the most common/scalar types (Boolean, Number (double), and String) for simplicity.
+ *
+ * @param attr The OTel attribute Protobuf object to set the value for.
+ * @param key The attribute key to set. Must not be empty.
+ * @param value The Value object containing the value to set in the attribute.
+ *
+ * @throws std::invalid_argument if key is empty or if @c Value represents an unsupported attribute value type.
+ * + * [^1]: https://opentelemetry.io/docs/specs/otel/common/#attribute + * [^2]: https://opentelemetry.io/docs/specs/otel/common/#anyvalue + */ +template +void OTel::SetAttribute(Attribute& attr, Key&& key, AttrVal&& value) +{ + if (begin(key) == end(key)) { + BOOST_THROW_EXCEPTION(std::invalid_argument("OTel attribute key must not be empty.")); + } + + if constexpr (std::is_rvalue_reference_v && std::is_same_v, String>) { + attr.set_key(std::move(key.GetData())); + } else { + attr.set_key(std::string{std::forward(key)}); + } + + constexpr bool isRvalReference = std::is_rvalue_reference_v; + if constexpr (isRvalReference && std::is_same_v, String>) { + attr.mutable_value()->set_string_value(std::move(value.GetData())); + } else if constexpr (std::is_constructible_v) { + attr.mutable_value()->set_string_value(std::string{std::forward(value)}); + } else { + switch (value.GetType()) { + case ValueBoolean: + attr.mutable_value()->set_bool_value(value.template Get()); + break; + case ValueNumber: + attr.mutable_value()->set_double_value(value.template Get()); + break; + case ValueString: + if (isRvalReference) { + attr.mutable_value()->set_string_value(std::move(value.template Get().GetData())); + } else { + attr.mutable_value()->set_string_value(value.template Get().GetData()); + } + break; + default: + BOOST_THROW_EXCEPTION(std::invalid_argument( + "OTel attribute value must be of type Boolean, Number, or String, got '" + value.GetTypeName() + "'." + )); + } + } +} + +/** + * Record a data point in the given OTel Gauge metric stream with the provided value, timestamps, and attributes. + * + * This method adds a new data point to the provided Gauge Protobuf object with the given value, start and end + * timestamps, and a set of attributes. The value can be either an int64_t or a double, depending on the type + * of the Gauge. The timestamps are expected to be in seconds and will be converted to nanoseconds as required + * by OTel specifications. 
The attributes are provided as a map of key-value pairs and will be set in the data + * point according to OTel attribute specs. + * + * @tparam T The type of the data point value, which must be either int64_t or double. + * + * @param gauge The Gauge Protobuf object to record the data point in. + * @param data The value of the data point to record. + * @param start The start timestamp of the data point in seconds. + * @param end The end timestamp of the data point in seconds. + * @param attrs A map of attribute key-value pairs to set in the data point. + * + * @return The size in bytes of the recorded data point after serialization. + * + * @throws std::invalid_argument if any attribute key is empty or has an unsupported value type. + */ +template +std::size_t OTel::Record(Gauge& gauge, T data, double start, double end, AttrsMap attrs) +{ + namespace ch = std::chrono; + + auto* dataPoint = gauge.add_data_points(); + if constexpr (std::is_same_v) { + dataPoint->set_as_double(data); + } else { + dataPoint->set_as_int(data); + } + + dataPoint->set_start_time_unix_nano( + static_cast(ch::duration_cast(ch::duration(start)).count()) + ); + dataPoint->set_time_unix_nano( + static_cast(ch::duration_cast(ch::duration(end)).count()) + ); + + for (auto it{attrs.begin()}; it != attrs.end(); /* NOPE */) { + auto* attr = dataPoint->add_attributes(); + auto node = attrs.extract(it++); + SetAttribute(*attr, node.key(), node.mapped()); + } + return dataPoint->ByteSizeLong(); +} + +/** + * Determine if the given HTTP status code represents a retryable export error as per OTel specs[^1]. + * + * @param status The HTTP status code to check. + * + * @return true if the status code indicates a retryable error; false otherwise. 
+ * + * [^1]: https://opentelemetry.io/docs/specs/otlp/#retryable-response-codes + */ +bool OTel::IsRetryableExportError(const http::status status) +{ + return status == http::status::too_many_requests + || status == http::status::bad_gateway + || status == http::status::service_unavailable + || status == http::status::gateway_timeout; +} + +AsioProtobufOutStream::AsioProtobufOutStream(const AsioTlsOrTcpStream& stream, const OTelConnInfo& connInfo, boost::asio::yield_context yc) + : m_Writer{stream}, m_YieldContext{std::move(yc)} +{ + m_Writer.method(http::verb::post); + m_Writer.target(connInfo.MetricsEndpoint); + m_Writer.set(http::field::host, connInfo.Host + ":" + std::to_string(connInfo.Port)); + m_Writer.set(http::field::content_type, "application/x-protobuf"); + if (!connInfo.BasicAuth.IsEmpty()) { + m_Writer.set(http::field::authorization, "Basic " + connInfo.BasicAuth); + } + m_Writer.StartStreaming(); +} + +bool AsioProtobufOutStream::Next(void** data, int* size) +{ + if (m_Buffered == l_BufferSize) { + Flush(); + } + // Prepare a new buffer segment that the Protobuf serializer can write into. + // The buffer size is fixed to l_BufferSize, and as seen above, we flush if the previous buffer + // segment was fully used (which is always the case on each Next call after the initial one), so + // we'll end up reusing the same memory region for each Next call because when we flush, we also + // consume the committed data, and that region becomes writable again. + auto buf = m_Writer.Prepare(l_BufferSize - m_Buffered); + *data = buf.data(); + *size = static_cast(l_BufferSize); + m_Buffered = l_BufferSize; + return true; +} + +void AsioProtobufOutStream::BackUp(int count) +{ + // Make sure we've not already finalized the HTTP body because BackUp + // is supposed to be called only after a preceding (final) Next call. 
+ ASSERT(!m_Writer.Done()); + ASSERT(static_cast(count) <= m_Buffered); + ASSERT(m_Buffered == l_BufferSize); + // If the last prepared buffer segment was not fully used, we need to adjust the buffered size, + // so that we don't commit unused memory regions with the below Flush() call. If count is zero, + // this adjustment is a no-op, and indicates that the entire buffer was used and there won't be + // any subsequent Next calls anymore (i.e., the Protobuf serialization is complete). + m_Buffered -= count; + Flush(true); +} + +int64_t AsioProtobufOutStream::ByteCount() const +{ + return m_Pos + static_cast(m_Buffered); +} + +/** + * Flush any buffered data to the underlying Asio stream. + * + * If the `finish` parameter is set to true, it indicates that no more data will + * be buffered/generated, and the HTTP body will be finalized accordingly. + * + * @param finish Whether this is the final flush operation. + */ +void AsioProtobufOutStream::Flush(bool finish) +{ + ASSERT(m_Buffered > 0 || finish); + m_Writer.Commit(m_Buffered); + m_Writer.Flush(m_YieldContext, finish); + m_Pos += static_cast(m_Buffered); + m_Buffered = 0; +} + +/** + * Check if the underlying HTTP request writer has completed writing. + * + * @return true if the writer has finished writing; false otherwise. 
+ */ +bool AsioProtobufOutStream::WriterDone() +{ + return m_Writer.Done(); +} diff --git a/lib/otel/otel.hpp b/lib/otel/otel.hpp new file mode 100644 index 00000000000..44ef0df3ebc --- /dev/null +++ b/lib/otel/otel.hpp @@ -0,0 +1,190 @@ +// SPDX-FileCopyrightText: 2026 Icinga GmbH +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "base/io-engine.hpp" +#include "base/tlsstream.hpp" +#include "base/shared.hpp" +#include "base/shared-object.hpp" +#include "base/string.hpp" +#include "remote/httpmessage.hpp" +#include "otel/opentelemetry/proto/collector/metrics/v1/metrics_service.pb.h" +#include +#include +#include +#include +#include +#include +#include + +namespace icinga +{ + +/** + * Connection parameters for connecting to an OpenTelemetry collector endpoint. + * + * @ingroup otel + */ +struct OTelConnInfo +{ + bool EnableTls{false}; + bool VerifyPeerCertificate{true}; + int Port; + String Host; + String TlsCaCrt; + String TlsCrt; + String TlsKey; + String MetricsEndpoint; + String BasicAuth; // Base64-encoded "username:password" string for basic authentication. +}; + +/** + * OTel implements the OpenTelemetry Protocol (OTLP) exporter. + * + * This class manages the connection to an OpenTelemetry collector or compatible backend and + * handles exporting (currently only metrics) in OTLP Protobuf format over HTTP. It supports + * TLS connections, basic authentication, and implements retry logic for transient errors as + * per OTel specs. + * + * @ingroup otel + */ +class OTel : public SharedObject +{ +public: + DECLARE_PTR_TYPEDEFS(OTel); + + // Protobuf request and response types for exporting metrics. + using MetricsRequest = opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceRequest; + using MetricsResponse = opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceResponse; + // Protobuf attribute type used for OTel resource and data point attributes. 
+ using Attribute = opentelemetry::proto::common::v1::KeyValue; + // Protobuf Gauge type used for representing OTel Gauge metric streams. + using Gauge = opentelemetry::proto::metrics::v1::Gauge; + + /** + * Represents a collection of OTel attributes[^1] as key-value pairs. + * + * [^1]: https://opentelemetry.io/docs/specs/otel/common/#attribute + */ + using AttrsMap = std::map; + + explicit OTel(OTelConnInfo& connInfo); + + void Start(); + void Stop(); + void Export(std::unique_ptr&& request); + + bool Exporting() const + { + std::lock_guard lock(m_Mutex); + return m_Exporting; + } + + bool Stopped() const { return m_Stopped.load(); } + + static void PopulateResourceAttrs(const std::unique_ptr& rm); + static void ValidateName(std::string_view name); + template && ( + std::is_same_v, Value> || + std::is_constructible_v + ) + >> + static void SetAttribute(Attribute& attr, Key&& key, AttrVal&& value); + static bool IsRetryableExportError(boost::beast::http::status status); + + template, int64_t> || std::is_same_v, double>> + > + [[nodiscard]] static std::size_t Record(Gauge& gauge, T data, double start, double end, AttrsMap attrs); + +private: + OTel(OTelConnInfo& connInfo, boost::asio::io_context& io); + + void Connect(boost::asio::yield_context& yc); + void ExportLoop(boost::asio::yield_context& yc); + void ExportImpl(boost::asio::yield_context& yc) const; + + void ResetExporting(bool notifyAll = false); + + const OTelConnInfo m_ConnInfo; + std::optional m_Stream; + Shared::Ptr m_TlsContext; + boost::asio::io_context::strand m_Strand; + + AsioDualEvent m_Export; // Event to signal when a new export request is available. + // Timer for scheduling retries of failed exports and reconnection attempts. + boost::asio::steady_timer m_RetryExportAndConnTimer; + + // Mutex and condition variable for synchronizing concurrent export requests. 
+ mutable std::mutex m_Mutex; + std::condition_variable m_ExportCV; + std::unique_ptr m_Request; // Current export request being processed (if any). + bool m_Exporting; // Whether an export operation is in progress. + std::atomic_bool m_Stopped; // Whether someone has requested to stop the exporter. +}; +extern template std::size_t OTel::Record(Gauge&, int64_t, double, double, AttrsMap); +extern template std::size_t OTel::Record(Gauge&, double, double, double, AttrsMap); +extern template void OTel::SetAttribute(Attribute&, std::string_view&&, String&&); +extern template void OTel::SetAttribute(Attribute&, String&&, Value&); + +/** + * A zero-copy output stream that writes directly to an Asio [TLS] stream. + * + * This class implements the @c google::protobuf::io::ZeroCopyOutputStream interface, allowing Protobuf + * serializers to write data directly to an Asio [TLS] stream without unnecessary copying of data. It + * doesn't buffer data internally, but instead writes it in chunks to the underlying stream using an HTTP + * request writer (@c HttpRequestWriter) in a Protobuf binary format. It is not safe to be reused across + * multiple export calls. + * + * @ingroup otel + */ +class AsioProtobufOutStream final : public google::protobuf::io::ZeroCopyOutputStream +{ +public: + AsioProtobufOutStream(const AsioTlsOrTcpStream& stream, const OTelConnInfo& connInfo, boost::asio::yield_context yc); + + bool Next(void** data, int* size) override; + void BackUp(int count) override; + int64_t ByteCount() const override; + + bool WriterDone(); + +private: + void Flush(bool finish = false); + + int64_t m_Pos{0}; // Monotonically increasing byte position in the stream (excluding m_Buffered bytes). + std::size_t m_Buffered{0}; // Number of uncommitted bytes currently buffered. + OutgoingHttpRequest m_Writer; + boost::asio::yield_context m_YieldContext; // Yield context for async operations. +}; + +/** + * Exception class representing a retryable export error. 
+ * + * This exception is thrown when an export attempt to an OpenTelemetry collector fails + * with a retryable error status. It carries an optional HTTP throttle[^1] duration indicating + * how long to wait before retrying the export. + * + * [^1]: https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling + * + * @ingroup otel + */ +struct RetryableExportError : std::exception +{ + explicit RetryableExportError(uint64_t throttle): m_Throttle{throttle} + { + } + + [[nodiscard]] std::chrono::seconds Throttle() const { return m_Throttle; } + const char* what() const noexcept override + { + return "OTel::RetryableExportError()"; + } + +private: + std::chrono::seconds m_Throttle; +}; + +} // namespace icinga diff --git a/lib/remote/httpmessage.cpp b/lib/remote/httpmessage.cpp index 7641e75ab23..221c8945ed4 100644 --- a/lib/remote/httpmessage.cpp +++ b/lib/remote/httpmessage.cpp @@ -93,6 +93,14 @@ void IncomingHttpMessage::ParseBody( Base::body() = std::move(m_Parser.release().body()); } +template +void IncomingHttpMessage::Parse(boost::asio::yield_context& yc) +{ + boost::beast::flat_buffer buf; + ParseHeader(buf, yc); + ParseBody(buf, yc); +} + HttpApiRequest::HttpApiRequest(Shared::Ptr stream) : IncomingHttpMessage(std::move(stream)) { } diff --git a/lib/remote/httpmessage.hpp b/lib/remote/httpmessage.hpp index 30f11442efd..80e61de392a 100644 --- a/lib/remote/httpmessage.hpp +++ b/lib/remote/httpmessage.hpp @@ -178,6 +178,14 @@ class IncomingHttpMessage : public boost::beast::http::message */ void ParseBody(boost::beast::flat_buffer& buf, boost::asio::yield_context yc); + /** + * Parse the entire message (header and body) using the internal parser object. + * + * This is just a convenience wrapper around @c ParseHeader() and @c ParseBody() that consecutively calls + * both of them. It can be used when you don't need to do anything with the header before parsing the body. 
+ */ + void Parse(boost::asio::yield_context& yc); + ParserType& Parser() { return m_Parser; } private: @@ -251,6 +259,13 @@ class OutgoingHttpMessage : public boost::beast::http::message [[nodiscard]] bool HasSerializationStarted() const { return m_SerializationStarted; } + /** + * Check if the message has been fully serialized. + * + * @return true if the message is fully serialized; false otherwise. + */ + [[nodiscard]] bool Done() { return m_Serializer.is_done(); } + /** * Sends the contents of a file. * diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d1f0906cb9a..c20e0aaf54c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,6 +68,10 @@ if(ICINGA2_WITH_NOTIFICATION) list(APPEND types_test_SOURCES $) endif() +if(ICINGA2_WITH_OPENTELEMETRY) + list(APPEND types_test_SOURCES $) +endif() + if(ICINGA2_WITH_PERFDATA) list(APPEND types_test_SOURCES $) endif() From 18e5b9aa8ab7a00d744b1676c8d81ad4d9ac587d Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 14 Jan 2026 13:35:10 +0100 Subject: [PATCH 04/20] CMake: provide newer `FindProtobuf.cmake` for old CMake version This module is copied from CMake's official module repository[^1] and contains only minor changes as outlined below. 
```diff --- a/third-party/cmake/protobuf/FindProtobuf.cmake +++ b/third-party/cmake/protobuf/FindProtobuf.cmake @@ -218,9 +218,6 @@ Example: GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc) #]=======================================================================] -cmake_policy(PUSH) -cmake_policy(SET CMP0159 NEW) # file(STRINGS) with REGEX updates CMAKE_MATCH_ - function(protobuf_generate) set(_options APPEND_PATH DESCRIPTORS) set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO PROTOC_OUT_DIR PLUGIN PLUGIN_OPTIONS DEPENDENCIES) @@ -503,7 +500,7 @@ if( Protobuf_USE_STATIC_LIBS ) endif() endif() -include(${CMAKE_CURRENT_LIST_DIR}/SelectLibraryConfigurations.cmake) +include(SelectLibraryConfigurations) # Internal function: search for normal library as well as a debug one # if the debug one is specified also include debug/optimized keywords @@ -768,7 +765,7 @@ if(Protobuf_INCLUDE_DIR) endif() endif() -include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake) +include(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(Protobuf REQUIRED_VARS Protobuf_LIBRARIES Protobuf_INCLUDE_DIR VERSION_VAR Protobuf_VERSION @@ -805,5 +802,3 @@ foreach(Camel string(TOUPPER ${Camel} UPPER) set(${UPPER} ${${Camel}}) endforeach() - -cmake_policy(POP) ``` [^1]: https://github.com/Kitware/CMake/blob/v3.31.0/Modules/FindProtobuf.cmake --- CMakeLists.txt | 6 + third-party/cmake/protobuf/FindProtobuf.cmake | 804 ++++++++++++++++++ 2 files changed, 810 insertions(+) create mode 100644 third-party/cmake/protobuf/FindProtobuf.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index ed657d615bd..96cfedab486 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,12 @@ if(ICINGA2_WITH_OPENTELEMETRY) # Thus, first try to find Protobuf in config mode and only fall back to module mode if that fails. 
find_package(Protobuf CONFIG) if(NOT Protobuf_FOUND) + # FindProtobuf.cmake in CMake versions < 3.31.0 is just broken and mixes up the Protobuf output directories + # and it doesn't even support to pass any PLUGIN_OPTIONS like "lite" to the protobuf_generate() function in + # order to generate code for the lite runtime without having to modify the proto files directly. + if(CMAKE_VERSION VERSION_LESS 3.31.0) + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/third-party/cmake/protobuf") + endif() find_package(Protobuf REQUIRED) endif() list(APPEND base_DEPS protobuf::libprotobuf-lite) diff --git a/third-party/cmake/protobuf/FindProtobuf.cmake b/third-party/cmake/protobuf/FindProtobuf.cmake new file mode 100644 index 00000000000..ce103be1c61 --- /dev/null +++ b/third-party/cmake/protobuf/FindProtobuf.cmake @@ -0,0 +1,804 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindProtobuf +------------ + +Locate and configure the Google Protocol Buffers library. + +.. versionadded:: 3.6 + Support for :command:`find_package` version checks. + +.. versionchanged:: 3.6 + All input and output variables use the ``Protobuf_`` prefix. + Variables with ``PROTOBUF_`` prefix are still supported for compatibility. + +The following variables can be set and are optional: + +``Protobuf_SRC_ROOT_FOLDER`` + When compiling with MSVC, if this cache variable is set + the protobuf-default VS project build locations + (vsprojects/Debug and vsprojects/Release + or vsprojects/x64/Debug and vsprojects/x64/Release) + will be searched for libraries and binaries. +``Protobuf_IMPORT_DIRS`` + List of additional directories to be searched for + imported .proto files. +``Protobuf_DEBUG`` + .. versionadded:: 3.6 + + Show debug messages. +``Protobuf_USE_STATIC_LIBS`` + .. 
versionadded:: 3.9 + + Set to ON to force the use of the static libraries. + Default is OFF. + +Defines the following variables: + +``Protobuf_FOUND`` + Found the Google Protocol Buffers library + (libprotobuf & header files) +``Protobuf_VERSION`` + .. versionadded:: 3.6 + + Version of package found. +``Protobuf_INCLUDE_DIRS`` + Include directories for Google Protocol Buffers +``Protobuf_LIBRARIES`` + The protobuf libraries +``Protobuf_PROTOC_LIBRARIES`` + The protoc libraries +``Protobuf_LITE_LIBRARIES`` + The protobuf-lite libraries + +.. versionadded:: 3.9 + The following :prop_tgt:`IMPORTED` targets are also defined: + +``protobuf::libprotobuf`` + The protobuf library. +``protobuf::libprotobuf-lite`` + The protobuf lite library. +``protobuf::libprotoc`` + The protoc library. +``protobuf::protoc`` + .. versionadded:: 3.10 + The protoc compiler. + +The following cache variables are also available to set or use: + +``Protobuf_LIBRARY`` + The protobuf library +``Protobuf_PROTOC_LIBRARY`` + The protoc library +``Protobuf_INCLUDE_DIR`` + The include directory for protocol buffers +``Protobuf_PROTOC_EXECUTABLE`` + The protoc compiler +``Protobuf_LIBRARY_DEBUG`` + The protobuf library (debug) +``Protobuf_PROTOC_LIBRARY_DEBUG`` + The protoc library (debug) +``Protobuf_LITE_LIBRARY`` + The protobuf lite library +``Protobuf_LITE_LIBRARY_DEBUG`` + The protobuf lite library (debug) + +Example: + +.. code-block:: cmake + + find_package(Protobuf REQUIRED) + include_directories(${Protobuf_INCLUDE_DIRS}) + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS foo.proto) + protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS EXPORT_MACRO DLL_EXPORT foo.proto) + protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS DESCRIPTORS PROTO_DESCS foo.proto) + protobuf_generate_python(PROTO_PY foo.proto) + add_executable(bar bar.cc ${PROTO_SRCS} ${PROTO_HDRS}) + target_link_libraries(bar ${Protobuf_LIBRARIES}) + +.. 
note:: + The ``protobuf_generate_cpp`` and ``protobuf_generate_python`` + functions and :command:`add_executable` or :command:`add_library` + calls only work properly within the same directory. + +.. command:: protobuf_generate_cpp + + Add custom commands to process ``.proto`` files to C++:: + + protobuf_generate_cpp ( + [DESCRIPTORS ] [EXPORT_MACRO ] [...]) + + ``SRCS`` + Variable to define with autogenerated source files + ``HDRS`` + Variable to define with autogenerated header files + ``DESCRIPTORS`` + .. versionadded:: 3.10 + Variable to define with autogenerated descriptor files, if requested. + ``EXPORT_MACRO`` + is a macro which should expand to ``__declspec(dllexport)`` or + ``__declspec(dllimport)`` depending on what is being compiled. + ``ARGN`` + ``.proto`` files + +.. command:: protobuf_generate_python + + .. versionadded:: 3.4 + + Add custom commands to process ``.proto`` files to Python:: + + protobuf_generate_python ( [...]) + + ``PY`` + Variable to define with autogenerated Python files + ``ARGN`` + ``.proto`` files + +.. command:: protobuf_generate + + .. versionadded:: 3.13 + + Automatically generate source files from ``.proto`` schema files at build time:: + + protobuf_generate ( + TARGET + [LANGUAGE ] + [OUT_VAR ] + [EXPORT_MACRO ] + [PROTOC_OUT_DIR ] + [PLUGIN ] + [PLUGIN_OPTIONS ] + [DEPENDENCIES ] + [IMPORT_DIRS ] + [GENERATE_EXTENSIONS ] + [PROTOC_OPTIONS ] + [APPEND_PATH]) + + ``APPEND_PATH`` + A flag that causes the base path of all proto schema files to be added to + ``IMPORT_DIRS``. + ``LANGUAGE`` + A single value: cpp or python. Determines what kind of source files are + being generated. Defaults to cpp. + ``OUT_VAR`` + Name of a CMake variable that will be filled with the paths to the generated + source files. + ``EXPORT_MACRO`` + Name of a macro that is applied to all generated Protobuf message classes + and extern variables. It can, for example, be used to declare DLL exports. 
+ ``PROTOC_OUT_DIR`` + Output directory of generated source files. Defaults to ``CMAKE_CURRENT_BINARY_DIR``. + ``PLUGIN`` + .. versionadded:: 3.21 + + An optional plugin executable. This could, for example, be the path to + ``grpc_cpp_plugin``. + ``PLUGIN_OPTIONS`` + .. versionadded:: 3.28 + + Additional options provided to the plugin, such as ``generate_mock_code=true`` + for the gRPC cpp plugin. + ``DEPENDENCIES`` + .. versionadded:: 3.28 + + Arguments forwarded to the ``DEPENDS`` of the underlying ``add_custom_command`` + invocation. + ``TARGET`` + CMake target that will have the generated files added as sources. + ``PROTOS`` + List of proto schema files. If omitted, then every source file ending in *proto* of ``TARGET`` will be used. + ``IMPORT_DIRS`` + A common parent directory for the schema files. For example, if the schema file is + ``proto/helloworld/helloworld.proto`` and the import directory ``proto/`` then the + generated files are ``${PROTOC_OUT_DIR}/helloworld/helloworld.pb.h`` and + ``${PROTOC_OUT_DIR}/helloworld/helloworld.pb.cc``. + ``GENERATE_EXTENSIONS`` + If LANGUAGE is omitted then this must be set to the extensions that protoc generates. + ``PROTOC_OPTIONS`` + .. versionadded:: 3.28 + + Additional arguments that are forwarded to protoc. 
+ + Example:: + + find_package(gRPC CONFIG REQUIRED) + find_package(Protobuf REQUIRED) + add_library(ProtoTest Test.proto) + target_link_libraries(ProtoTest PUBLIC gRPC::grpc++) + protobuf_generate(TARGET ProtoTest) + protobuf_generate( + TARGET ProtoTest + LANGUAGE grpc + PLUGIN protoc-gen-grpc=$ + PLUGIN_OPTIONS generate_mock_code=true + GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc) +#]=======================================================================] + +function(protobuf_generate) + set(_options APPEND_PATH DESCRIPTORS) + set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO PROTOC_OUT_DIR PLUGIN PLUGIN_OPTIONS DEPENDENCIES) + if(COMMAND target_sources) + list(APPEND _singleargs TARGET) + endif() + set(_multiargs PROTOS IMPORT_DIRS GENERATE_EXTENSIONS PROTOC_OPTIONS) + + cmake_parse_arguments(protobuf_generate "${_options}" "${_singleargs}" "${_multiargs}" "${ARGN}") + + if(NOT protobuf_generate_PROTOS AND NOT protobuf_generate_TARGET) + message(SEND_ERROR "Error: protobuf_generate called without any targets or source files") + return() + endif() + + if(NOT protobuf_generate_OUT_VAR AND NOT protobuf_generate_TARGET) + message(SEND_ERROR "Error: protobuf_generate called without a target or output variable") + return() + endif() + + if(NOT protobuf_generate_LANGUAGE) + set(protobuf_generate_LANGUAGE cpp) + endif() + string(TOLOWER ${protobuf_generate_LANGUAGE} protobuf_generate_LANGUAGE) + + if(NOT protobuf_generate_PROTOC_OUT_DIR) + set(protobuf_generate_PROTOC_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + if(protobuf_generate_EXPORT_MACRO AND protobuf_generate_LANGUAGE STREQUAL cpp) + set(_dll_export_decl "dllexport_decl=${protobuf_generate_EXPORT_MACRO}") + endif() + + foreach(_option ${_dll_export_decl} ${protobuf_generate_PLUGIN_OPTIONS}) + # append comma - not using CMake lists and string replacement as users + # might have semicolons in options + if(_plugin_options) + set( _plugin_options "${_plugin_options},") + endif() + set(_plugin_options 
"${_plugin_options}${_option}") + endforeach() + + if(protobuf_generate_PLUGIN) + set(_plugin "--plugin=${protobuf_generate_PLUGIN}") + endif() + + if(NOT protobuf_generate_GENERATE_EXTENSIONS) + if(protobuf_generate_LANGUAGE STREQUAL cpp) + set(protobuf_generate_GENERATE_EXTENSIONS .pb.h .pb.cc) + elseif(protobuf_generate_LANGUAGE STREQUAL python) + set(protobuf_generate_GENERATE_EXTENSIONS _pb2.py) + else() + message(SEND_ERROR "Error: protobuf_generate given unknown Language ${LANGUAGE}, please provide a value for GENERATE_EXTENSIONS") + return() + endif() + endif() + + if(protobuf_generate_TARGET) + get_target_property(_source_list ${protobuf_generate_TARGET} SOURCES) + foreach(_file ${_source_list}) + if(_file MATCHES "proto$") + list(APPEND protobuf_generate_PROTOS ${_file}) + endif() + endforeach() + endif() + + if(NOT protobuf_generate_PROTOS) + message(SEND_ERROR "Error: protobuf_generate could not find any .proto files") + return() + endif() + + if(NOT TARGET protobuf::protoc) + message(SEND_ERROR "protoc executable not found. 
" + "Please define the Protobuf_PROTOC_EXECUTABLE variable or ensure that protoc is in CMake's search path.") + return() + endif() + + if(protobuf_generate_APPEND_PATH) + # Create an include path for each file specified + foreach(_file ${protobuf_generate_PROTOS}) + get_filename_component(_abs_file ${_file} ABSOLUTE) + get_filename_component(_abs_dir ${_abs_file} DIRECTORY) + list(FIND _protobuf_include_path ${_abs_dir} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${_abs_dir}) + endif() + endforeach() + endif() + + foreach(DIR ${protobuf_generate_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + + if(NOT protobuf_generate_APPEND_PATH) + list(APPEND _protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + set(_generated_srcs_all) + foreach(_proto ${protobuf_generate_PROTOS}) + get_filename_component(_abs_file ${_proto} ABSOLUTE) + get_filename_component(_abs_dir ${_abs_file} DIRECTORY) + get_filename_component(_basename ${_proto} NAME_WLE) + file(RELATIVE_PATH _rel_dir ${CMAKE_CURRENT_SOURCE_DIR} ${_abs_dir}) + + set(_possible_rel_dir) + if (NOT protobuf_generate_APPEND_PATH) + foreach(DIR ${_protobuf_include_path}) + if(NOT DIR STREQUAL "-I") + file(RELATIVE_PATH _rel_dir ${DIR} ${_abs_dir}) + if(_rel_dir STREQUAL _abs_dir) + continue() + endif() + string(FIND "${_rel_dir}" "../" _is_in_parent_folder) + if (NOT ${_is_in_parent_folder} EQUAL 0) + break() + endif() + endif() + endforeach() + set(_possible_rel_dir ${_rel_dir}/) + endif() + + set(_generated_srcs) + foreach(_ext ${protobuf_generate_GENERATE_EXTENSIONS}) + list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") + endforeach() + + if(protobuf_generate_DESCRIPTORS AND protobuf_generate_LANGUAGE 
STREQUAL cpp) + set(_descriptor_file "${CMAKE_CURRENT_BINARY_DIR}/${_basename}.desc") + set(_dll_desc_out "--descriptor_set_out=${_descriptor_file}") + list(APPEND _generated_srcs ${_descriptor_file}) + endif() + list(APPEND _generated_srcs_all ${_generated_srcs}) + + set(_comment "Running ${protobuf_generate_LANGUAGE} protocol buffer compiler on ${_proto}") + if(protobuf_generate_PROTOC_OPTIONS) + set(_comment "${_comment}, protoc-options: ${protobuf_generate_PROTOC_OPTIONS}") + endif() + if(_plugin_options) + set(_comment "${_comment}, plugin-options: ${_plugin_options}") + endif() + + add_custom_command( + OUTPUT ${_generated_srcs} + COMMAND protobuf::protoc + ARGS ${protobuf_generate_PROTOC_OPTIONS} --${protobuf_generate_LANGUAGE}_out ${_plugin_options}:${protobuf_generate_PROTOC_OUT_DIR} ${_plugin} ${_dll_desc_out} ${_protobuf_include_path} ${_abs_file} + DEPENDS ${_abs_file} protobuf::protoc ${protobuf_generate_DEPENDENCIES} + COMMENT ${_comment} + VERBATIM ) + endforeach() + + set_source_files_properties(${_generated_srcs_all} PROPERTIES GENERATED TRUE) + if(protobuf_generate_OUT_VAR) + set(${protobuf_generate_OUT_VAR} ${_generated_srcs_all} PARENT_SCOPE) + endif() + if(protobuf_generate_TARGET) + target_sources(${protobuf_generate_TARGET} PRIVATE ${_generated_srcs_all}) + endif() +endfunction() + +function(PROTOBUF_GENERATE_CPP SRCS HDRS) + cmake_parse_arguments(protobuf_generate_cpp "" "EXPORT_MACRO;DESCRIPTORS" "" ${ARGN}) + + set(_proto_files "${protobuf_generate_cpp_UNPARSED_ARGUMENTS}") + if(NOT _proto_files) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(_append_arg APPEND_PATH) + endif() + + if(protobuf_generate_cpp_DESCRIPTORS) + set(_descriptors DESCRIPTORS) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + 
set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) + endif() + + set(_outvar) + protobuf_generate(${_append_arg} ${_descriptors} LANGUAGE cpp EXPORT_MACRO ${protobuf_generate_cpp_EXPORT_MACRO} OUT_VAR _outvar ${_import_arg} PROTOS ${_proto_files}) + + set(${SRCS}) + set(${HDRS}) + if(protobuf_generate_cpp_DESCRIPTORS) + set(${protobuf_generate_cpp_DESCRIPTORS}) + endif() + + foreach(_file ${_outvar}) + if(_file MATCHES "cc$") + list(APPEND ${SRCS} ${_file}) + elseif(_file MATCHES "desc$") + list(APPEND ${protobuf_generate_cpp_DESCRIPTORS} ${_file}) + else() + list(APPEND ${HDRS} ${_file}) + endif() + endforeach() + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) + if(protobuf_generate_cpp_DESCRIPTORS) + set(${protobuf_generate_cpp_DESCRIPTORS} "${${protobuf_generate_cpp_DESCRIPTORS}}" PARENT_SCOPE) + endif() +endfunction() + +function(PROTOBUF_GENERATE_PYTHON SRCS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(_append_arg APPEND_PATH) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) + endif() + + set(_outvar) + protobuf_generate(${_append_arg} LANGUAGE python OUT_VAR _outvar ${_import_arg} PROTOS ${ARGN}) + set(${SRCS} ${_outvar} PARENT_SCOPE) +endfunction() + + +if(Protobuf_DEBUG) + # Output some of their choices + message(STATUS "[ ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE} ] " + "Protobuf_USE_STATIC_LIBS = ${Protobuf_USE_STATIC_LIBS}") +endif() + + +# Backwards compatibility +# Define camel case versions of input variables +foreach(UPPER + PROTOBUF_SRC_ROOT_FOLDER + PROTOBUF_IMPORT_DIRS + PROTOBUF_DEBUG + PROTOBUF_LIBRARY + PROTOBUF_PROTOC_LIBRARY + PROTOBUF_INCLUDE_DIR + PROTOBUF_PROTOC_EXECUTABLE + 
PROTOBUF_LIBRARY_DEBUG
+  PROTOBUF_PROTOC_LIBRARY_DEBUG
+  PROTOBUF_LITE_LIBRARY
+  PROTOBUF_LITE_LIBRARY_DEBUG
+  )
+  if (DEFINED ${UPPER})
+    string(REPLACE "PROTOBUF_" "Protobuf_" Camel ${UPPER})
+    if (NOT DEFINED ${Camel})
+      set(${Camel} ${${UPPER}})
+    endif()
+  endif()
+endforeach()
+
+if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+  set(_PROTOBUF_ARCH_DIR x64/)
+endif()
+
+
+# Support preference of static libs by adjusting CMAKE_FIND_LIBRARY_SUFFIXES
+if( Protobuf_USE_STATIC_LIBS )
+  set( _protobuf_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  if(WIN32)
+    set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  else()
+    set(CMAKE_FIND_LIBRARY_SUFFIXES .a )
+  endif()
+endif()
+
+include(SelectLibraryConfigurations)
+
+# Internal function: search for normal library as well as a debug one
+# if the debug one is specified also include debug/optimized keywords
+# in *_LIBRARIES variable
+function(_protobuf_find_libraries name filename)
+  if(${name}_LIBRARIES)
+    # Use result recorded by a previous call.
+    return()
+  elseif(${name}_LIBRARY)
+    # Honor cache entry used by CMake 3.5 and lower.
+    set(${name}_LIBRARIES "${${name}_LIBRARY}" PARENT_SCOPE)
+  else()
+    find_library(${name}_LIBRARY_RELEASE
+      NAMES ${filename}
+      NAMES_PER_DIR
+      PATHS ${Protobuf_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Release)
+    mark_as_advanced(${name}_LIBRARY_RELEASE)
+
+    find_library(${name}_LIBRARY_DEBUG
+      NAMES ${filename}d ${filename}
+      NAMES_PER_DIR
+      PATHS ${Protobuf_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Debug)
+    mark_as_advanced(${name}_LIBRARY_DEBUG)
+
+    select_library_configurations(${name})
+
+    if(UNIX AND Threads_FOUND AND ${name}_LIBRARY)
+      list(APPEND ${name}_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+    endif()
+
+    set(${name}_LIBRARY "${${name}_LIBRARY}" PARENT_SCOPE)
+    set(${name}_LIBRARIES "${${name}_LIBRARIES}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+#
+# Main.
+# + +# By default have PROTOBUF_GENERATE_CPP macro pass -I to protoc +# for each directory where a proto file is referenced. +if(NOT DEFINED PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE) +endif() + + +# Google's provided vcproj files generate libraries with a "lib" +# prefix on Windows +if(MSVC) + set(Protobuf_ORIG_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") + set(CMAKE_FIND_LIBRARY_PREFIXES "lib" "") + + find_path(Protobuf_SRC_ROOT_FOLDER protobuf.pc.in) +endif() + +if(UNIX) + # Protobuf headers may depend on threading. + find_package(Threads QUIET) +endif() + +# The Protobuf library +_protobuf_find_libraries(Protobuf protobuf) +#DOC "The Google Protocol Buffers RELEASE Library" + +_protobuf_find_libraries(Protobuf_LITE protobuf-lite) + +# The Protobuf Protoc Library +_protobuf_find_libraries(Protobuf_PROTOC protoc) + +# Restore original find library prefixes +if(MSVC) + set(CMAKE_FIND_LIBRARY_PREFIXES "${Protobuf_ORIG_FIND_LIBRARY_PREFIXES}") +endif() + +# Find the include directory +find_path(Protobuf_INCLUDE_DIR + google/protobuf/service.h + PATHS ${Protobuf_SRC_ROOT_FOLDER}/src +) +mark_as_advanced(Protobuf_INCLUDE_DIR) + +# Find the protoc Executable +find_program(Protobuf_PROTOC_EXECUTABLE + NAMES protoc + DOC "The Google Protocol Buffers Compiler" + PATHS + ${Protobuf_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Release + ${Protobuf_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Debug +) +mark_as_advanced(Protobuf_PROTOC_EXECUTABLE) + +if(Protobuf_DEBUG) + message(STATUS "[ ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE} ] " + "requested version of Google Protobuf is ${Protobuf_FIND_VERSION}") +endif() + +if(Protobuf_INCLUDE_DIR) + set(_PROTOBUF_COMMON_HEADER ${Protobuf_INCLUDE_DIR}/google/protobuf/stubs/common.h) + + if(Protobuf_DEBUG) + message(STATUS "[ ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE} ] " + "location of common.h: ${_PROTOBUF_COMMON_HEADER}") + endif() + + 
set(Protobuf_VERSION "") + set(Protobuf_LIB_VERSION "") + file(STRINGS ${_PROTOBUF_COMMON_HEADER} _PROTOBUF_COMMON_H_CONTENTS REGEX "#define[ \t]+GOOGLE_PROTOBUF_VERSION[ \t]+") + if(_PROTOBUF_COMMON_H_CONTENTS MATCHES "#define[ \t]+GOOGLE_PROTOBUF_VERSION[ \t]+([0-9]+)") + set(Protobuf_LIB_VERSION "${CMAKE_MATCH_1}") + endif() + unset(_PROTOBUF_COMMON_H_CONTENTS) + + math(EXPR _PROTOBUF_MAJOR_VERSION "${Protobuf_LIB_VERSION} / 1000000") + math(EXPR _PROTOBUF_MINOR_VERSION "${Protobuf_LIB_VERSION} / 1000 % 1000") + math(EXPR _PROTOBUF_SUBMINOR_VERSION "${Protobuf_LIB_VERSION} % 1000") + set(Protobuf_VERSION "${_PROTOBUF_MAJOR_VERSION}.${_PROTOBUF_MINOR_VERSION}.${_PROTOBUF_SUBMINOR_VERSION}") + + if(Protobuf_DEBUG) + message(STATUS "[ ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE} ] " + "${_PROTOBUF_COMMON_HEADER} reveals protobuf ${Protobuf_VERSION}") + endif() + + if(Protobuf_PROTOC_EXECUTABLE) + # Check Protobuf compiler version to be aligned with libraries version + execute_process(COMMAND ${Protobuf_PROTOC_EXECUTABLE} --version + OUTPUT_VARIABLE _PROTOBUF_PROTOC_EXECUTABLE_VERSION) + + if("${_PROTOBUF_PROTOC_EXECUTABLE_VERSION}" MATCHES "libprotoc ([0-9.]+)") + set(_PROTOBUF_PROTOC_EXECUTABLE_VERSION "${CMAKE_MATCH_1}") + endif() + + if(Protobuf_DEBUG) + message(STATUS "[ ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE} ] " + "${Protobuf_PROTOC_EXECUTABLE} reveals version ${_PROTOBUF_PROTOC_EXECUTABLE_VERSION}") + endif() + + # protoc version 22 and up don't print the major version any more + if(NOT "${_PROTOBUF_PROTOC_EXECUTABLE_VERSION}" VERSION_EQUAL "${Protobuf_VERSION}" AND + NOT "${_PROTOBUF_PROTOC_EXECUTABLE_VERSION}" VERSION_EQUAL "${_PROTOBUF_MINOR_VERSION}.${_PROTOBUF_SUBMINOR_VERSION}") + message(WARNING "Protobuf compiler version ${_PROTOBUF_PROTOC_EXECUTABLE_VERSION}" + " doesn't match library version ${Protobuf_VERSION}") + endif() + endif() + + if(Protobuf_LIBRARY) + if(NOT TARGET protobuf::libprotobuf) + 
add_library(protobuf::libprotobuf UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotobuf PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${Protobuf_INCLUDE_DIR}") + if(EXISTS "${Protobuf_LIBRARY}") + set_target_properties(protobuf::libprotobuf PROPERTIES + IMPORTED_LOCATION "${Protobuf_LIBRARY}") + endif() + if(EXISTS "${Protobuf_LIBRARY_RELEASE}") + set_property(TARGET protobuf::libprotobuf APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(protobuf::libprotobuf PROPERTIES + IMPORTED_LOCATION_RELEASE "${Protobuf_LIBRARY_RELEASE}") + endif() + if(EXISTS "${Protobuf_LIBRARY_DEBUG}") + set_property(TARGET protobuf::libprotobuf APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(protobuf::libprotobuf PROPERTIES + IMPORTED_LOCATION_DEBUG "${Protobuf_LIBRARY_DEBUG}") + endif() + if (Protobuf_VERSION VERSION_GREATER_EQUAL "3.6") + set_property(TARGET protobuf::libprotobuf APPEND PROPERTY + INTERFACE_COMPILE_FEATURES cxx_std_11 + ) + endif() + if (WIN32 AND NOT Protobuf_USE_STATIC_LIBS) + set_property(TARGET protobuf::libprotobuf APPEND PROPERTY + INTERFACE_COMPILE_DEFINITIONS "PROTOBUF_USE_DLLS" + ) + endif() + if(UNIX AND TARGET Threads::Threads) + set_property(TARGET protobuf::libprotobuf APPEND PROPERTY + INTERFACE_LINK_LIBRARIES Threads::Threads) + endif() + endif() + endif() + + if(Protobuf_LITE_LIBRARY) + if(NOT TARGET protobuf::libprotobuf-lite) + add_library(protobuf::libprotobuf-lite UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotobuf-lite PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${Protobuf_INCLUDE_DIR}") + if(EXISTS "${Protobuf_LITE_LIBRARY}") + set_target_properties(protobuf::libprotobuf-lite PROPERTIES + IMPORTED_LOCATION "${Protobuf_LITE_LIBRARY}") + endif() + if(EXISTS "${Protobuf_LITE_LIBRARY_RELEASE}") + set_property(TARGET protobuf::libprotobuf-lite APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(protobuf::libprotobuf-lite PROPERTIES + IMPORTED_LOCATION_RELEASE 
"${Protobuf_LITE_LIBRARY_RELEASE}") + endif() + if(EXISTS "${Protobuf_LITE_LIBRARY_DEBUG}") + set_property(TARGET protobuf::libprotobuf-lite APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(protobuf::libprotobuf-lite PROPERTIES + IMPORTED_LOCATION_DEBUG "${Protobuf_LITE_LIBRARY_DEBUG}") + endif() + if (WIN32 AND NOT Protobuf_USE_STATIC_LIBS) + set_property(TARGET protobuf::libprotobuf-lite APPEND PROPERTY + INTERFACE_COMPILE_DEFINITIONS "PROTOBUF_USE_DLLS" + ) + endif() + if(UNIX AND TARGET Threads::Threads) + set_property(TARGET protobuf::libprotobuf-lite APPEND PROPERTY + INTERFACE_LINK_LIBRARIES Threads::Threads) + endif() + endif() + endif() + + if(Protobuf_PROTOC_LIBRARY) + if(NOT TARGET protobuf::libprotoc) + add_library(protobuf::libprotoc UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotoc PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${Protobuf_INCLUDE_DIR}") + if(EXISTS "${Protobuf_PROTOC_LIBRARY}") + set_target_properties(protobuf::libprotoc PROPERTIES + IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}") + endif() + if(EXISTS "${Protobuf_PROTOC_LIBRARY_RELEASE}") + set_property(TARGET protobuf::libprotoc APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(protobuf::libprotoc PROPERTIES + IMPORTED_LOCATION_RELEASE "${Protobuf_PROTOC_LIBRARY_RELEASE}") + endif() + if(EXISTS "${Protobuf_PROTOC_LIBRARY_DEBUG}") + set_property(TARGET protobuf::libprotoc APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(protobuf::libprotoc PROPERTIES + IMPORTED_LOCATION_DEBUG "${Protobuf_PROTOC_LIBRARY_DEBUG}") + endif() + if (Protobuf_VERSION VERSION_GREATER_EQUAL "3.6") + set_property(TARGET protobuf::libprotoc APPEND PROPERTY + INTERFACE_COMPILE_FEATURES cxx_std_11 + ) + endif() + if (WIN32 AND NOT Protobuf_USE_STATIC_LIBS) + set_property(TARGET protobuf::libprotoc APPEND PROPERTY + INTERFACE_COMPILE_DEFINITIONS "PROTOBUF_USE_DLLS" + ) + endif() + if(UNIX AND TARGET Threads::Threads) + 
set_property(TARGET protobuf::libprotoc APPEND PROPERTY + INTERFACE_LINK_LIBRARIES Threads::Threads) + endif() + endif() + endif() + + if(Protobuf_PROTOC_EXECUTABLE) + if(NOT TARGET protobuf::protoc) + add_executable(protobuf::protoc IMPORTED) + if(EXISTS "${Protobuf_PROTOC_EXECUTABLE}") + set_target_properties(protobuf::protoc PROPERTIES + IMPORTED_LOCATION "${Protobuf_PROTOC_EXECUTABLE}") + endif() + endif() + endif() +endif() + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(Protobuf + REQUIRED_VARS Protobuf_LIBRARIES Protobuf_INCLUDE_DIR + VERSION_VAR Protobuf_VERSION +) + +if(Protobuf_FOUND) + set(Protobuf_INCLUDE_DIRS ${Protobuf_INCLUDE_DIR}) +endif() + +# Restore the original find library ordering +if( Protobuf_USE_STATIC_LIBS ) + set(CMAKE_FIND_LIBRARY_SUFFIXES ${_protobuf_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) +endif() + +# Backwards compatibility +# Define upper case versions of output variables +foreach(Camel + Protobuf_SRC_ROOT_FOLDER + Protobuf_IMPORT_DIRS + Protobuf_DEBUG + Protobuf_INCLUDE_DIRS + Protobuf_LIBRARIES + Protobuf_PROTOC_LIBRARIES + Protobuf_LITE_LIBRARIES + Protobuf_LIBRARY + Protobuf_PROTOC_LIBRARY + Protobuf_INCLUDE_DIR + Protobuf_PROTOC_EXECUTABLE + Protobuf_LIBRARY_DEBUG + Protobuf_PROTOC_LIBRARY_DEBUG + Protobuf_LITE_LIBRARY + Protobuf_LITE_LIBRARY_DEBUG + ) + string(TOUPPER ${Camel} UPPER) + set(${UPPER} ${${Camel}}) +endforeach() From 4ef806e3169971b7e3d64c42eec1ecd0b7cd7e0f Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 14 Jan 2026 13:50:10 +0100 Subject: [PATCH 05/20] Containerfile: install all required Protobuf libs for `OTel` --- Containerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Containerfile b/Containerfile index 3312c0212cd..ad3e2302b8a 100644 --- a/Containerfile +++ b/Containerfile @@ -38,6 +38,8 @@ RUN apt-get update && \ libpq-dev \ libssl-dev \ libsystemd-dev \ + libprotobuf-dev \ + protobuf-compiler \ make && \ rm -rf /var/lib/apt/lists/* @@ -165,6 +167,7 @@ RUN apt-get 
update && \ libmariadb3 \ libmoosex-role-timer-perl \ libpq5 \ + libprotobuf-lite32t64 \ libssl3 \ libsystemd0 \ mailutils \ From c34e03078ad446933086a621b7fa301f7c6dc7d2 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 13 Jan 2026 14:50:41 +0100 Subject: [PATCH 06/20] GHA: install required protobuf devel package --- .github/workflows/linux.bash | 62 +++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/.github/workflows/linux.bash b/.github/workflows/linux.bash index dffe4c45664..516c1ed37d9 100755 --- a/.github/workflows/linux.bash +++ b/.github/workflows/linux.bash @@ -5,6 +5,8 @@ export PATH="/usr/lib/ccache/bin:/usr/lib/ccache:/usr/lib64/ccache:$PATH" export CCACHE_DIR=/icinga2/ccache export CTEST_OUTPUT_ON_FAILURE=1 CMAKE_OPTS=() +SCL_ENABLE_GCC=() +PROTOBUF_INCLUDE_DIR="" # -Wstringop-overflow is notorious for false positives and has been a problem for years. # See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88443 # -Wtemplate-id-cdtor leaks from using the generated headers. We should reenable this once @@ -17,7 +19,7 @@ case "$DISTRO" in # - LibreSSL instead of OpenSSL 3 and # - no MariaDB or libpq as they depend on OpenSSL. 
# https://gitlab.alpinelinux.org/alpine/aports/-/blob/master/community/icinga2/APKBUILD - apk add bison boost-dev ccache cmake flex g++ libedit-dev libressl-dev ninja-build tzdata + apk add bison boost-dev ccache cmake flex g++ libedit-dev libressl-dev ninja-build tzdata protobuf-dev ln -vs /usr/lib/ninja-build/bin/ninja /usr/local/bin/ninja ;; @@ -44,24 +46,24 @@ case "$DISTRO" in amazonlinux:20*) dnf install -y amazon-rpm-config bison cmake flex gcc-c++ ninja-build \ - {boost,libedit,mariadb-connector-c,ncurses,openssl,postgresql,systemd}-devel + {boost,libedit,mariadb-connector-c,ncurses,openssl,postgresql,systemd,protobuf-lite}-devel ;; debian:*|ubuntu:*) apt-get update DEBIAN_FRONTEND=noninteractive apt-get install --no-install-{recommends,suggests} -y \ - bison ccache cmake dpkg-dev flex g++ ninja-build tzdata \ - lib{boost-all,edit,mariadb,ncurses,pq,ssl,systemd}-dev + bison ccache cmake dpkg-dev flex g++ ninja-build tzdata protobuf-compiler \ + lib{boost-all,edit,mariadb,ncurses,pq,ssl,systemd,protobuf}-dev ;; fedora:*) dnf install -y bison ccache cmake flex gcc-c++ ninja-build redhat-rpm-config \ - {boost,libedit,mariadb,ncurses,openssl,postgresql,systemd}-devel + {boost,libedit,mariadb,ncurses,openssl,postgresql,systemd,protobuf-lite}-devel ;; *suse*) zypper in -y bison ccache cmake flex gcc-c++ ninja rpm-config-SUSE \ - {lib{edit,mariadb,openssl},ncurses,postgresql,systemd}-devel \ + {lib{edit,mariadb,openssl},ncurses,postgresql,systemd,protobuf}-devel \ libboost_{context,coroutine,filesystem,iostreams,program_options,regex,system,test,thread}-devel ;; @@ -71,6 +73,10 @@ case "$DISTRO" in case "$DISTRO" in *:8) dnf config-manager --enable powertools + # Our Protobuf package on RHEL 8 is built with GCC 13, and since the ABI is not compatible with GCC 8, + # we need to enable the SCL repository and install the GCC 13 packages to be able to link against it. 
+ SCL_ENABLE_GCC=(scl enable gcc-toolset-13 --) + dnf install -y gcc-toolset-13-gcc-c++ gcc-toolset-13-annobin-plugin-gcc ;; *) dnf config-manager --enable crb @@ -79,6 +85,29 @@ case "$DISTRO" in dnf install -y bison ccache cmake gcc-c++ flex ninja-build redhat-rpm-config \ {boost,bzip2,libedit,mariadb,ncurses,openssl,postgresql,systemd,xz,libzstd}-devel + + # Rocky Linux 8 and 9 don't have a recent enough Protobuf compiler for OTel, so we need to add + # our repository to install the pre-built Protobuf devel package. + case "$DISTRO" in + *:[8-9]) + rpm --import https://packages.icinga.com/icinga.key + cat > /etc/yum.repos.d/icinga-build-deps.repo <<'EOF' +[icinga-build-deps] +name=Icinga Build Dependencies +baseurl=https://packages.icinga.com/build-dependencies/rhel/$releasever/release +enabled=1 +gpgcheck=1 +gpgkey=https://packages.icinga.com/icinga.key +EOF + dnf install -y icinga-protobuf + # And of course, make sure to add our custom Protobuf includes to the compiler include path. + PROTOBUF_INCLUDE_DIR="-isystem $(rpm -E '%{_includedir}')/icinga-protobuf" + # Tell CMake where to find our own Protobuf CMake config files. + CMAKE_OPTS+=(-DCMAKE_PREFIX_PATH="$(rpm -E '%{_libdir}')/icinga-protobuf/cmake") + ;; + *) + dnf install -y protobuf-lite-devel + esac ;; esac @@ -96,9 +125,20 @@ case "$DISTRO" in source <(dpkg-buildflags --export=sh) export CFLAGS="${CFLAGS} ${WARN_FLAGS}" export CXXFLAGS="${CXXFLAGS} ${WARN_FLAGS}" + + # The default Protobuf compiler is too old for OTel, so we need to turn it off on Debian 11 and Ubuntu 22.04. + case "$DISTRO" in + debian:11|ubuntu:22.04) + CMAKE_OPTS+=(-DICINGA2_WITH_OPENTELEMETRY=OFF) + ;; + esac ;; *) - CMAKE_OPTS+=(-DCMAKE_{C,CXX}_FLAGS="$(rpm -E '%{optflags} %{?march_flag}') ${WARN_FLAGS}") + # Turn off with OTel on Amazon Linux 2 as the default Protobuf compiler is way too old. 
+ if [ "$DISTRO" = "amazonlinux:2" ]; then + CMAKE_OPTS+=(-DICINGA2_WITH_OPENTELEMETRY=OFF) + fi + CMAKE_OPTS+=(-DCMAKE_{C,CXX}_FLAGS="$(rpm -E '%{optflags} %{?march_flag}') ${WARN_FLAGS} ${PROTOBUF_INCLUDE_DIR}") export LDFLAGS="$(rpm -E '%{?build_ldflags}')" ;; esac @@ -106,7 +146,7 @@ esac mkdir /icinga2/build cd /icinga2/build -cmake \ +"${SCL_ENABLE_GCC[@]}" cmake \ -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DICINGA2_UNITY_BUILD=ON \ @@ -115,8 +155,8 @@ cmake \ -DICINGA2_GROUP=$(id -gn) \ "${CMAKE_OPTS[@]}" .. -ninja -v +"${SCL_ENABLE_GCC[@]}" ninja -v -ninja test -ninja install +"${SCL_ENABLE_GCC[@]}" ninja test +"${SCL_ENABLE_GCC[@]}" ninja install icinga2 daemon -C From 60fe45cd6e06e5e32cdf56df9d254e32c0a0f565 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 15 Jan 2026 11:25:04 +0100 Subject: [PATCH 07/20] Add `OTLPMetricsWriter` --- .../features-available/otlpmetrics.conf | 41 ++ lib/perfdata/CMakeLists.txt | 28 ++ lib/perfdata/otlpmetricswriter.cpp | 385 ++++++++++++++++++ lib/perfdata/otlpmetricswriter.hpp | 61 +++ lib/perfdata/otlpmetricswriter.ti | 68 ++++ tools/mkclass/classcompiler.cpp | 2 +- 6 files changed, 584 insertions(+), 1 deletion(-) create mode 100644 etc/icinga2/features-available/otlpmetrics.conf create mode 100644 lib/perfdata/otlpmetricswriter.cpp create mode 100644 lib/perfdata/otlpmetricswriter.hpp create mode 100644 lib/perfdata/otlpmetricswriter.ti diff --git a/etc/icinga2/features-available/otlpmetrics.conf b/etc/icinga2/features-available/otlpmetrics.conf new file mode 100644 index 00000000000..39a2cacb027 --- /dev/null +++ b/etc/icinga2/features-available/otlpmetrics.conf @@ -0,0 +1,41 @@ +/** + * The OpenTelemetry Metrics Writer feature allows Icinga 2 to export metrics from performance + * data to an OpenTelemetry Collector or compatible backend. 
+ * + * For more information, see the official documentation: + * https://icinga.com/docs/icinga-2/latest/doc/14-features/#otlpmetrics-writer + */ +object OTLPMetricsWriter "otlp-metrics" { + // host = "127.0.0.1" + // port = 4318 + // metrics_endpoint = "/v1/metrics" + # Optionally, you can set a namespace to be used as OTel service.namespace attribute for all exported metrics. + // service_namespace = "icinga" + + # By default, basic AUTH is disabled. Uncomment and set the following lines to enable it. + // basic_auth = { + // username = "otel_user" + // password = "otel_password" + // } + + # These are the default settings used by the OTel writer. Adjust them as needed. + # Please refer to the documentation for more details on each option. + // enable_ha = false + // flush_interval = 15s + // flush_threshold = 32*1024*1024 + # When stopping Icinga 2, this timeout defines how long to wait for any pending OTel + # metrics to be sent before disconnecting and discarding them. + // disconnect_timeout = 10s + + # Allow the OTLP writer to send the check thresholds as OTel metrics to the configured endpoint. + # By default, this is disabled but you can enable it to have the thresholds available in the `state_check.threshold` OTel metric. + // enable_send_thresholds = false + + # You can enable TLS encryption by uncommenting and configuring the following options. + # By default, the OTel writer uses unencrypted connections (plain HTTP requests). 
+ // enable_tls = false + // tls_insecure_noverify = false + // tls_ca_file = "/path/to/otel/ca.crt" + // tls_cert_file = "/path/to/otel/client.crt" + // tls_key_file = "/path/to/otel/client.key" +} diff --git a/lib/perfdata/CMakeLists.txt b/lib/perfdata/CMakeLists.txt index c53e9e9b69e..c867911be4a 100644 --- a/lib/perfdata/CMakeLists.txt +++ b/lib/perfdata/CMakeLists.txt @@ -22,6 +22,13 @@ set(perfdata_SOURCES perfdatawriterconnection.cpp perfdatawriterconnection.hpp ) +if(ICINGA2_WITH_OPENTELEMETRY) + mkclass_target(otlpmetricswriter.ti otlpmetricswriter-ti.cpp otlpmetricswriter-ti.hpp) + list(APPEND perfdata_SOURCES + otlpmetricswriter.cpp otlpmetricswriter.hpp otlpmetricswriter-ti.hpp + ) +endif() + if(ICINGA2_UNITY_BUILD) mkunity_target(perfdata perfdata perfdata_SOURCES) endif() @@ -29,6 +36,20 @@ endif() add_library(perfdata OBJECT ${perfdata_SOURCES}) add_dependencies(perfdata base config icinga) +if(ICINGA2_WITH_OPENTELEMETRY) + add_dependencies(perfdata otel) + # All the Protobuf generated files within the otel target use relative include paths that won't be + # resolved unless we also add the include directories of the otel target. Meaning, we include some + # of the header files (not the generated ones) from otel within the otlpwriter and these headers + # again include the generated headers and the generated headers in return include other generated + # headers using relative paths like this: + # #include "opentelemetry/proto/metrics/v1/metrics.pb.h" + # + # This path can only be resolved if the parent directory of "opentelemetry" is added to the compiler's + # include search paths, which is done by the CMakefile of the otel target and we only need to propagate + # its include directories to the perfdata target. 
+ target_include_directories(perfdata PUBLIC $) +endif() set_target_properties ( perfdata PROPERTIES @@ -65,6 +86,13 @@ install_if_not_exists( ${ICINGA2_CONFIGDIR}/features-available ) +if(ICINGA2_WITH_OPENTELEMETRY) + install_if_not_exists( + ${PROJECT_SOURCE_DIR}/etc/icinga2/features-available/otlpmetrics.conf + ${ICINGA2_CONFIGDIR}/features-available + ) +endif() + install_if_not_exists( ${PROJECT_SOURCE_DIR}/etc/icinga2/features-available/perfdata.conf ${ICINGA2_CONFIGDIR}/features-available diff --git a/lib/perfdata/otlpmetricswriter.cpp b/lib/perfdata/otlpmetricswriter.cpp new file mode 100644 index 00000000000..2ad480c7127 --- /dev/null +++ b/lib/perfdata/otlpmetricswriter.cpp @@ -0,0 +1,385 @@ +// SPDX-FileCopyrightText: 2026 Icinga GmbH +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "perfdata/otlpmetricswriter.hpp" +#include "perfdata/otlpmetricswriter-ti.cpp" +#include "base/base64.hpp" +#include "base/defer.hpp" +#include "base/json.hpp" +#include "base/object-packer.hpp" +#include "base/perfdatavalue.hpp" +#include "base/statsfunction.hpp" +#include "icinga/checkable.hpp" +#include "icinga/checkcommand.hpp" +#include "icinga/service.hpp" +#include + +using namespace icinga; + +REGISTER_TYPE(OTLPMetricsWriter); + +REGISTER_STATSFUNCTION(OTLPMetricsWriter, &OTLPMetricsWriter::StatsFunc); + +// Represent our currently supported metric streams. +// +// Note: These and all other attribute keys used within this compilation unit follow +// the OTel general naming guidelines[^1] and conventions[^2]. 
+// +// [^1]: https://opentelemetry.io/docs/specs/semconv/general/metrics/#general-guidelines +// [^2]: https://opentelemetry.io/docs/specs/semconv/general/naming +static constexpr std::string_view l_PerfdataMetric = "state_check.perfdata"; +static constexpr std::string_view l_ThresholdMetric = "state_check.threshold"; + +void OTLPMetricsWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) +{ + DictionaryData statusData; + for (const Ptr& otlpWriter : ConfigType::GetObjectsByType()) { + std::size_t workQueueSize = otlpWriter->m_WorkQueue.GetLength(); + double workQueueItemRate = otlpWriter->m_WorkQueue.GetTaskCount(60) / 60.0; + std::size_t dataPointsCount = otlpWriter->m_DataPointsCount.load(std::memory_order_relaxed); + uint64_t messageSize = otlpWriter->m_RecordedBytes.load(std::memory_order_relaxed); + + const auto name = otlpWriter->GetName(); + statusData.emplace_back(name, new Dictionary{ + {"work_queue_items", workQueueSize}, + {"work_queue_item_rate", workQueueItemRate}, + {"data_buffer_items", dataPointsCount}, + {"data_buffer_bytes", messageSize}, + }); + + perfdata->Add(new PerfdataValue("otlpmetricswriter_" + name + "_work_queue_items", workQueueSize, true)); + perfdata->Add(new PerfdataValue("otlpmetricswriter_" + name + "_work_queue_item_rate", workQueueItemRate)); + perfdata->Add(new PerfdataValue("otlpmetricswriter_" + name + "_data_buffer_items", dataPointsCount, true)); + perfdata->Add(new PerfdataValue("otlpmetricswriter_" + name + "_data_buffer_bytes", messageSize, false, "bytes")); + } + status->Set("otlpmetricswriter", new Dictionary{std::move(statusData)}); +} + +void OTLPMetricsWriter::OnConfigLoaded() +{ + ObjectImpl::OnConfigLoaded(); + + m_WorkQueue.SetName("OTLPMetricsWriter, " + GetName()); + + if (!GetEnableHa()) { + Log(LogDebug, "OTLPMetricsWriter") + << "HA functionality disabled. 
Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } +} + +void OTLPMetricsWriter::Start(bool runtimeCreated) +{ + ObjectImpl::Start(runtimeCreated); + + OTelConnInfo connInfo; + connInfo.EnableTls = GetEnableTls(); + connInfo.VerifyPeerCertificate = !GetTlsInsecureNoverify(); + connInfo.Host = GetHost(); + connInfo.Port = GetPort(); + connInfo.TlsCaCrt = GetTlsCaFile(); + connInfo.TlsCrt = GetTlsCertFile(); + connInfo.TlsKey = GetTlsKeyFile(); + connInfo.MetricsEndpoint = GetMetricsEndpoint(); + if (auto auth = GetBasicAuth(); auth) { + connInfo.BasicAuth = Base64::Encode(auth->Get("username") + ":" + auth->Get("password")); + } + + m_Exporter.reset(new OTel{connInfo}); +} + +void OTLPMetricsWriter::Resume() +{ + ObjectImpl::Resume(); + + Log(LogInformation, "OTLPMetricsWriter") + << "'" << GetName() << "' resumed."; + + m_WorkQueue.SetExceptionCallback([](boost::exception_ptr exp) { + Log(LogCritical, "OTLPMetricsWriter") + << "Exception while producing OTel metric: " << DiagnosticInformation(exp); + }); + + m_FlushTimer = Timer::Create(); + m_FlushTimer->SetInterval(GetFlushInterval()); + m_FlushTimer->OnTimerExpired.connect([this](const Timer* const&) { + if (m_TimerFlushInProgress.exchange(true, std::memory_order_relaxed)) { + // Previous timer-initiated flush still in progress, skip this one. 
+ return; + } + m_WorkQueue.Enqueue([this] { + Defer resetTimerFlag{[this] { m_TimerFlushInProgress.store(false, std::memory_order_relaxed); }}; + Flush(true); + }); + }); + m_FlushTimer->Start(); + m_Exporter->Start(); + + m_CheckResultsSlot = Checkable::OnNewCheckResult.connect([this]( + const Checkable::Ptr& checkable, + const CheckResult::Ptr& cr, + const MessageOrigin::Ptr& + ) { + CheckResultHandler(checkable, cr); + }); + m_ActiveChangedSlot = OnActiveChanged.connect([this](const ConfigObject::Ptr& obj, const Value&) { + auto checkable = dynamic_pointer_cast(obj); + if (!checkable || checkable->IsActive()) { + return; + } + m_WorkQueue.Enqueue([this, checkable] { m_Metrics.erase(checkable.get()); }); + }); +} + +void OTLPMetricsWriter::Pause() +{ + m_CheckResultsSlot.disconnect(); + m_ActiveChangedSlot.disconnect(); + + m_FlushTimer->Stop(true); + + std::promise promise; + auto future = promise.get_future(); + m_WorkQueue.Enqueue([this, &promise] { + Flush(); + promise.set_value(); + }, PriorityLow); + + if (auto status = future.wait_for(std::chrono::seconds(GetDisconnectTimeout())); status != std::future_status::ready) { + Log(LogWarning, "OTLPMetricsWriter") + << "Disconnect timeout reached while flushing OTel metrics, discarding '" << m_DataPointsCount + << "' data points ('" << m_RecordedBytes << "' bytes)."; + } + m_Exporter->Stop(); + m_WorkQueue.Join(); + + m_Metrics.clear(); + + Log(LogInformation, "OTLPMetricsWriter") + << "'" << GetName() << "' paused."; + + ObjectImpl::Pause(); +} + +void OTLPMetricsWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) +{ + if (!IcingaApplication::GetInstance()->GetEnablePerfdata() || !checkable->GetEnablePerfdata() || !cr->GetPerformanceData()) { + return; + } + + m_WorkQueue.Enqueue([this, checkable, cr] { + if (m_Exporter->Stopped()) { + return; + } + CONTEXT("Processing check result for '" << checkable->GetName() << "'."); + + auto startTime = cr->GetScheduleStart(); + auto 
endTime = cr->GetExecutionEnd(); + + Array::Ptr perfdata = cr->GetPerformanceData(); + ObjectLock olock(perfdata); + for (const Value& val : perfdata) { + PerfdataValue::Ptr pdv; + if (val.IsObjectType()) { + pdv = val; + } else { + try { + pdv = PerfdataValue::Parse(val); + } catch (const std::exception&) { + Log(LogWarning, "OTLPMetricsWriter") + << "Ignoring invalid perfdata for checkable '" << checkable->GetName() << "' and command '" + << checkable->GetCheckCommand()->GetName() << "' with value: " << val; + continue; + } + } + + OTel::AttrsMap attrs{{"perfdata_label", pdv->GetLabel()}}; + if (auto unit = pdv->GetUnit(); !unit.IsEmpty()) { + attrs.emplace("unit", std::move(unit)); + } + AddBytesAndFlushIfNeeded(Record(checkable, l_PerfdataMetric, pdv->GetValue(), startTime, endTime, std::move(attrs))); + + if (GetEnableSendThresholds()) { + std::array, 4> thresholds{{ + {"critical", pdv->GetCrit()}, + {"warning", pdv->GetWarn()}, + {"min", pdv->GetMin()}, + {"max", pdv->GetMax()}, + }}; + for (auto& [label, threshold] : thresholds) { + if (!threshold.IsEmpty()) { + attrs = { + {"perfdata_label", pdv->GetLabel()}, + {"threshold_type", std::move(label)}, + }; + AddBytesAndFlushIfNeeded( + Record( + checkable, + l_ThresholdMetric, + Convert::ToDouble(threshold), + startTime, + endTime, + std::move(attrs) + ) + ); + } + } + } + } + }); +} + +void OTLPMetricsWriter::Flush(bool fromTimer) +{ + // If previous export is still in progress and this flush is requested from timer, skip it. + // For manual flushes (e.g., due to reaching flush threshold), we want to block until + // the previous export is done before returning to the caller (blocking is handled in OTel::Export()). + if (fromTimer && m_Exporter->Exporting()) { + return; + } + + Log(LogDebug, "OTLPMetricsWriter") + << "Flushing OTel metrics to OpenTelemetry backend" << (fromTimer ? " (timer expired)." 
: "."); + + auto request = std::make_unique(); + for (auto& [checkable, resourceMetrics] : m_Metrics) { + if (resourceMetrics) { + request->mutable_resource_metrics()->AddAllocated(resourceMetrics.release()); + } + } + if (request->resource_metrics_size() == 0) { + Log(LogDebug, "OTLPMetricsWriter") + << "Not flushing OTel metrics: No data points recorded."; + return; + } + m_Exporter->Export(std::move(request)); + m_RecordedBytes.store(0, std::memory_order_relaxed); + m_DataPointsCount.store(0, std::memory_order_relaxed); +} + +void OTLPMetricsWriter::AddBytesAndFlushIfNeeded(std::size_t newBytes) +{ + auto existingBytes = m_RecordedBytes.fetch_add(newBytes, std::memory_order_relaxed); + if (auto bytes{existingBytes + newBytes}; bytes >= static_cast(GetFlushThreshold())) { + Log(LogDebug, "OTLPMetricsWriter") + << "Flush threshold reached, flushing '" << bytes << "' bytes of OTel metrics."; + Flush(); + } +} + +/** + * Record a data point for the specified OTel metric associated with the given configuration object. + * + * This method records a data point of type T for the specified metric name associated with the + * provided configuration object. If the metric does not exist for the object, it is created. + * + * @tparam T The type of the data point to record (e.g., int64_t, double). + * + * @param checkable The configuration object to associate the metric with. + * @param metric The OTel metric enum value indicating which metric stream to record the data point for. + * @param value The data point value to record. + * @param startTime The start time of the data point in seconds. + * @param endTime The end time of the data point in seconds. + * @param attrs The attributes associated with the data point. + * + * @return The number of bytes recorded for this data point, which contributes to the flush threshold. 
+ */ +template +std::size_t OTLPMetricsWriter::Record( + const Checkable::Ptr& checkable, + std::string_view metric, + T value, + double startTime, + double endTime, + OTel::AttrsMap attrs +) +{ + std::size_t bytes = 0; + auto& resourceMetrics = m_Metrics[checkable.get()]; + if (!resourceMetrics) { + using namespace std::string_view_literals; + + resourceMetrics = std::make_unique(); + OTel::PopulateResourceAttrs(resourceMetrics); + + auto* resource = resourceMetrics->mutable_resource(); + auto* attr = resource->add_attributes(); + OTel::SetAttribute(*attr, "service.namespace"sv, GetServiceNamespace()); + + auto [host, service] = GetHostService(checkable); + attr = resource->add_attributes(); + OTel::SetAttribute(*attr, "icinga2.host.name"sv, host->GetName()); + + // Add entity reference (https://opentelemetry.io/docs/specs/otel/entities/data-model/). + auto* entity = resource->add_entity_refs(); + entity->mutable_id_keys()->Add("icinga2.host.name"); + if (service) { + entity->set_type("service"); + entity->mutable_id_keys()->Add("icinga2.service.name"); + + attr = resource->add_attributes(); + OTel::SetAttribute(*attr, "icinga2.service.name"sv, service->GetShortName()); + } else { + entity->set_type("host"); + } + attr = resource->add_attributes(); + OTel::SetAttribute(*attr, "icinga2.command.name"sv, checkable->GetCheckCommand()->GetName()); + bytes = resourceMetrics->ByteSizeLong(); + } + + auto* sm = resourceMetrics->mutable_scope_metrics(0); + auto* metrics = sm->mutable_metrics(); + auto it = std::find_if(metrics->begin(), metrics->end(), [metric](const auto& m) { return m.name() == metric; }); + OTel::Gauge* gaugePtr = nullptr; + if (it == metrics->end()) { + OTel::ValidateName(metric); + auto* metricPtr = sm->add_metrics(); + metricPtr->set_name(std::string(metric)); + bytes += metricPtr->ByteSizeLong(); // Account for metric name size in bytes. 
+ gaugePtr = metricPtr->mutable_gauge(); + } else { + gaugePtr = it->mutable_gauge(); + } + bytes += OTel::Record(*gaugePtr, value, startTime, endTime, std::move(attrs)); + m_DataPointsCount.fetch_add(1, std::memory_order_relaxed); + return bytes; +} + +void OTLPMetricsWriter::ValidatePort(const Lazy& lvalue, const ValidationUtils& utils) +{ + ObjectImpl::ValidatePort(lvalue, utils); + if (auto p = lvalue(); p < 1 || p > 65535) { + BOOST_THROW_EXCEPTION(ValidationError(this, {"port"}, "Port must be in the range 1-65535.")); + } +} + +void OTLPMetricsWriter::ValidateFlushInterval(const Lazy& lvalue, const ValidationUtils& utils) +{ + ObjectImpl::ValidateFlushInterval(lvalue, utils); + if (lvalue() < 1) { + BOOST_THROW_EXCEPTION(ValidationError(this, {"flush_interval"}, "Flush interval must be at least 1 second.")); + } +} + +void OTLPMetricsWriter::ValidateFlushThreshold(const Lazy& lvalue, const ValidationUtils& utils) +{ + ObjectImpl::ValidateFlushThreshold(lvalue, utils); + if (lvalue() < 1) { + BOOST_THROW_EXCEPTION(ValidationError(this, {"flush_threshold"}, "Flush threshold must be at least 1.")); + } + // Protobuf limits the size of messages to be serialiazed/deserialized to max 2GiB. Thus, we can't accept + // a flush threshold that would exceed that limit with a reasonable safe margin of 10MiB for any other + // overhead in the message not accounted for in @c m_RecordedBytes. + // See https://protobuf.dev/programming-guides/proto-limits/#total. + constexpr std::size_t maxMessageSize = 2ULL * 1024 * 1024 * 1024 - 10 * 1024 * 1024; + if (static_cast(lvalue()) > maxMessageSize) { + BOOST_THROW_EXCEPTION(ValidationError( + this, + {"flush_threshold"}, + "Flush threshold too high, would exceed Protobuf message size limit of 2GiB (1.9GiB max allowed)." 
+ )); + } +} diff --git a/lib/perfdata/otlpmetricswriter.hpp b/lib/perfdata/otlpmetricswriter.hpp new file mode 100644 index 00000000000..a473e5798f6 --- /dev/null +++ b/lib/perfdata/otlpmetricswriter.hpp @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2026 Icinga GmbH +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "perfdata/otlpmetricswriter-ti.hpp" +#include "base/workqueue.hpp" +#include "icinga/checkable.hpp" +#include "otel/otel.hpp" +#include + +namespace icinga +{ + +class OTLPMetricsWriter final : public ObjectImpl +{ +public: + DECLARE_OBJECT(OTLPMetricsWriter); + DECLARE_OBJECTNAME(OTLPMetricsWriter); + + static void StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata); + + void Start(bool runtimeCreated) override; + void OnConfigLoaded() override; + void Resume() override; + void Pause() override; + +protected: + void ValidatePort(const Lazy& lvalue, const ValidationUtils& utils) override; + void ValidateFlushInterval(const Lazy& lvalue, const ValidationUtils& utils) override; + void ValidateFlushThreshold(const Lazy& lvalue, const ValidationUtils& utils) override; + +private: + void CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr); + void Flush(bool fromTimer = false); + void AddBytesAndFlushIfNeeded(std::size_t newBytes = 0); + + template + [[nodiscard]] std::size_t Record( + const Checkable::Ptr& checkable, + std::string_view metric, + T value, + double startTime, + double endTime, + OTel::AttrsMap attrs + ); + + std::atomic_uint64_t m_RecordedBytes{0}; // Total bytes recorded in the current OTel message. + std::atomic_uint64_t m_DataPointsCount{0}; // Total data points recorded in the current OTel message. + + // Checkables and their associated OTel ResourceMetrics that are being recorded for the current OTel message. 
+ std::unordered_map> m_Metrics; + + WorkQueue m_WorkQueue{10'000'000, 1}; + boost::signals2::connection m_CheckResultsSlot, m_ActiveChangedSlot; + OTel::Ptr m_Exporter; + Timer::Ptr m_FlushTimer; + std::atomic_bool m_TimerFlushInProgress{false}; // Whether a timer-initiated flush is in progress. +}; + +} // namespace icinga diff --git a/lib/perfdata/otlpmetricswriter.ti b/lib/perfdata/otlpmetricswriter.ti new file mode 100644 index 00000000000..3423305a69e --- /dev/null +++ b/lib/perfdata/otlpmetricswriter.ti @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: 2026 Icinga GmbH +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "base/configobject.hpp" + +library perfdata; + +namespace icinga +{ + +class OTLPMetricsWriter : ConfigObject +{ + activation_priority 100; + + [config, required, no_user_modify] String host { + default {{{ return "127.0.0.1"; }}} + }; + [config, no_user_modify] int port { + default {{{ return 4318; }}} + }; + [config, required, no_user_modify] String metrics_endpoint { + default {{{ return "/v1/metrics"; }}} + }; + + [config, required] String service_namespace { + default {{{ return "icinga"; }}} + }; + + [config, no_user_view, no_user_modify] Dictionary::Ptr basic_auth; + + [config] int flush_interval { + default {{{ return 15; }}} + }; + [config] int64_t flush_threshold { + default {{{ return 32 * 1024 * 1024; }}} + }; + [config] bool enable_ha { + default {{{ return false; }}} + }; + [config] bool enable_send_thresholds { + default {{{ return false; }}} + }; + [config] int disconnect_timeout { + default {{{ return 10; }}} + }; + + [config, no_user_modify] bool enable_tls { + default {{{ return false; }}} + }; + [config, no_user_modify] bool tls_insecure_noverify { + default {{{ return false; }}} + }; + [config, no_user_modify] String tls_ca_file; + [config, no_user_modify] String tls_cert_file; + [config, no_user_modify] String tls_key_file; +}; + +validator OTLPMetricsWriter +{ + Dictionary basic_auth { + required username; + String 
username; + required password; + String password; + }; +}; + +} // namespace icinga diff --git a/tools/mkclass/classcompiler.cpp b/tools/mkclass/classcompiler.cpp index e32d795d582..b17486cfe22 100644 --- a/tools/mkclass/classcompiler.cpp +++ b/tools/mkclass/classcompiler.cpp @@ -152,7 +152,7 @@ static std::string FieldTypeToIcingaName(const Field& field, bool inner) if (field.Attributes & FAEnum) return "Number"; - if (ftype == "int" || ftype == "double") + if (ftype == "int" || ftype == "int64_t" || ftype == "double") return "Number"; else if (ftype == "bool") return "Boolean"; From 8d4a69e34382293d161738b2bea87bde06a23be9 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 20 Jan 2026 17:38:58 +0100 Subject: [PATCH 08/20] docs: document `OTLPMetricsWriter` feature --- doc/06-distributed-monitoring.md | 1 + doc/09-object-types.md | 37 +++++ doc/14-features.md | 249 +++++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+) diff --git a/doc/06-distributed-monitoring.md b/doc/06-distributed-monitoring.md index d38b83612c0..e03f22bcc68 100644 --- a/doc/06-distributed-monitoring.md +++ b/doc/06-distributed-monitoring.md @@ -2959,6 +2959,7 @@ By default, the following features provide advanced HA functionality: * [Graphite](09-object-types.md#objecttype-graphitewriter) * [InfluxDB](09-object-types.md#objecttype-influxdb2writer) (v1 and v2) * [OpenTsdb](09-object-types.md#objecttype-opentsdbwriter) +* [OTLPMetrics](09-object-types.md#objecttype-otlpmetricswriter) * [Perfdata](09-object-types.md#objecttype-perfdatawriter) (for PNP) #### High-Availability with Checks diff --git a/doc/09-object-types.md b/doc/09-object-types.md index 4caf327dee6..8bff4fb9a69 100644 --- a/doc/09-object-types.md +++ b/doc/09-object-types.md @@ -1871,6 +1871,43 @@ Configuration Attributes: host_template | Dictionary | **Optional.** Specify additional tags to be included with host metrics. This requires a sub-dictionary named `tags`. 
Also specify a naming prefix by setting `metric`. More information can be found in [OpenTSDB custom tags](14-features.md#opentsdb-custom-tags) and [OpenTSDB Metric Prefix](14-features.md#opentsdb-metric-prefix). More information can be found in [OpenTSDB custom tags](14-features.md#opentsdb-custom-tags). Defaults to an `empty Dictionary`. service_template | Dictionary | **Optional.** Specify additional tags to be included with service metrics. This requires a sub-dictionary named `tags`. Also specify a naming prefix by setting `metric`. More information can be found in [OpenTSDB custom tags](14-features.md#opentsdb-custom-tags) and [OpenTSDB Metric Prefix](14-features.md#opentsdb-metric-prefix). Defaults to an `empty Dictionary`. +### OTLPMetricsWriter + +Emits metrics in [OpenTelemetry Protocol (OTLP)](https://opentelemetry.io/) format to a defined OpenTelemetry Collector +or any other OTLP-compatible backend that accepts OTLP data over HTTP. This configuration object is available as +[otlpmetrics feature](14-features.md#otlpmetrics-writer). You can find more information about OpenTelemetry and OTLP +on the [OpenTelemetry website](https://opentelemetry.io/). + +A basic copy and pastable example configuration is shown below: + +``` +object OTLPMetricsWriter "otlp-metrics" { + host = "127.0.0.1" + port = 4318 + metrics_endpoint = "/v1/metrics" + service_namespace = "icinga2-production" +} +``` + +There are more configuration options available as described in the table below. + +| Name | Type | Description | +|-------------------------------|------------|----------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | **Required.** OTLP backend host address. Defaults to `127.0.0.1`. | +| port | Number | **Required.** OTLP backend HTTP port. Defaults to `4318`. | +| metrics\_endpoint | String | **Required.** OTLP metrics endpoint path. Defaults to `/v1/metrics`. 
| +| service\_namespace | String | **Required.** The namespace to associate with emitted metrics used in the `service.namespace` OTel resource attribute. Defaults to `icinga`. | +| basic\_auth | Dictionary | **Optional.** Username and password for HTTP basic authentication. | +| flush\_interval | Duration | **Optional.** How long to buffer data points before transferring to the OTLP backend. Defaults to `15s`. | +| flush\_threshold | Number | **Optional.** How many bytes to buffer before forcing a transfer to the OTLP backend. Defaults to `32MiB`. | +| enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Has no effect in non-cluster setups. Defaults to `false`. | +| enable\_send\_thresholds | Boolean | **Optional.** Whether to stream warning, critical, minimum & maximum as separate metrics to the OTLP backend. Defaults to `false`. | +| disconnect\_timeout | Duration | **Optional.** Timeout to wait for any outstanding data to be flushed to the OTLP backend before disconnecting. Defaults to `10s`. | +| enable\_tls | Boolean | **Optional.** Whether to use a TLS stream. Defaults to `false`. | +| tls\_insecure\_noverify | Boolean | **Optional.** Disable TLS peer verification. Defaults to `false`. | +| tls\_ca\_file | String | **Optional.** Path to CA certificate to validate the remote host. | +| tls\_cert\_file | String | **Optional.** Path to the client certificate to present to the OTLP backend for mutual verification. | +| tls\_key\_file | String | **Optional.** Path to the client certificate key. | ### PerfdataWriter diff --git a/doc/14-features.md b/doc/14-features.md index c27bdc0ec80..4400675a254 100644 --- a/doc/14-features.md +++ b/doc/14-features.md @@ -73,6 +73,7 @@ best practice is to provide performance data. 
This data is parsed by features sending metrics to time series databases (TSDB): +* [OpenTelemetry](14-features.md#otlpmetrics-writer) * [Graphite](14-features.md#graphite-carbon-cache-writer) * [InfluxDB](14-features.md#influxdb-writer) * [OpenTSDB](14-features.md#opentsdb-writer) @@ -644,6 +645,254 @@ mechanism ensures that metrics are written even if the cluster fails. The recommended way of running OpenTSDB in this scenario is a dedicated server where you have OpenTSDB running. +### OTLPMetrics Writer + +The [OpenTelemetry Protocol (OTLP/HTTP)](https://opentelemetry.io/docs/specs/otlp/#otlphttp) metrics Writer feature +allows Icinga 2 to send metrics to OpenTelemetry Collector or any other backend that supports the OTLP HTTP protocol, +such as [Prometheus OTLP](https://prometheus.io/docs/guides/opentelemetry/) receiver, +[Grafana Mimir](https://grafana.com/docs/mimir/latest/configure/configure-otel-collector/), +[OpenSearch Data Prepper](https://docs.opensearch.org/latest/data-prepper/pipelines/configuration/sources/otlp-source/), +etc. It enables seamless integration of Icinga 2 metrics into modern observability stacks, allowing you to leverage the +capabilities of OpenTelemetry for advanced analysis and visualization of your monitoring data. OpenTelemetry provides a +standardized way to collect, process, and export telemetry data, making it easier to integrate with numerous +[monitoring and observability](https://opentelemetry.io/docs/collector/components/exporter/) tools effortlessly. + +!!! note + + This feature has successfully been tested with OpenTelemetry Collector, Prometheus OTLP receiver, OpenSearch Data + Prepper, and Grafana Mimir. However, it should work with any backend that supports the OTLP HTTP protocol as well. 
+ +In order to enable this feature, you can use the following command: + +```bash +icinga2 feature enable otlpmetrics +``` + +By default, the OTLPMetrics Writer expects the OpenTelemetry Collector or any other OTLP HTTP receiver to listen at +`127.0.0.1` on port `4318` but most of the third-party backends use their own ports, so you may need to adjust the +configuration accordingly. Additionally, the `metrics_endpoint` can vary based on the backend you are using. +For example, OpenTelemetry Collector uses `/v1/metrics` by default, while the Prometheus OTLP receiver uses +`/api/v1/otlp/v1/metrics`. Therefore, it is important to set the correct `metrics_endpoint` in the configuration file. + +You can find more details about the configuration options [here](09-object-types.md#objecttype-otlpmetricswriter). + +The generated metric names follow the OpenTelemetry naming conventions and cannot be customized by end-users and are +therefore always the same across all Icinga 2 installations. The OTLP metrics writer currently sends the following metrics: + +| Metric Name | Description | +|-----------------------|----------------------------------------------------------------------| +| state_check.perfdata | Performance data metrics from checks. | +| state_check.threshold | Threshold values for perfdata metrics (warning, critical, min, max). | + +By default, the writer will not stream any data point for the `state_check.threshold` metric. To enable the streaming +of threshold metrics, you need to set the `enable_send_thresholds` option to `true` in the OTLPMetrics Writer +configuration. Once enabled, it will send the threshold values for each performance data metric if they are available +in the produced check results. + +The data points type for all the above metrics is [`gauge`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#gauge) +and the perfdata labels and their units (if available) are mapped to OpenTelemetry metric points attributes. 
For example, +a perfdata label `file_size` with a value of `42` and unit `B` will be sent to the `state_check.perfdata` metric stream, +with a metric point having a value of `42`, along with the attributes `perfdata_label="file_size"` and `unit="B"`. +Additionally, each metric point will also include other relevant attributes such as `icinga2.host.name`, `icinga2.service.name`, +`icinga2.command.name`, etc. as resource attributes. You can find the full list of metric point formats and attributes +in the [OTLPMetrics data format](#otlpmetrics-data-format) section below. + +At the moment, the OTLPMetrics Writer allows you to configure only a single metrics resource attribute +[`service.namespace`](https://opentelemetry.io/docs/specs/semconv/registry/attributes/service/#service-namespace) via +the `service_namespace` option in the OTLPMetrics Writer config. This attribute can be used to group related metrics +together in the backend. By default, it is set to `icinga`. You can customize it to better fit your monitoring +environment. For example, you might set it to `production`, `staging`, or any other relevant namespace that categorizes +your Icinga 2 metrics emitted to the OpenTelemetry backend effectively. + +#### OTLPMetrics in HA Cluster Zones + +This writer supports [High Availability (HA)](06-distributed-monitoring.md#distributed-monitoring-high-availability-features) +cluster zones in Icinga 2. By default, the `enable_ha` option is set to `true` in the OTLPMetrics Writer config, which +means that only one writer in the cluster will be active at any given time, sending metrics to the configured OTLP backend. +The other OTLPMetrics Writer will remain in standby mode and ready to take over if the active endpoint fails or becomes +unavailable for any reason. However, due to how HA works in Icinga 2, the failover mechanism won't take place until the +two endpoints in the cluster lose connection with each other, and not just when the OTLPMetrics Writer fails. 
Therefore, +as long as the cluster connection is healthy, the other writer won't take over even if the active writer encounters some +issues connecting to the OTLP backend or sending metrics. + +In general, do not set `enable_ha` to `false` unless you have a specific use case that requires multiple OTLPMetrics +Writer instances to be active at the same time, sending metrics to different OTLP backends. In most cases, it is +recommended to keep `enable_ha` set to `true` to ensure that only one writer is active even in a non-HA cluster zone. + +#### OTLPMetrics Data Format + +The OTLPMetrics Writer sends metrics to the configured OTLP HTTP endpoint in the OpenTelemetry Protocol (OTLP) format. +The metric names and attributes follow the OpenTelemetry naming conventions. The `state_check.perfdata` metric includes +performance data metrics from checks, while the `state_check.threshold` metric is used to stream all threshold related +data points. In general, both metric streams share the same set of resource attributes, they only differ in the concrete +metric point attributes. Below is an example of the full data format for both metrics and can be used as a reference for +configuring your OTLP backend to properly receive and process the emitted metrics. 
+ +```json +{ + "resourceMetrics": [ + { + "resource": { + "attributes": [ + { + "key": "service.name", + "value": { + "stringValue": "Icinga 2" + } + }, + { + "key": "service.instance.id", + "value": { + "stringValue": "9a1f9d6d58648f2274c539bbdd5f09388b68fc0a" + } + }, + { + "key": "service.version", + "value": { + "stringValue": "v2.15.0-285-g196ba8e9d" + } + }, + { + "key": "telemetry.sdk.language", + "value": { + "stringValue": "cpp" + } + }, + { + "key": "telemetry.sdk.name", + "value": { + "stringValue": "Icinga 2 OTel Integration" + } + }, + { + "key": "telemetry.sdk.version", + "value": { + "stringValue": "v2.15.0-285-g196ba8e9d" + } + }, + { + "key": "service.namespace", + "value": { + "stringValue": "icinga" + } + }, + { + "key": "icinga2.host.name", + "value": { + "stringValue": "something" + } + }, + { + "key": "icinga2.service.name", + "value": { + "stringValue": "something-service" + } + }, + { + "key": "icinga2.command.name", + "value": { + "stringValue": "icinga" + } + } + ], + "entityRefs": [ + { + "type": "service", + "idKeys": [ + "icinga2.host.name", + "icinga2.service.name" + ] + } + ] + }, + "scopeMetrics": [ + { + "scope": { + "name": "icinga2", + "version": "v2.15.0-285-g196ba8e9d" + }, + "metrics": [ + { + "name": "state_check.perfdata", + "gauge": { + "dataPoints": [ + { + "attributes": [ + { + "key": "perfdata_label", + "value": { + "stringValue": "some_perfdata_label" + } + } + ], + "startTimeUnixNano": "1770385516896651008", + "timeUnixNano": "1770385516896651008", + "asDouble": 1 + } + ] + } + }, + { + "name": "state_check.threshold", + "gauge": { + "dataPoints": [ + { + "attributes": [ + { + "key": "perfdata_label", + "value": { + "stringValue": "some_perfdata_label" + } + }, + { + "key": "threshold_type", + "value": { + "stringValue": "critical" + } + } + ], + "startTimeUnixNano": "1770385516896651008", + "timeUnixNano": "1770385516896651008", + "asDouble": 0 + }, + { + "attributes": [ + { + "key": "perfdata_label", + "value": { + 
"stringValue": "some_perfdata_label" + } + }, + { + "key": "threshold_type", + "value": { + "stringValue": "warning" + } + } + ], + "startTimeUnixNano": "1770385516896651008", + "timeUnixNano": "1770385516896651008", + "asDouble": 0 + } + ] + } + } + ], + "schemaUrl": "https://opentelemetry.io/schemas/1.39.0" + } + ], + "schemaUrl": "https://opentelemetry.io/schemas/1.39.0" + } + ] +} +``` + +As you can see in the above example, most of the attributes are resource attributes that are shared across all emitted +metrics. The only attributes that are specific to the OTLPMetrics Writer have `icinga2.` prefix like `icinga2.host.name` +etc. The `state_check.perfdata` metric has an additional attribute `perfdata_label` that corresponds to the perfdata +label of the emitted metric point value. Likewise, the `state_check.threshold` metric has two additional attributes +`perfdata_label` and `threshold_type` that correspond to the perfdata label they belong to and the threshold type +(warning, critical, min, max) respectively. ### Writing Performance Data Files From 8bdfba87722f1cd1d1ed9b3b69563ea76f648b1c Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 19 Feb 2026 11:47:32 +0100 Subject: [PATCH 09/20] Allow users to provide additional resource attributes --- doc/09-object-types.md | 2 + doc/14-features.md | 35 +++++++---- .../features-available/otlpmetrics.conf | 12 ++++ lib/perfdata/otlpmetricswriter.cpp | 61 ++++++++++++++++++- lib/perfdata/otlpmetricswriter.hpp | 4 ++ lib/perfdata/otlpmetricswriter.ti | 6 ++ 6 files changed, 108 insertions(+), 12 deletions(-) diff --git a/doc/09-object-types.md b/doc/09-object-types.md index 8bff4fb9a69..4cc45d6c31c 100644 --- a/doc/09-object-types.md +++ b/doc/09-object-types.md @@ -1898,6 +1898,8 @@ There are more configuration options available as described in the table below. | metrics\_endpoint | String | **Required.** OTLP metrics endpoint path. Defaults to `/v1/metrics`. 
| | service\_namespace | String | **Required.** The namespace to associate with emitted metrics used in the `service.namespace` OTel resource attribute. Defaults to `icinga`. | | basic\_auth | Dictionary | **Optional.** Username and password for HTTP basic authentication. | +| host\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with host metrics. Defaults to none. | +| service\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with service metrics. Defaults to none. | | flush\_interval | Duration | **Optional.** How long to buffer data points before transferring to the OTLP backend. Defaults to `15s`. | | flush\_threshold | Number | **Optional.** How many bytes to buffer before forcing a transfer to the OTLP backend. Defaults to `32MiB`. | | enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Has no effect in non-cluster setups. Defaults to `false`. | diff --git a/doc/14-features.md b/doc/14-features.md index 4400675a254..c8c85419473 100644 --- a/doc/14-features.md +++ b/doc/14-features.md @@ -657,11 +657,6 @@ capabilities of OpenTelemetry for advanced analysis and visualization of your mo standardized way to collect, process, and export telemetry data, making it easier to integrate with numerous [monitoring and observability](https://opentelemetry.io/docs/collector/components/exporter/) tools effortlessly. -!!! note - - This feature has successfully been tested with OpenTelemetry Collector, Prometheus OTLP receiver, OpenSearch Data - Prepper, and Grafana Mimir. However, it should work with any backend that supports the OTLP HTTP protocol as well. - In order to enable this feature, you can use the following command: ```bash @@ -697,12 +692,30 @@ Additionally, each metric point will also include other relevant attributes such `icinga2.command.name`, etc. as resource attributes. 
You can find the full list of metric point formats and attributes in the [OTLPMetrics data format](#otlpmetrics-data-format) section below. -At the moment, the OTLPMetrics Writer allows you to configure only a single metrics resource attribute -[`service.namespace`](https://opentelemetry.io/docs/specs/semconv/registry/attributes/service/#service-namespace) via -the `service_namespace` option in the OTLPMetrics Writer config. This attribute can be used to group related metrics -together in the backend. By default, it is set to `icinga`. You can customize it to better fit your monitoring -environment. For example, you might set it to `production`, `staging`, or any other relevant namespace that categorizes -your Icinga 2 metrics emitted to the OpenTelemetry backend effectively. +In addition to the default attributes, it is also possible to configure custom resource attributes that are sent along +with the metrics to the OpenTelemetry backend. You can use the `host_resource_attributes` and `service_resource_attributes` +options in the OTLPMetrics Writer configuration to define custom resource attributes for host and service checks +respectively. You can use macros in the attribute values to dynamically populate them based on the check context. +For instance, you can add a custom resource attribute `host.os` with the value `$host.vars.os$` and it will be populated +with the value of `vars.os` for each host that has this variable defined, otherwise it will silently be ignored. +All custom resource attributes will be prefixed with `icinga2.custom.` to avoid naming conflicts with existing +OpenTelemetry and Icinga 2's built-in resource attributes. For example, if you define a custom resource attribute +`host.os`, it will be sent as `icinga2.custom.host.os` to OpenTelemetry. + +!!! warning + + Be cautious when defining custom resource attributes, as they are sent with every metric and can lead to high + cardinality issues if not used carefully. 
It is recommended to only define custom resource attributes that are + necessary for your monitoring use case and to avoid using attributes with high variability or a large number of + unique values. + +Apart from custom resource attributes, the OTLPMetrics Writer also allows you to configure an additional resource +attribute called [`service.namespace`](https://opentelemetry.io/docs/specs/semconv/registry/attributes/service/#service-namespace) +via the `service_namespace` option in the OTLPMetrics Writer configuration. This attribute is not specific to any host +or service but is a general attribute that applies to all metrics emitted by one OTLPMetrics Writer instance. +By default, it is set to `icinga`. You can customize it to better fit your monitoring environment. For example, you +might set it to `production`, `staging`, or any other relevant namespace that categorizes your Icinga 2 metrics emitted +to the OpenTelemetry backend effectively. #### OTLPMetrics in HA Cluster Zones diff --git a/etc/icinga2/features-available/otlpmetrics.conf b/etc/icinga2/features-available/otlpmetrics.conf index 39a2cacb027..9a2e635ca0c 100644 --- a/etc/icinga2/features-available/otlpmetrics.conf +++ b/etc/icinga2/features-available/otlpmetrics.conf @@ -18,6 +18,18 @@ object OTLPMetricsWriter "otlp-metrics" { // password = "otel_password" // } + # You can also add custom tags to the exported metrics based on host and service variables. + # These tags will be included in the OTel metrics as resource attributes for hosts and services, respectively. + # By default, no additional tags are added. Adjust the templates as needed to include the desired variables. + // host_resource_attributes = { + // "host.vars.env" = "$host.vars.env$" + // "host.vars.os" = "$host.vars.os$" + // } + // service_resource_attributes = { + // "service.vars.env" = "$service.vars.env$" + // "service.vars.os" = "$service.vars.os$" + // } + # These are the default settings used by the OTel writer. 
Adjust them as needed. # Please refer to the documentation for more details on each option. // enable_ha = false diff --git a/lib/perfdata/otlpmetricswriter.cpp b/lib/perfdata/otlpmetricswriter.cpp index 2ad480c7127..269b261663c 100644 --- a/lib/perfdata/otlpmetricswriter.cpp +++ b/lib/perfdata/otlpmetricswriter.cpp @@ -11,6 +11,7 @@ #include "base/statsfunction.hpp" #include "icinga/checkable.hpp" #include "icinga/checkcommand.hpp" +#include "icinga/macroprocessor.hpp" #include "icinga/service.hpp" #include @@ -200,7 +201,7 @@ void OTLPMetricsWriter::CheckResultHandler(const Checkable::Ptr& checkable, cons if (auto unit = pdv->GetUnit(); !unit.IsEmpty()) { attrs.emplace("unit", std::move(unit)); } - AddBytesAndFlushIfNeeded(Record(checkable, l_PerfdataMetric, pdv->GetValue(), startTime, endTime, std::move(attrs))); + AddBytesAndFlushIfNeeded(Record(checkable, cr, l_PerfdataMetric, pdv->GetValue(), startTime, endTime, std::move(attrs))); if (GetEnableSendThresholds()) { std::array, 4> thresholds{{ @@ -218,6 +219,7 @@ void OTLPMetricsWriter::CheckResultHandler(const Checkable::Ptr& checkable, cons AddBytesAndFlushIfNeeded( Record( checkable, + cr, l_ThresholdMetric, Convert::ToDouble(threshold), startTime, @@ -279,6 +281,7 @@ void OTLPMetricsWriter::AddBytesAndFlushIfNeeded(std::size_t newBytes) * @tparam T The type of the data point to record (e.g., int64_t, double). * * @param checkable The configuration object to associate the metric with. + * @param cr The check result associated with the metric data point, used for macro resolution in attributes. * @param metric The OTel metric enum value indicating which metric stream to record the data point for. * @param value The data point value to record. * @param startTime The start time of the data point in seconds. 
@@ -290,6 +293,7 @@ void OTLPMetricsWriter::AddBytesAndFlushIfNeeded(std::size_t newBytes) template std::size_t OTLPMetricsWriter::Record( const Checkable::Ptr& checkable, + const CheckResult::Ptr& cr, std::string_view metric, T value, double startTime, @@ -327,6 +331,31 @@ std::size_t OTLPMetricsWriter::Record( } attr = resource->add_attributes(); OTel::SetAttribute(*attr, "icinga2.command.name"sv, checkable->GetCheckCommand()->GetName()); + + if (Dictionary::Ptr tmpl = service ? GetServiceResourceAttributes() : GetHostResourceAttributes(); tmpl) { + MacroProcessor::ResolverList resolvers{{"host", host}}; + if (service) { + resolvers.emplace_back("service", service); + } + + ObjectLock olock(tmpl); + for (const Dictionary::Pair& pair : tmpl) { + String missingMacro; + auto resolvedVal = MacroProcessor::ResolveMacros(pair.second, resolvers, cr, &missingMacro); + if (missingMacro.IsEmpty()) { + attr = resource->add_attributes(); + try { + OTel::SetAttribute(*attr, "icinga2.custom." + pair.first, resolvedVal); + } catch (const std::exception& ex) { + Log(LogWarning, "OTLPMetricsWriter") + << "Ignoring invalid resource attribute '" << pair.first << "' for checkable '" + << checkable->GetName() << "': " << ex.what(); + // Remove the last attribute from the list which is the one we just attempted to set. 
+ resource->mutable_attributes()->RemoveLast(); + } + } + } + } bytes = resourceMetrics->ByteSizeLong(); } @@ -383,3 +412,33 @@ void OTLPMetricsWriter::ValidateFlushThreshold(const Lazy& lvalue, cons )); } } + +void OTLPMetricsWriter::ValidateHostResourceAttributes(const Lazy& lvalue, const ValidationUtils& utils) +{ + ObjectImpl::ValidateHostResourceAttributes(lvalue, utils); + if (const auto& tags{lvalue()}; tags) { + ValidateResourceAttributes(tags, "host_resource_attributes"); + } +} + +void OTLPMetricsWriter::ValidateServiceResourceAttributes(const Lazy& lvalue, const ValidationUtils& utils) +{ + ObjectImpl::ValidateServiceResourceAttributes(lvalue, utils); + if (const auto& tags{lvalue()}; tags) { + ValidateResourceAttributes(tags, "service_resource_attributes"); + } +} + +void OTLPMetricsWriter::ValidateResourceAttributes(const Dictionary::Ptr& tmpl, const String& attrName) +{ + ObjectLock olock(tmpl); + for (const auto& pair : tmpl) { + if (!MacroProcessor::ValidateMacroString(pair.second)) { + BOOST_THROW_EXCEPTION(ValidationError( + this, + {attrName, pair.first}, + "Closing $ not found in macro format string '" + pair.second + "'." 
+ )); + } + } +} diff --git a/lib/perfdata/otlpmetricswriter.hpp b/lib/perfdata/otlpmetricswriter.hpp index a473e5798f6..2a7eee8f8b8 100644 --- a/lib/perfdata/otlpmetricswriter.hpp +++ b/lib/perfdata/otlpmetricswriter.hpp @@ -29,15 +29,19 @@ class OTLPMetricsWriter final : public ObjectImpl void ValidatePort(const Lazy& lvalue, const ValidationUtils& utils) override; void ValidateFlushInterval(const Lazy& lvalue, const ValidationUtils& utils) override; void ValidateFlushThreshold(const Lazy& lvalue, const ValidationUtils& utils) override; + void ValidateHostResourceAttributes(const Lazy& lvalue, const ValidationUtils& utils) override; + void ValidateServiceResourceAttributes(const Lazy& lvalue, const ValidationUtils& utils) override; private: void CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr); void Flush(bool fromTimer = false); void AddBytesAndFlushIfNeeded(std::size_t newBytes = 0); + void ValidateResourceAttributes(const Dictionary::Ptr& tmpl, const String& attrName); template [[nodiscard]] std::size_t Record( const Checkable::Ptr& checkable, + const CheckResult::Ptr& cr, std::string_view metric, T value, double startTime, diff --git a/lib/perfdata/otlpmetricswriter.ti b/lib/perfdata/otlpmetricswriter.ti index 3423305a69e..214b12baaa5 100644 --- a/lib/perfdata/otlpmetricswriter.ti +++ b/lib/perfdata/otlpmetricswriter.ti @@ -28,6 +28,9 @@ class OTLPMetricsWriter : ConfigObject [config, no_user_view, no_user_modify] Dictionary::Ptr basic_auth; + [config] Dictionary::Ptr host_resource_attributes; + [config] Dictionary::Ptr service_resource_attributes; + [config] int flush_interval { default {{{ return 15; }}} }; @@ -63,6 +66,9 @@ validator OTLPMetricsWriter required password; String password; }; + + Dictionary host_resource_attributes { String "*"; }; + Dictionary service_resource_attributes { String "*"; }; }; } // namespace icinga From 61daf9b4590511a00f8ec242f311d0bf11329469 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Alexander=20Aleksandrovi=C4=8D=20Klimov?= Date: Wed, 18 Feb 2026 16:30:45 +0100 Subject: [PATCH 10/20] Linux GHA: remove unnecessary `"${SCL_ENABLE_GCC[@]}"` --- .github/workflows/linux.bash | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux.bash b/.github/workflows/linux.bash index 516c1ed37d9..00e753a698b 100755 --- a/.github/workflows/linux.bash +++ b/.github/workflows/linux.bash @@ -155,8 +155,8 @@ cd /icinga2/build -DICINGA2_GROUP=$(id -gn) \ "${CMAKE_OPTS[@]}" .. -"${SCL_ENABLE_GCC[@]}" ninja -v +ninja -v -"${SCL_ENABLE_GCC[@]}" ninja test -"${SCL_ENABLE_GCC[@]}" ninja install +ninja test +ninja install icinga2 daemon -C From 8f36bdcddc6b25f5fc21439a19e48dc9aab886ec Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Thu, 19 Mar 2026 13:43:06 +0100 Subject: [PATCH 11/20] Replace `for` with a simpler `while` loop & fix a typo --- etc/icinga2/features-available/otlpmetrics.conf | 2 +- lib/otel/otel.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/icinga2/features-available/otlpmetrics.conf b/etc/icinga2/features-available/otlpmetrics.conf index 9a2e635ca0c..a4ccf6031e6 100644 --- a/etc/icinga2/features-available/otlpmetrics.conf +++ b/etc/icinga2/features-available/otlpmetrics.conf @@ -46,7 +46,7 @@ object OTLPMetricsWriter "otlp-metrics" { # You can enable TLS encryption by uncommenting and configuring the following options. # By default, the OTel writer uses unencrypted connections (plain HTTP requests). 
// enable_tls = false - // tl_insecure_noverify = false + // tls_insecure_noverify = false // tls_ca_file = "/path/to/otel/ca.crt" // tls_cert_file = "/path/to/otel/client.crt" // tls_key_file = "/path/to/otel/client.key" diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index 7d9c55765e0..53ab5ccae85 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -546,10 +546,10 @@ std::size_t OTel::Record(Gauge& gauge, T data, double start, double end, AttrsMa static_cast(ch::duration_cast(ch::duration(end)).count()) ); - for (auto it{attrs.begin()}; it != attrs.end(); /* NOPE */) { + while (!attrs.empty()) { auto* attr = dataPoint->add_attributes(); - auto node = attrs.extract(it++); - SetAttribute(*attr, node.key(), node.mapped()); + auto node = attrs.extract(attrs.begin()); + SetAttribute(*attr, std::move(node.key()), std::move(node.mapped())); } return dataPoint->ByteSizeLong(); } From 0718632f409f646f026492525f2f653861e7647c Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 19 Mar 2026 17:05:49 +0100 Subject: [PATCH 12/20] tests: fix `testbase` linker error --- test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c20e0aaf54c..94296c38bd6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -144,6 +144,10 @@ if(ICINGA2_WITH_NOTIFICATION) ) endif() +if(ICINGA2_WITH_OPENTELEMETRY) + list(APPEND base_test_SOURCES $) +endif() + if(ICINGA2_WITH_PERFDATA) list(APPEND base_test_SOURCES perfdata-elasticsearchwriter.cpp From 3f68eea1fd10ae84c54dcd5e55b9d0a363255205 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 24 Mar 2026 12:07:42 +0100 Subject: [PATCH 13/20] Reduce default `flush_threshold` to `16MiB` So that it doesn't cause `request body too large` errors when used with the default OpenTelemetry Collector config that has `max_request_body_size` set to `20MiB`. 
--- doc/09-object-types.md | 11 ++++++++++- etc/icinga2/features-available/otlpmetrics.conf | 2 +- lib/perfdata/otlpmetricswriter.ti | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/09-object-types.md b/doc/09-object-types.md index 4cc45d6c31c..02d4cf9a2c2 100644 --- a/doc/09-object-types.md +++ b/doc/09-object-types.md @@ -1901,7 +1901,7 @@ There are more configuration options available as described in the table below. | host\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with host metrics. Defaults to none. | | service\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with service metrics. Defaults to none. | | flush\_interval | Duration | **Optional.** How long to buffer data points before transferring to the OTLP backend. Defaults to `15s`. | -| flush\_threshold | Number | **Optional.** How many bytes to buffer before forcing a transfer to the OTLP backend. Defaults to `32MiB`. | +| flush\_threshold | Number | **Optional.** How many bytes to buffer before forcing a transfer to the OTLP backend. Defaults to `16MiB`. | | enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Has no effect in non-cluster setups. Defaults to `false`. | | enable\_send\_thresholds | Boolean | **Optional.** Whether to stream warning, critical, minimum & maximum as separate metrics to the OTLP backend. Defaults to `false`. | | disconnect\_timeout | Duration | **Optional.** Timeout to wait for any outstanding data to be flushed to the OTLP backend before disconnecting. Defaults to `10s`. | @@ -1911,6 +1911,15 @@ There are more configuration options available as described in the table below. | tls\_cert\_file | String | **Optional.** Path to the client certificate to present to the OTLP backend for mutual verification. | | tls\_key\_file | String | **Optional.** Path to the client certificate key. | +!!! 
tip + + The `flush_threshold` is a byte size threshold, not a metric count threshold. By default, the writer will flush all + buffered metrics to the OTLP backend once the total size of buffered metrics exceeds 16 MiB. This number is chosen + based on the default `max_request_body_size` of the OpenTelemetry Collector, and you must adjust it according to the + `max_request_body_size` of your OTLP backend to avoid metrics being dropped due to exceeding the maximum request body + size. Furthermore, the writer may not flush at the exact byte size threshold due to the internal structure of OTLP + messages, so make sure that the threshold is lower than the configured `max_request_body_size` of your OTLP backend. + ### PerfdataWriter Writes check result performance data to a defined path using macro diff --git a/etc/icinga2/features-available/otlpmetrics.conf b/etc/icinga2/features-available/otlpmetrics.conf index a4ccf6031e6..bd5af67a292 100644 --- a/etc/icinga2/features-available/otlpmetrics.conf +++ b/etc/icinga2/features-available/otlpmetrics.conf @@ -34,7 +34,7 @@ object OTLPMetricsWriter "otlp-metrics" { # Please refer to the documentation for more details on each option. // enable_ha = false // flush_interval = 15s - // flush_threshold = 32*1024*1024 + // flush_threshold = 16*1024*1024 # When stopping Icinga 2, this timeout defines how long to wait for any pending OTel # metrics to be sent before disconnecting and discarding them. 
// disconnect_timeout = 10s diff --git a/lib/perfdata/otlpmetricswriter.ti b/lib/perfdata/otlpmetricswriter.ti index 214b12baaa5..7e3ba9ea11f 100644 --- a/lib/perfdata/otlpmetricswriter.ti +++ b/lib/perfdata/otlpmetricswriter.ti @@ -35,7 +35,7 @@ class OTLPMetricsWriter : ConfigObject default {{{ return 15; }}} }; [config] int64_t flush_threshold { - default {{{ return 32 * 1024 * 1024; }}} + default {{{ return 16 * 1024 * 1024; }}} }; [config] bool enable_ha { default {{{ return false; }}} From e6c420e1060a5b10b45634ffe711ece11fb8efda Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 24 Mar 2026 12:33:06 +0100 Subject: [PATCH 14/20] OTLP: Set `enable_ha` to true by default --- doc/09-object-types.md | 2 +- etc/icinga2/features-available/otlpmetrics.conf | 2 +- lib/perfdata/otlpmetricswriter.ti | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/09-object-types.md b/doc/09-object-types.md index 02d4cf9a2c2..3fe29702f2d 100644 --- a/doc/09-object-types.md +++ b/doc/09-object-types.md @@ -1902,7 +1902,7 @@ There are more configuration options available as described in the table below. | service\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with service metrics. Defaults to none. | | flush\_interval | Duration | **Optional.** How long to buffer data points before transferring to the OTLP backend. Defaults to `15s`. | | flush\_threshold | Number | **Optional.** How many bytes to buffer before forcing a transfer to the OTLP backend. Defaults to `16MiB`. | -| enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Has no effect in non-cluster setups. Defaults to `false`. | +| enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Has no effect in non-cluster setups. Defaults to `true`. | | enable\_send\_thresholds | Boolean | **Optional.** Whether to stream warning, critical, minimum & maximum as separate metrics to the OTLP backend. 
Defaults to `false`. | | disconnect\_timeout | Duration | **Optional.** Timeout to wait for any outstanding data to be flushed to the OTLP backend before disconnecting. Defaults to `10s`. | | enable\_tls | Boolean | **Optional.** Whether to use a TLS stream. Defaults to `false`. | diff --git a/etc/icinga2/features-available/otlpmetrics.conf b/etc/icinga2/features-available/otlpmetrics.conf index bd5af67a292..6808c2015fd 100644 --- a/etc/icinga2/features-available/otlpmetrics.conf +++ b/etc/icinga2/features-available/otlpmetrics.conf @@ -32,7 +32,7 @@ object OTLPMetricsWriter "otlp-metrics" { # These are the default settings used by the OTel writer. Adjust them as needed. # Please refer to the documentation for more details on each option. - // enable_ha = false + // enable_ha = true // flush_interval = 15s // flush_threshold = 16*1024*1024 # When stopping Icinga 2, this timeout defines how long to wait for any pending OTel diff --git a/lib/perfdata/otlpmetricswriter.ti b/lib/perfdata/otlpmetricswriter.ti index 7e3ba9ea11f..f9f3ca8e38e 100644 --- a/lib/perfdata/otlpmetricswriter.ti +++ b/lib/perfdata/otlpmetricswriter.ti @@ -38,7 +38,7 @@ class OTLPMetricsWriter : ConfigObject default {{{ return 16 * 1024 * 1024; }}} }; [config] bool enable_ha { - default {{{ return false; }}} + default {{{ return true; }}} }; [config] bool enable_send_thresholds { default {{{ return false; }}} From 715aacc19ccddcbc77e1f357225c584a83171b75 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 31 Mar 2026 09:25:08 +0200 Subject: [PATCH 15/20] Don't manually include custom Protobuf dir via compiler flag Co-Authored-By: Johannes Schmidt --- .github/workflows/linux.bash | 5 +---- lib/otel/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/linux.bash b/.github/workflows/linux.bash index 00e753a698b..296b6a88558 100755 --- a/.github/workflows/linux.bash +++ b/.github/workflows/linux.bash @@ -6,7 +6,6 @@ export CCACHE_DIR=/icinga2/ccache 
export CTEST_OUTPUT_ON_FAILURE=1 CMAKE_OPTS=() SCL_ENABLE_GCC=() -PROTOBUF_INCLUDE_DIR="" # -Wstringop-overflow is notorious for false positives and has been a problem for years. # See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88443 # -Wtemplate-id-cdtor leaks from using the generated headers. We should reenable this once @@ -100,8 +99,6 @@ gpgcheck=1 gpgkey=https://packages.icinga.com/icinga.key EOF dnf install -y icinga-protobuf - # And of course, make sure to add our custom Protobuf includes to the compiler include path. - PROTOBUF_INCLUDE_DIR="-isystem $(rpm -E '%{_includedir}')/icinga-protobuf" # Tell CMake where to find our own Protobuf CMake config files. CMAKE_OPTS+=(-DCMAKE_PREFIX_PATH="$(rpm -E '%{_libdir}')/icinga-protobuf/cmake") ;; @@ -138,7 +135,7 @@ case "$DISTRO" in if [ "$DISTRO" = "amazonlinux:2" ]; then CMAKE_OPTS+=(-DICINGA2_WITH_OPENTELEMETRY=OFF) fi - CMAKE_OPTS+=(-DCMAKE_{C,CXX}_FLAGS="$(rpm -E '%{optflags} %{?march_flag}') ${WARN_FLAGS} ${PROTOBUF_INCLUDE_DIR}") + CMAKE_OPTS+=(-DCMAKE_{C,CXX}_FLAGS="$(rpm -E '%{optflags} %{?march_flag}') ${WARN_FLAGS}") export LDFLAGS="$(rpm -E '%{?build_ldflags}')" ;; esac diff --git a/lib/otel/CMakeLists.txt b/lib/otel/CMakeLists.txt index dca82a91d86..a73cd57a64e 100644 --- a/lib/otel/CMakeLists.txt +++ b/lib/otel/CMakeLists.txt @@ -33,7 +33,7 @@ add_library(otel OBJECT ${otel_SOURCES}) add_dependencies(otel base remote) target_include_directories(otel SYSTEM PUBLIC - ${Protobuf_INCLUDE_DIRS} + $ ${CMAKE_CURRENT_BINARY_DIR} ) From 96c3364ab0783e7803ce7e71fd6e47508914e97b Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 31 Mar 2026 11:28:57 +0200 Subject: [PATCH 16/20] OTel: fix race condition triggered on Icinga 2 reload/shutdown Co-Authored-By: Julian Brost --- lib/otel/otel.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index 53ab5ccae85..02d30fb6182 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -108,12 +108,20 
@@ void OTel::Stop() } std::visit([this, &yc](auto& stream) { - { + // We only wait for ongoing export operations to complete if we're currently exporting, + // otherwise there will be nothing that would wake us up from the `WaitForClear` sleep + // below, and we would end up blocking indefinitely, so we have to check the exporting + // state here first. + if (Exporting()) { Timeout writerTimeout(m_Strand, boost::posix_time::seconds(5), [&stream] { boost::system::error_code ec; stream->lowest_layer().cancel(ec); }); m_Export.WaitForClear(yc); + } else { + // This must be in cleared state when stopping, so we clear it just in case to avoid + // any potential issues with the export loop waiting on it after the next resume/start. + m_Export.Clear(); } using StreamType = std::decay_t; From 044f85ee76d3edec6d26b5cfdef27fcb72ddb36d Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 31 Mar 2026 14:36:36 +0200 Subject: [PATCH 17/20] OTel: do not perform graceful disconnect on I/O timeout --- lib/otel/otel.cpp | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index 02d30fb6182..208972e94ac 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -107,33 +107,33 @@ void OTel::Stop() return; } - std::visit([this, &yc](auto& stream) { - // We only wait for ongoing export operations to complete if we're currently exporting, - // otherwise there will be nothing that would wake us up from the `WaitForClear` sleep - // below, and we would end up blocking indefinitely, so we have to check the exporting - // state here first. 
- if (Exporting()) { - Timeout writerTimeout(m_Strand, boost::posix_time::seconds(5), [&stream] { - boost::system::error_code ec; - stream->lowest_layer().cancel(ec); - }); - m_Export.WaitForClear(yc); - } else { - // This must be in cleared state when stopping, so we clear it just in case to avoid - // any potential issues with the export loop waiting on it after the next resume/start. - m_Export.Clear(); - } + // We only wait for ongoing export operations to complete if we're currently exporting, + // otherwise there will be nothing that would wake us up from the `WaitForClear` sleep + // below, and we would end up blocking indefinitely, so we have to check the exporting + // state here first. + if (Exporting()) { + Timeout writerTimeout(m_Strand, boost::posix_time::seconds(5), [this] { + boost::system::error_code ec; + std::visit([&ec](auto& stream) { stream->lowest_layer().cancel(ec); }, *m_Stream); + }); + m_Export.WaitForClear(yc); + } else { + // This must be in cleared state when stopping, so we clear it just in case to avoid + // any potential issues with the export loop waiting on it after the next resume/start. + m_Export.Clear(); + } - using StreamType = std::decay_t; - if constexpr (std::is_same_v::Ptr>) { - stream->GracefulDisconnect(m_Strand, yc); - } else { - static_assert(std::is_same_v::Ptr>, "Unknown stream type"); + // Check if the stream is still valid before attempting to disconnect, since the above lowest_layer.cancel() + // may have caused the export loop to detect a broken connection and reset the stream already. 
+ if (m_Stream) { + if (auto* tlsStreamPtr = std::get_if::Ptr>(&*m_Stream); tlsStreamPtr) { + (*tlsStreamPtr)->GracefulDisconnect(m_Strand, yc); + } else if (auto* tcpStreamPtr = std::get_if::Ptr>(&*m_Stream); tcpStreamPtr) { boost::system::error_code ec; - stream->lowest_layer().shutdown(AsioTcpStream::lowest_layer_type::shutdown_both, ec); - stream->lowest_layer().close(ec); + (*tcpStreamPtr)->lowest_layer().shutdown(AsioTcpStream::lowest_layer_type::shutdown_both, ec); + (*tcpStreamPtr)->lowest_layer().close(ec); } - }, *m_Stream); + } Log(LogInformation, "OTelExporter") << "Disconnected from OpenTelemetry backend."; From 1139ba9b0decc257e78aa6a6a13d747d1f7059ef Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Wed, 1 Apr 2026 12:45:21 +0200 Subject: [PATCH 18/20] OTel: replace `AsioDualEvent` usage with `AsioConditionVariable` --- lib/otel/otel.cpp | 34 +++++++++++++++++++++------------- lib/otel/otel.hpp | 2 +- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index 208972e94ac..588614cd8f3 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -63,7 +63,7 @@ OTel::OTel(OTelConnInfo& connInfo): OTel{connInfo, IoEngine::Get().GetIoContext( OTel::OTel(OTelConnInfo& connInfo, boost::asio::io_context& io) : m_ConnInfo{std::move(connInfo)}, m_Strand{io}, - m_Export{io}, + m_ExportAsioCV{io}, m_RetryExportAndConnTimer{io}, m_Exporting{false}, m_Stopped{false} @@ -99,7 +99,7 @@ void OTel::Stop() std::promise promise; IoEngine::SpawnCoroutine(m_Strand, [this, &promise, keepAlive = ConstPtr(this)](boost::asio::yield_context& yc) { - m_Export.Set(); + m_ExportAsioCV.NotifyAll(); // Wake up the export loop if it's waiting for new export requests. 
m_RetryExportAndConnTimer.cancel(); if (!m_Stream) { @@ -116,11 +116,9 @@ void OTel::Stop() boost::system::error_code ec; std::visit([&ec](auto& stream) { stream->lowest_layer().cancel(ec); }, *m_Stream); }); - m_Export.WaitForClear(yc); - } else { - // This must be in cleared state when stopping, so we clear it just in case to avoid - // any potential issues with the export loop waiting on it after the next resume/start. - m_Export.Clear(); + while (m_Request) { + m_ExportAsioCV.Wait(yc); + } } // Check if the stream is still valid before attempting to disconnect, since the above lowest_layer.cancel() @@ -139,7 +137,6 @@ void OTel::Stop() << "Disconnected from OpenTelemetry backend."; m_Stream.reset(); - m_Request.reset(); promise.set_value(); }); promise.get_future().wait(); @@ -172,7 +169,7 @@ void OTel::Export(std::unique_ptr&& request) // Access to m_Request is serialized via m_Strand, so we must post the actual export operation to it. boost::asio::post(m_Strand, [this, keepAlive = ConstPtr(this), request = std::move(request)]() mutable { m_Request = std::move(request); - m_Export.Set(); + m_ExportAsioCV.NotifyAll(); }); } @@ -293,14 +290,25 @@ void OTel::Connect(boost::asio::yield_context& yc) void OTel::ExportLoop(boost::asio::yield_context& yc) { Defer cleanup{[this] { - m_Export.Clear(); + m_Request.reset(); + m_ExportAsioCV.NotifyAll(); ResetExporting(true /* notify all */); }}; namespace ch = std::chrono; - while (!m_Stopped) { - m_Export.WaitForSet(yc); + while (true) { + // Wait for a new export request to be available. If the exporter is stopped while waiting, + // we will be notified without a new request, so we also check the stopped state here to + // avoid waiting indefinitely in that case. 
+ while (!m_Request && !m_Stopped) { + m_ExportAsioCV.Wait(yc); + } + + if (m_Stopped) { + break; + } + if (!m_Stream) { Connect(yc); } @@ -309,7 +317,7 @@ void OTel::ExportLoop(boost::asio::yield_context& yc) try { ExportImpl(yc); m_Request.reset(); - m_Export.Clear(); + m_ExportAsioCV.NotifyAll(); ResetExporting(false /* notify one */); break; } catch (const RetryableExportError& ex) { diff --git a/lib/otel/otel.hpp b/lib/otel/otel.hpp index 44ef0df3ebc..eb45bedffb7 100644 --- a/lib/otel/otel.hpp +++ b/lib/otel/otel.hpp @@ -113,7 +113,7 @@ class OTel : public SharedObject Shared::Ptr m_TlsContext; boost::asio::io_context::strand m_Strand; - AsioDualEvent m_Export; // Event to signal when a new export request is available. + AsioConditionVariable m_ExportAsioCV; // Event to signal when a new export request is available. // Timer for scheduling retries of failed exports and reconnection attempts. boost::asio::steady_timer m_RetryExportAndConnTimer; From 465650262aa254ef0ebc8eebbfa0fa94d2271f76 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 1 Apr 2026 13:34:16 +0200 Subject: [PATCH 19/20] OTel: add connect & handshake timeout --- lib/otel/otel.cpp | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index 588614cd8f3..0f1d695d58b 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -239,26 +239,37 @@ void OTel::Connect(boost::asio::yield_context& yc) for (uint64_t attempt = 1; !m_Stopped; ++attempt) { try { - boost::asio::ip::tcp::socket socket{m_Strand.context()}; - icinga::Connect(socket, m_ConnInfo.Host, std::to_string(m_ConnInfo.Port), yc); - + decltype(m_Stream) stream; if (m_ConnInfo.EnableTls) { - auto tlsStream = Shared::Make(m_Strand.context(), *m_TlsContext, m_ConnInfo.Host); - tlsStream->lowest_layer() = std::move(socket); - tlsStream->next_layer().async_handshake(AsioTlsStream::next_layer_type::client, yc); - - if 
(m_ConnInfo.VerifyPeerCertificate && !tlsStream->next_layer().IsVerifyOK()) { - BOOST_THROW_EXCEPTION(std::runtime_error( - "TLS certificate validation failed: " + tlsStream->next_layer().GetVerifyError() - )); - } - m_Stream = std::move(tlsStream); + stream = Shared::Make(m_Strand.context(), *m_TlsContext, m_ConnInfo.Host); } else { - auto tcpStream = Shared::Make(m_Strand.context()); - tcpStream->lowest_layer() = std::move(socket); - m_Stream = std::move(tcpStream); + stream = Shared::Make(m_Strand.context()); } + Timeout timeout{m_Strand, boost::posix_time::seconds(10), [this, stream] { + Log(LogCritical, "OTelExporter") + << "Timeout while connecting to OpenTelemetry backend '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port << "', cancelling attempt."; + + boost::system::error_code ec; + std::visit([&ec](auto& s) { s->lowest_layer().cancel(ec); }, *stream); + }}; + + std::visit([this, &yc](auto& streamArg) { + icinga::Connect(streamArg->lowest_layer(), m_ConnInfo.Host, std::to_string(m_ConnInfo.Port), yc); + + if constexpr (std::is_same_v, Shared::Ptr>) { + streamArg->next_layer().async_handshake(AsioTlsStream::next_layer_type::client, yc); + + if (m_ConnInfo.VerifyPeerCertificate && !streamArg->next_layer().IsVerifyOK()) { + BOOST_THROW_EXCEPTION(std::runtime_error( + "TLS certificate validation failed: " + streamArg->next_layer().GetVerifyError() + )); + } + } + }, *stream); + + m_Stream = std::move(stream); + Log(LogInformation, "OTelExporter") << "Successfully connected to OpenTelemetry backend."; return; From 4dbf782e4eae42cc8fdc06ed2a45534b925637dc Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 1 Apr 2026 13:34:47 +0200 Subject: [PATCH 20/20] OTel: raise runtime error when failing to fully serialize Protobuf request --- lib/otel/otel.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index 0f1d695d58b..f2a0d6acaec 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -369,7 +369,9 @@ 
void OTel::ExportImpl(boost::asio::yield_context& yc) const [[maybe_unused]] auto serialized = m_Request->SerializeToZeroCopyStream(&outputS); ASSERT(serialized); // Must have completed chunk writing successfully, otherwise reading the response will hang forever. - VERIFY(outputS.WriterDone()); + if (!outputS.WriterDone()) { + BOOST_THROW_EXCEPTION(std::runtime_error("BUG: Protobuf output stream writer did not complete successfully.")); + } IncomingHttpResponse responseMsg{*m_Stream}; responseMsg.Parse(yc);