diff --git a/.github/workflows/linux.bash b/.github/workflows/linux.bash
index dffe4c45664..296b6a88558 100755
--- a/.github/workflows/linux.bash
+++ b/.github/workflows/linux.bash
@@ -5,6 +5,7 @@ export PATH="/usr/lib/ccache/bin:/usr/lib/ccache:/usr/lib64/ccache:$PATH"
export CCACHE_DIR=/icinga2/ccache
export CTEST_OUTPUT_ON_FAILURE=1
CMAKE_OPTS=()
+SCL_ENABLE_GCC=()
# -Wstringop-overflow is notorious for false positives and has been a problem for years.
# See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88443
# -Wtemplate-id-cdtor leaks from using the generated headers. We should reenable this once
@@ -17,7 +18,7 @@ case "$DISTRO" in
# - LibreSSL instead of OpenSSL 3 and
# - no MariaDB or libpq as they depend on OpenSSL.
# https://gitlab.alpinelinux.org/alpine/aports/-/blob/master/community/icinga2/APKBUILD
- apk add bison boost-dev ccache cmake flex g++ libedit-dev libressl-dev ninja-build tzdata
+ apk add bison boost-dev ccache cmake flex g++ libedit-dev libressl-dev ninja-build tzdata protobuf-dev
ln -vs /usr/lib/ninja-build/bin/ninja /usr/local/bin/ninja
;;
@@ -44,24 +45,24 @@ case "$DISTRO" in
amazonlinux:20*)
dnf install -y amazon-rpm-config bison cmake flex gcc-c++ ninja-build \
- {boost,libedit,mariadb-connector-c,ncurses,openssl,postgresql,systemd}-devel
+ {boost,libedit,mariadb-connector-c,ncurses,openssl,postgresql,systemd,protobuf-lite}-devel
;;
debian:*|ubuntu:*)
apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install --no-install-{recommends,suggests} -y \
- bison ccache cmake dpkg-dev flex g++ ninja-build tzdata \
- lib{boost-all,edit,mariadb,ncurses,pq,ssl,systemd}-dev
+ bison ccache cmake dpkg-dev flex g++ ninja-build tzdata protobuf-compiler \
+ lib{boost-all,edit,mariadb,ncurses,pq,ssl,systemd,protobuf}-dev
;;
fedora:*)
dnf install -y bison ccache cmake flex gcc-c++ ninja-build redhat-rpm-config \
- {boost,libedit,mariadb,ncurses,openssl,postgresql,systemd}-devel
+ {boost,libedit,mariadb,ncurses,openssl,postgresql,systemd,protobuf-lite}-devel
;;
*suse*)
zypper in -y bison ccache cmake flex gcc-c++ ninja rpm-config-SUSE \
- {lib{edit,mariadb,openssl},ncurses,postgresql,systemd}-devel \
+ {lib{edit,mariadb,openssl},ncurses,postgresql,systemd,protobuf}-devel \
libboost_{context,coroutine,filesystem,iostreams,program_options,regex,system,test,thread}-devel
;;
@@ -71,6 +72,10 @@ case "$DISTRO" in
case "$DISTRO" in
*:8)
dnf config-manager --enable powertools
+ # Our Protobuf package on RHEL 8 is built with GCC 13, and since the ABI is not compatible with GCC 8,
+ # we need to enable the SCL repository and install the GCC 13 packages to be able to link against it.
+ SCL_ENABLE_GCC=(scl enable gcc-toolset-13 --)
+ dnf install -y gcc-toolset-13-gcc-c++ gcc-toolset-13-annobin-plugin-gcc
;;
*)
dnf config-manager --enable crb
@@ -79,6 +84,27 @@ case "$DISTRO" in
dnf install -y bison ccache cmake gcc-c++ flex ninja-build redhat-rpm-config \
{boost,bzip2,libedit,mariadb,ncurses,openssl,postgresql,systemd,xz,libzstd}-devel
+
+ # Rocky Linux 8 and 9 don't have a recent enough Protobuf compiler for OTel, so we need to add
+ # our repository to install the pre-built Protobuf devel package.
+ case "$DISTRO" in
+ *:[8-9])
+ rpm --import https://packages.icinga.com/icinga.key
+ cat > /etc/yum.repos.d/icinga-build-deps.repo <<'EOF'
+[icinga-build-deps]
+name=Icinga Build Dependencies
+baseurl=https://packages.icinga.com/build-dependencies/rhel/$releasever/release
+enabled=1
+gpgcheck=1
+gpgkey=https://packages.icinga.com/icinga.key
+EOF
+ dnf install -y icinga-protobuf
+ # Tell CMake where to find our own Protobuf CMake config files.
+ CMAKE_OPTS+=(-DCMAKE_PREFIX_PATH="$(rpm -E '%{_libdir}')/icinga-protobuf/cmake")
+ ;;
+ *)
+ dnf install -y protobuf-lite-devel
+ esac
;;
esac
@@ -96,8 +122,19 @@ case "$DISTRO" in
source <(dpkg-buildflags --export=sh)
export CFLAGS="${CFLAGS} ${WARN_FLAGS}"
export CXXFLAGS="${CXXFLAGS} ${WARN_FLAGS}"
+
+ # The default Protobuf compiler is too old for OTel, so we need to turn it off on Debian 11 and Ubuntu 22.04.
+ case "$DISTRO" in
+ debian:11|ubuntu:22.04)
+ CMAKE_OPTS+=(-DICINGA2_WITH_OPENTELEMETRY=OFF)
+ ;;
+ esac
;;
*)
+ # Turn off OTel on Amazon Linux 2 as the default Protobuf compiler is way too old.
+ if [ "$DISTRO" = "amazonlinux:2" ]; then
+ CMAKE_OPTS+=(-DICINGA2_WITH_OPENTELEMETRY=OFF)
+ fi
CMAKE_OPTS+=(-DCMAKE_{C,CXX}_FLAGS="$(rpm -E '%{optflags} %{?march_flag}') ${WARN_FLAGS}")
export LDFLAGS="$(rpm -E '%{?build_ldflags}')"
;;
@@ -106,7 +143,7 @@ esac
mkdir /icinga2/build
cd /icinga2/build
-cmake \
+"${SCL_ENABLE_GCC[@]}" cmake \
-GNinja \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DICINGA2_UNITY_BUILD=ON \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ecdba05413..96cfedab486 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ option(ICINGA2_WITH_LIVESTATUS "Build the Livestatus module" ${ICINGA2_MASTER})
option(ICINGA2_WITH_NOTIFICATION "Build the notification module" ON)
option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ${ICINGA2_MASTER})
option(ICINGA2_WITH_ICINGADB "Build the IcingaDB module" ${ICINGA2_MASTER})
+option(ICINGA2_WITH_OPENTELEMETRY "Build the OpenTelemetry integration module" ${ICINGA2_MASTER})
option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
@@ -207,6 +208,23 @@ set(HAVE_EDITLINE "${EDITLINE_FOUND}")
find_package(Termcap)
set(HAVE_TERMCAP "${TERMCAP_FOUND}")
+if(ICINGA2_WITH_OPENTELEMETRY)
+ # Newer Protobuf versions provide a CMake config package that we should prefer, since it implicitly
+ # links against all its dependencies (like absl, etc.) that would otherwise need to be linked manually.
+ # Thus, first try to find Protobuf in config mode and only fall back to module mode if that fails.
+ find_package(Protobuf CONFIG)
+ if(NOT Protobuf_FOUND)
+ # FindProtobuf.cmake in CMake versions < 3.31.0 is just broken and mixes up the Protobuf output directories
+ # and it doesn't even support to pass any PLUGIN_OPTIONS like "lite" to the protobuf_generate() function in
+ # order to generate code for the lite runtime without having to modify the proto files directly.
+ if(CMAKE_VERSION VERSION_LESS 3.31.0)
+ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/third-party/cmake/protobuf")
+ endif()
+ find_package(Protobuf REQUIRED)
+ endif()
+ list(APPEND base_DEPS protobuf::libprotobuf-lite)
+endif()
+
include_directories(
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/lib
${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/lib
diff --git a/Containerfile b/Containerfile
index 3312c0212cd..ad3e2302b8a 100644
--- a/Containerfile
+++ b/Containerfile
@@ -38,6 +38,8 @@ RUN apt-get update && \
libpq-dev \
libssl-dev \
libsystemd-dev \
+ libprotobuf-dev \
+ protobuf-compiler \
make && \
rm -rf /var/lib/apt/lists/*
@@ -165,6 +167,7 @@ RUN apt-get update && \
libmariadb3 \
libmoosex-role-timer-perl \
libpq5 \
+ libprotobuf-lite32t64 \
libssl3 \
libsystemd0 \
mailutils \
diff --git a/doc/06-distributed-monitoring.md b/doc/06-distributed-monitoring.md
index d38b83612c0..e03f22bcc68 100644
--- a/doc/06-distributed-monitoring.md
+++ b/doc/06-distributed-monitoring.md
@@ -2959,6 +2959,7 @@ By default, the following features provide advanced HA functionality:
* [Graphite](09-object-types.md#objecttype-graphitewriter)
* [InfluxDB](09-object-types.md#objecttype-influxdb2writer) (v1 and v2)
* [OpenTsdb](09-object-types.md#objecttype-opentsdbwriter)
+* [OTLPMetrics](09-object-types.md#objecttype-otlpmetricswriter)
* [Perfdata](09-object-types.md#objecttype-perfdatawriter) (for PNP)
#### High-Availability with Checks
diff --git a/doc/09-object-types.md b/doc/09-object-types.md
index 4caf327dee6..3fe29702f2d 100644
--- a/doc/09-object-types.md
+++ b/doc/09-object-types.md
@@ -1871,6 +1871,54 @@ Configuration Attributes:
host_template | Dictionary | **Optional.** Specify additional tags to be included with host metrics. This requires a sub-dictionary named `tags`. Also specify a naming prefix by setting `metric`. More information can be found in [OpenTSDB custom tags](14-features.md#opentsdb-custom-tags) and [OpenTSDB Metric Prefix](14-features.md#opentsdb-metric-prefix). More information can be found in [OpenTSDB custom tags](14-features.md#opentsdb-custom-tags). Defaults to an `empty Dictionary`.
service_template | Dictionary | **Optional.** Specify additional tags to be included with service metrics. This requires a sub-dictionary named `tags`. Also specify a naming prefix by setting `metric`. More information can be found in [OpenTSDB custom tags](14-features.md#opentsdb-custom-tags) and [OpenTSDB Metric Prefix](14-features.md#opentsdb-metric-prefix). Defaults to an `empty Dictionary`.
+### OTLPMetricsWriter
+
+Emits metrics in [OpenTelemetry Protocol (OTLP)](https://opentelemetry.io/) format to a defined OpenTelemetry Collector
+or any other OTLP-compatible backend that accepts OTLP data over HTTP. This configuration object is available as
+[otlpmetrics feature](14-features.md#otlpmetrics-writer). You can find more information about OpenTelemetry and OTLP
+on the [OpenTelemetry website](https://opentelemetry.io/).
+
+A basic copy and pastable example configuration is shown below:
+
+```
+object OTLPMetricsWriter "otlp-metrics" {
+ host = "127.0.0.1"
+ port = 4318
+ metrics_endpoint = "/v1/metrics"
+ service_namespace = "icinga2-production"
+}
+```
+
+There are more configuration options available as described in the table below.
+
+| Name | Type | Description |
+|-------------------------------|------------|----------------------------------------------------------------------------------------------------------------------------------------------|
+| host | String | **Required.** OTLP backend host address. Defaults to `127.0.0.1`. |
+| port | Number | **Required.** OTLP backend HTTP port. Defaults to `4318`. |
+| metrics\_endpoint | String | **Required.** OTLP metrics endpoint path. Defaults to `/v1/metrics`. |
+| service\_namespace | String | **Required.** The namespace to associate with emitted metrics used in the `service.namespace` OTel resource attribute. Defaults to `icinga`. |
+| basic\_auth | Dictionary | **Optional.** Username and password for HTTP basic authentication. |
+| host\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with host metrics. Defaults to none. |
+| service\_resource\_attributes | Dictionary | **Optional.** Additional resource attributes to be included with service metrics. Defaults to none. |
+| flush\_interval | Duration | **Optional.** How long to buffer data points before transferring to the OTLP backend. Defaults to `15s`. |
+| flush\_threshold | Number | **Optional.** How many bytes to buffer before forcing a transfer to the OTLP backend. Defaults to `16MiB`. |
+| enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Has no effect in non-cluster setups. Defaults to `true`. |
+| enable\_send\_thresholds | Boolean | **Optional.** Whether to stream warning, critical, minimum & maximum as separate metrics to the OTLP backend. Defaults to `false`. |
+| disconnect\_timeout | Duration | **Optional.** Timeout to wait for any outstanding data to be flushed to the OTLP backend before disconnecting. Defaults to `10s`. |
+| enable\_tls | Boolean | **Optional.** Whether to use a TLS stream. Defaults to `false`. |
+| tls\_insecure\_noverify | Boolean | **Optional.** Disable TLS peer verification. Defaults to `false`. |
+| tls\_ca\_file | String | **Optional.** Path to CA certificate to validate the remote host. |
+| tls\_cert\_file | String | **Optional.** Path to the client certificate to present to the OTLP backend for mutual verification. |
+| tls\_key\_file | String | **Optional.** Path to the client certificate key. |
+
+!!! tip
+
+ The `flush_threshold` is a byte size threshold, not a metric count threshold. By default, the writer will flush all
+ buffered metrics to the OTLP backend once the total size of buffered metrics exceeds 16 MiB. This number is chosen
+ based on the default `max_request_body_size` of the OpenTelemetry Collector, and you must adjust it according to the
+ `max_request_body_size` of your OTLP backend to avoid metrics being dropped due to exceeding the maximum request body
+ size. Furthermore, the writer may not flush at the exact byte size threshold due to the internal structure of OTLP
+ messages, so make sure that the threshold is lower than the configured `max_request_body_size` of your OTLP backend.
### PerfdataWriter
diff --git a/doc/14-features.md b/doc/14-features.md
index c27bdc0ec80..c8c85419473 100644
--- a/doc/14-features.md
+++ b/doc/14-features.md
@@ -73,6 +73,7 @@ best practice is to provide performance data.
This data is parsed by features sending metrics to time series databases (TSDB):
+* [OpenTelemetry](14-features.md#otlpmetrics-writer)
* [Graphite](14-features.md#graphite-carbon-cache-writer)
* [InfluxDB](14-features.md#influxdb-writer)
* [OpenTSDB](14-features.md#opentsdb-writer)
@@ -644,6 +645,267 @@ mechanism ensures that metrics are written even if the cluster fails.
The recommended way of running OpenTSDB in this scenario is a dedicated server
where you have OpenTSDB running.
+### OTLPMetrics Writer
+
+The [OpenTelemetry Protocol (OTLP/HTTP)](https://opentelemetry.io/docs/specs/otlp/#otlphttp) metrics Writer feature
+allows Icinga 2 to send metrics to OpenTelemetry Collector or any other backend that supports the OTLP HTTP protocol,
+such as [Prometheus OTLP](https://prometheus.io/docs/guides/opentelemetry/) receiver,
+[Grafana Mimir](https://grafana.com/docs/mimir/latest/configure/configure-otel-collector/),
+[OpenSearch Data Prepper](https://docs.opensearch.org/latest/data-prepper/pipelines/configuration/sources/otlp-source/),
+etc. It enables seamless integration of Icinga 2 metrics into modern observability stacks, allowing you to leverage the
+capabilities of OpenTelemetry for advanced analysis and visualization of your monitoring data. OpenTelemetry provides a
+standardized way to collect, process, and export telemetry data, making it easier to integrate with numerous
+[monitoring and observability](https://opentelemetry.io/docs/collector/components/exporter/) tools effortlessly.
+
+In order to enable this feature, you can use the following command:
+
+```bash
+icinga2 feature enable otlpmetrics
+```
+
+By default, the OTLPMetrics Writer expects the OpenTelemetry Collector or any other OTLP HTTP receiver to listen at
+`127.0.0.1` on port `4318` but most of the third-party backends use their own ports, so you may need to adjust the
+configuration accordingly. Additionally, the `metrics_endpoint` can vary based on the backend you are using.
+For example, OpenTelemetry Collector uses `/v1/metrics` by default, while the Prometheus OTLP receiver uses
+`/api/v1/otlp/v1/metrics`. Therefore, it is important to set the correct `metrics_endpoint` in the configuration file.
+
+You can find more details about the configuration options [here](09-object-types.md#objecttype-otlpmetricswriter).
+
+The generated metric names follow the OpenTelemetry naming conventions and cannot be customized by end-users and are
+therefore always the same across all Icinga 2 installations. The OTLP metrics writer currently sends the following metrics:
+
+| Metric Name | Description |
+|-----------------------|----------------------------------------------------------------------|
+| state_check.perfdata | Performance data metrics from checks. |
+| state_check.threshold | Threshold values for perfdata metrics (warning, critical, min, max). |
+
+By default, the writer will not stream any data point for the `state_check.threshold` metric. To enable the streaming
+of threshold metrics, you need to set the `enable_send_thresholds` option to `true` in the OTLPMetrics Writer
+configuration. Once enabled, it will send the threshold values for each performance data metric if they are available
+in the produced check results.
+
+The data point type for all the above metrics is [`gauge`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#gauge)
+and the perfdata labels and their units (if available) are mapped to OpenTelemetry metric points attributes. For example,
+a perfdata label `file_size` with a value of `42` and unit `B` will be sent to the `state_check.perfdata` metric stream,
+with a metric point having a value of `42`, along with the attributes `perfdata_label="file_size"` and `unit="B"`.
+Additionally, each metric point will also include other relevant attributes such as `icinga2.host.name`, `icinga2.service.name`,
+`icinga2.command.name`, etc. as resource attributes. You can find the full list of metric point formats and attributes
+in the [OTLPMetrics data format](#otlpmetrics-writer-data-format) section below.
+
+In addition to the default attributes, it is also possible to configure custom resource attributes that are sent along
+with the metrics to the OpenTelemetry backend. You can use the `host_resource_attributes` and `service_resource_attributes`
+options in the OTLPMetrics Writer configuration to define custom resource attributes for host and service checks
+respectively. You can use macros in the attribute values to dynamically populate them based on the check context.
+For instance, you can add a custom resource attribute `host.os` with the value `$host.vars.os$` and it will be populated
+with the value of `vars.os` for each host that has this variable defined, otherwise it will silently be ignored.
+All custom resource attributes will be prefixed with `icinga2.custom.` to avoid naming conflicts with existing
+OpenTelemetry and Icinga 2's built-in resource attributes. For example, if you define a custom resource attribute
+`host.os`, it will be sent as `icinga2.custom.host.os` to OpenTelemetry.
+
+!!! warning
+
+ Be cautious when defining custom resource attributes, as they are sent with every metric and can lead to high
+ cardinality issues if not used carefully. It is recommended to only define custom resource attributes that are
+ necessary for your monitoring use case and to avoid using attributes with high variability or a large number of
+ unique values.
+
+Apart from custom resource attributes, the OTLPMetrics Writer also allows you to configure an additional resource
+attribute called [`service.namespace`](https://opentelemetry.io/docs/specs/semconv/registry/attributes/service/#service-namespace)
+via the `service_namespace` option in the OTLPMetrics Writer configuration. This attribute is not specific to any host
+or service but is a general attribute that applies to all metrics emitted by one OTLPMetrics Writer instance.
+By default, it is set to `icinga`. You can customize it to better fit your monitoring environment. For example, you
+might set it to `production`, `staging`, or any other relevant namespace that categorizes your Icinga 2 metrics emitted
+to the OpenTelemetry backend effectively.
+
+#### OTLPMetrics in HA Cluster Zones
+
+This writer supports [High Availability (HA)](06-distributed-monitoring.md#distributed-monitoring-high-availability-features)
+cluster zones in Icinga 2. By default, the `enable_ha` option is set to `true` in the OTLPMetrics Writer config, which
+means that only one writer in the cluster will be active at any given time, sending metrics to the configured OTLP backend.
+The other OTLPMetrics Writer will remain in standby mode and ready to take over if the active endpoint fails or becomes
+unavailable for any reason. However, due to how HA works in Icinga 2, the failover mechanism won't take place until the
+two endpoints in the cluster lose connection with each other, and not just when the OTLPMetrics Writer fails. Therefore,
+as long as the cluster connection is healthy, the other writer won't take over even if the active writer encounters some
+issues connecting to the OTLP backend or sending metrics.
+
+In general, do not set `enable_ha` to `false` unless you have a specific use case that requires multiple OTLPMetrics
+Writer instances to be active at the same time, sending metrics to different OTLP backends. In most cases, it is
+recommended to keep `enable_ha` set to `true` to ensure that only one writer is active even in a non-HA cluster zone.
+
+#### OTLPMetrics Data Format
+
+The OTLPMetrics Writer sends metrics to the configured OTLP HTTP endpoint in the OpenTelemetry Protocol (OTLP) format.
+The metric names and attributes follow the OpenTelemetry naming conventions. The `state_check.perfdata` metric includes
+performance data metrics from checks, while the `state_check.threshold` metric is used to stream all threshold related
+data points. In general, both metric streams share the same set of resource attributes, they only differ in the concrete
+metric point attributes. Below is an example of the full data format for both metrics and can be used as a reference for
+configuring your OTLP backend to properly receive and process the emitted metrics.
+
+```json
+{
+ "resourceMetrics": [
+ {
+ "resource": {
+ "attributes": [
+ {
+ "key": "service.name",
+ "value": {
+ "stringValue": "Icinga 2"
+ }
+ },
+ {
+ "key": "service.instance.id",
+ "value": {
+ "stringValue": "9a1f9d6d58648f2274c539bbdd5f09388b68fc0a"
+ }
+ },
+ {
+ "key": "service.version",
+ "value": {
+ "stringValue": "v2.15.0-285-g196ba8e9d"
+ }
+ },
+ {
+ "key": "telemetry.sdk.language",
+ "value": {
+ "stringValue": "cpp"
+ }
+ },
+ {
+ "key": "telemetry.sdk.name",
+ "value": {
+ "stringValue": "Icinga 2 OTel Integration"
+ }
+ },
+ {
+ "key": "telemetry.sdk.version",
+ "value": {
+ "stringValue": "v2.15.0-285-g196ba8e9d"
+ }
+ },
+ {
+ "key": "service.namespace",
+ "value": {
+ "stringValue": "icinga"
+ }
+ },
+ {
+ "key": "icinga2.host.name",
+ "value": {
+ "stringValue": "something"
+ }
+ },
+ {
+ "key": "icinga2.service.name",
+ "value": {
+ "stringValue": "something-service"
+ }
+ },
+ {
+ "key": "icinga2.command.name",
+ "value": {
+ "stringValue": "icinga"
+ }
+ }
+ ],
+ "entityRefs": [
+ {
+ "type": "service",
+ "idKeys": [
+ "icinga2.host.name",
+ "icinga2.service.name"
+ ]
+ }
+ ]
+ },
+ "scopeMetrics": [
+ {
+ "scope": {
+ "name": "icinga2",
+ "version": "v2.15.0-285-g196ba8e9d"
+ },
+ "metrics": [
+ {
+ "name": "state_check.perfdata",
+ "gauge": {
+ "dataPoints": [
+ {
+ "attributes": [
+ {
+ "key": "perfdata_label",
+ "value": {
+ "stringValue": "some_perfdata_label"
+ }
+ }
+ ],
+ "startTimeUnixNano": "1770385516896651008",
+ "timeUnixNano": "1770385516896651008",
+ "asDouble": 1
+ }
+ ]
+ }
+ },
+ {
+ "name": "state_check.threshold",
+ "gauge": {
+ "dataPoints": [
+ {
+ "attributes": [
+ {
+ "key": "perfdata_label",
+ "value": {
+ "stringValue": "some_perfdata_label"
+ }
+ },
+ {
+ "key": "threshold_type",
+ "value": {
+ "stringValue": "critical"
+ }
+ }
+ ],
+ "startTimeUnixNano": "1770385516896651008",
+ "timeUnixNano": "1770385516896651008",
+ "asDouble": 0
+ },
+ {
+ "attributes": [
+ {
+ "key": "perfdata_label",
+ "value": {
+ "stringValue": "some_perfdata_label"
+ }
+ },
+ {
+ "key": "threshold_type",
+ "value": {
+ "stringValue": "warning"
+ }
+ }
+ ],
+ "startTimeUnixNano": "1770385516896651008",
+ "timeUnixNano": "1770385516896651008",
+ "asDouble": 0
+ }
+ ]
+ }
+ }
+ ],
+ "schemaUrl": "https://opentelemetry.io/schemas/1.39.0"
+ }
+ ],
+ "schemaUrl": "https://opentelemetry.io/schemas/1.39.0"
+ }
+ ]
+}
+```
+
+As you can see in the above example, most of the attributes are resource attributes that are shared across all emitted
+metrics. The only attributes that are specific to the OTLPMetrics Writer have the `icinga2.` prefix, like `icinga2.host.name`
+etc. The `state_check.perfdata` metric has an additional attribute `perfdata_label` that corresponds to the perfdata
+label of the emitted metric point value. Likewise, the `state_check.threshold` metric has two additional attributes
+`perfdata_label` and `threshold_type` that correspond to the perfdata label they belong to and the threshold type
+(warning, critical, min, max) respectively.
### Writing Performance Data Files
diff --git a/etc/icinga2/features-available/otlpmetrics.conf b/etc/icinga2/features-available/otlpmetrics.conf
new file mode 100644
index 00000000000..6808c2015fd
--- /dev/null
+++ b/etc/icinga2/features-available/otlpmetrics.conf
@@ -0,0 +1,53 @@
+/**
+ * The OpenTelemetry Metrics Writer feature allows Icinga 2 to export metrics from performance
+ * data to an OpenTelemetry Collector or compatible backend.
+ *
+ * For more information, see the official documentation:
+ * https://icinga.com/docs/icinga-2/latest/doc/14-features/#otlpmetrics-writer
+ */
+object OTLPMetricsWriter "otlp-metrics" {
+ // host = "127.0.0.1"
+ // port = 4318
+ // metrics_endpoint = "/v1/metrics"
+ # Optionally, you can set a namespace to be used as OTel service.namespace attribute for all exported metrics.
+ // service_namespace = "icinga"
+
+ # By default, basic AUTH is disabled. Uncomment and set the following lines to enable it.
+ // basic_auth = {
+ // username = "otel_user"
+ // password = "otel_password"
+ // }
+
+ # You can also add custom tags to the exported metrics based on host and service variables.
+ # These tags will be included in the OTel metrics as resource attributes for hosts and services, respectively.
+ # By default, no additional tags are added. Adjust the templates as needed to include the desired variables.
+ // host_resource_attributes = {
+ // "host.vars.env" = "$host.vars.env$"
+ // "host.vars.os" = "$host.vars.os$"
+ // }
+ // service_resource_attributes = {
+ // "service.vars.env" = "$service.vars.env$"
+ // "service.vars.os" = "$service.vars.os$"
+ // }
+
+ # These are the default settings used by the OTel writer. Adjust them as needed.
+ # Please refer to the documentation for more details on each option.
+ // enable_ha = true
+ // flush_interval = 15s
+ // flush_threshold = 16*1024*1024
+ # When stopping Icinga 2, this timeout defines how long to wait for any pending OTel
+ # metrics to be sent before disconnecting and discarding them.
+ // disconnect_timeout = 10s
+
+ # Allow the OTLP writer to send the check thresholds as OTel metrics to the configured endpoint.
+ # By default, this is disabled but you can enable it to have the thresholds available in the `state_check.threshold` OTel metric.
+ // enable_send_thresholds = false
+
+ # You can enable TLS encryption by uncommenting and configuring the following options.
+ # By default, the OTel writer uses unencrypted connections (plain HTTP requests).
+ // enable_tls = false
+ // tls_insecure_noverify = false
+ // tls_ca_file = "/path/to/otel/ca.crt"
+ // tls_cert_file = "/path/to/otel/client.crt"
+ // tls_key_file = "/path/to/otel/client.key"
+}
diff --git a/icinga-app/CMakeLists.txt b/icinga-app/CMakeLists.txt
index a9358939548..88ad8bfdc2e 100644
--- a/icinga-app/CMakeLists.txt
+++ b/icinga-app/CMakeLists.txt
@@ -50,6 +50,10 @@ if(ICINGA2_WITH_NOTIFICATION)
list(APPEND icinga_app_SOURCES $)
endif()
+if(ICINGA2_WITH_OPENTELEMETRY)
+ list(APPEND icinga_app_SOURCES $)
+endif()
+
if(ICINGA2_WITH_PERFDATA)
list(APPEND icinga_app_SOURCES $)
endif()
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 2eb3d18324a..ebb0ce404a5 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -50,6 +50,10 @@ if(ICINGA2_WITH_NOTIFICATION)
add_subdirectory(notification)
endif()
+if(ICINGA2_WITH_OPENTELEMETRY)
+ add_subdirectory(otel)
+endif()
+
if(ICINGA2_WITH_PERFDATA)
add_subdirectory(perfdata)
endif()
diff --git a/lib/base/application-environment.cpp b/lib/base/application-environment.cpp
index b310d7247aa..245dc936f27 100644
--- a/lib/base/application-environment.cpp
+++ b/lib/base/application-environment.cpp
@@ -6,6 +6,8 @@
using namespace icinga;
+AtomicOrLocked Application::m_EnvironmentId;
+
String Application::GetAppEnvironment()
{
Value defaultValue = Empty;
@@ -16,3 +18,29 @@ void Application::SetAppEnvironment(const String& name)
{
ScriptGlobal::Set("Environment", name);
}
+
+/**
+ * Get the cluster environment ID set by IcingaDB.
+ *
+ * This method returns the cluster environment ID generated by the IcingaDB component (if enabled).
+ * The environment ID is a unique identifier used to distinguish between different Icinga 2 clusters
+ * in a multi-cluster setup. It is typically set by IcingaDB when it starts up and can be used by other
+ * components (e.g., for telemetry) to correlate data across clusters. If IcingaDB is not enabled or has
+ * not yet set the environment ID, this method will return an empty string.
+ *
+ * @return The cluster environment ID set by IcingaDB, or an empty string if not set.
+ */
+String Application::GetEnvironmentId()
+{
+ return m_EnvironmentId.load();
+}
+
+/**
+ * Set the cluster environment ID.
+ *
+ * @param envID The cluster environment ID to set, typically generated by IcingaDB.
+ */
+void Application::SetEnvironmentId(const String& envID)
+{
+ m_EnvironmentId.store(envID);
+}
diff --git a/lib/base/application.hpp b/lib/base/application.hpp
index f9fdecc3c14..feec84488c4 100644
--- a/lib/base/application.hpp
+++ b/lib/base/application.hpp
@@ -96,6 +96,8 @@ class Application : public ObjectImpl {
static String GetAppEnvironment();
static void SetAppEnvironment(const String& name);
+ static String GetEnvironmentId();
+ static void SetEnvironmentId(const String& envID);
static double GetStartTime();
static void SetStartTime(double ts);
@@ -130,6 +132,8 @@ class Application : public ObjectImpl {
static pid_t m_ReloadProcess; /**< The PID of a subprocess doing a reload, only valid when l_Restarting==true */
static bool m_RequestReopenLogs; /**< Whether we should re-open log files. */
+ static AtomicOrLocked m_EnvironmentId; /**< The cluster environment ID set by IcingaDB. */
+
#ifndef _WIN32
static pid_t m_UmbrellaProcess; /**< The PID of the Icinga umbrella process */
#endif /* _WIN32 */
diff --git a/lib/icingadb/icingadb.cpp b/lib/icingadb/icingadb.cpp
index aa63ee27d5b..dcd0c524b00 100644
--- a/lib/icingadb/icingadb.cpp
+++ b/lib/icingadb/icingadb.cpp
@@ -56,6 +56,7 @@ void IcingaDB::Validate(int types, const ValidationUtils& utils)
try {
InitEnvironmentId();
+ Application::SetEnvironmentId(m_EnvironmentId);
} catch (const std::exception& e) {
BOOST_THROW_EXCEPTION(ValidationError(this, std::vector(), e.what()));
}
diff --git a/lib/otel/CMakeLists.txt b/lib/otel/CMakeLists.txt
new file mode 100644
index 00000000000..a73cd57a64e
--- /dev/null
+++ b/lib/otel/CMakeLists.txt
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: 2026 Icinga GmbH
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+set(ICINGA2_OPENTELEMETRY_PROTOS_DIR "${icinga2_SOURCE_DIR}/third-party/opentelemetry-proto")
+protobuf_generate(
+ LANGUAGE cpp
+ # According to the Protobuf docs[^1], the Protobuf compiler generates with the "LITE_RUNTIME" option much
+ # smaller code than the default optimize_for=SPEED option, which includes code for reflection, descriptors,
+ # and other features not needed by any part of the Icinga 2 OpenTelemetry integration. Thus, we use the "lite"
+ # option to generate code that only depends on the libprotobuf-lite instead of the full libprotobuf library.
+ #
+ # The only downside of using the lite runtime is that we won't be able to use any debugging capabilities
+ # provided by the full Protobuf runtime (like the DebugString() method on messages for easy printing,
+ # which heavily relies on reflection).
+ #
+ # [^1]: https://protobuf.dev/programming-guides/proto3/#options
+ PLUGIN_OPTIONS lite
+ OUT_VAR otel_PROTO_SRCS
+ IMPORT_DIRS "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}"
+ PROTOS
+ "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/collector/metrics/v1/metrics_service.proto"
+ "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/common/v1/common.proto"
+ "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/metrics/v1/metrics.proto"
+ "${ICINGA2_OPENTELEMETRY_PROTOS_DIR}/opentelemetry/proto/resource/v1/resource.proto"
+)
+
+set(otel_SOURCES
+ otel.cpp otel.hpp
+ ${otel_PROTO_SRCS}
+)
+
+add_library(otel OBJECT ${otel_SOURCES})
+add_dependencies(otel base remote)
+target_include_directories(otel
+ SYSTEM PUBLIC
+ $
+ ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+set_target_properties(
+ otel PROPERTIES
+ FOLDER Lib
+)
diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp
new file mode 100644
index 00000000000..f2a0d6acaec
--- /dev/null
+++ b/lib/otel/otel.cpp
@@ -0,0 +1,678 @@
+// SPDX-FileCopyrightText: 2026 Icinga GmbH
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "otel/otel.hpp"
+#include "base/application.hpp"
+#include "base/defer.hpp"
+#include "base/tcpsocket.hpp"
+#include "base/tlsutility.hpp"
+#include <google/protobuf/arena.h>
+#include <boost/asio/post.hpp>
+#include <boost/beast/http.hpp>
+#include <boost/lexical_cast.hpp>
+
+using namespace icinga;
+
+namespace http = boost::beast::http;
+namespace v1_metrics = opentelemetry::proto::metrics::v1;
+
+// The max buffer size used to batch Protobuf writes to Asio streams.
+static constexpr std::size_t l_BufferSize = 64UL * 1024;
+// The OpenTelemetry schema convention URL used in the exported metrics.
+// See https://opentelemetry.io/docs/specs/semconv/
+static constexpr std::string_view l_OTelSchemaConv = "https://opentelemetry.io/schemas/1.39.0";
+
+template std::size_t OTel::Record(Gauge&, int64_t, double, double, AttrsMap);
+template std::size_t OTel::Record(Gauge&, double, double, double, AttrsMap);
+template void OTel::SetAttribute(Attribute&, std::string_view&&, String&&);
+template void OTel::SetAttribute(Attribute&, String&&, Value&);
+
+/**
+ * Calculate the exponential backoff duration for retrying failed exports or reconnections.
+ *
+ * This method calculates the backoff duration based on the number of retry attempts using an exponential
+ * backoff strategy as per OTel specifications. The backoff duration starts at a minimum value and doubles
+ * with each attempt, up to a maximum cap (30s). This helps to avoid overwhelming the OpenTelemetry backend
+ * with rapid retry attempts in case of transient errors.
+ *
+ * @param attempt The current retry attempt number (starting from 1).
+ *
+ * @return The calculated backoff duration in milliseconds.
+ */
+static constexpr std::chrono::milliseconds Backoff(uint64_t attempt)
+{
+ using namespace std::chrono;
+
+ constexpr milliseconds MaxBackoffMs = seconds(30);
+ constexpr milliseconds MinBackoffMs = milliseconds(100);
+
+ // 2^attempt may overflow, so we cap it to a safe value within the 64-bit range,
+ // which is sufficient to reach MaxBackoffMs from MinBackoffMs.
+ constexpr uint64_t maxSafeAttempt = 16; // 2^16 * 100ms = 6553.6s > 30s
+ auto exponential = MinBackoffMs * (1ULL << std::min(attempt, maxSafeAttempt));
+ if (exponential >= MaxBackoffMs) {
+ return MaxBackoffMs;
+ }
+ return duration_cast<milliseconds>(exponential);
+}
+
+OTel::OTel(OTelConnInfo& connInfo): OTel{connInfo, IoEngine::Get().GetIoContext()}
+{
+}
+
+OTel::OTel(OTelConnInfo& connInfo, boost::asio::io_context& io)
+ : m_ConnInfo{std::move(connInfo)},
+ m_Strand{io},
+ m_ExportAsioCV{io},
+ m_RetryExportAndConnTimer{io},
+ m_Exporting{false},
+ m_Stopped{false}
+{
+ if (m_ConnInfo.EnableTls) {
+ m_TlsContext = MakeAsioSslContext(m_ConnInfo.TlsCrt, m_ConnInfo.TlsKey, m_ConnInfo.TlsCaCrt);
+ }
+}
+
+void OTel::Start()
+{
+ if (m_Stopped.exchange(false)) {
+ ResetExporting(true);
+ }
+
+ IoEngine::SpawnCoroutine(m_Strand, [this, keepAlive = ConstPtr(this)](boost::asio::yield_context yc) {
+ ExportLoop(yc);
+ });
+}
+
+/**
+ * Stop the OTel exporter and disconnect from the OpenTelemetry backend.
+ *
+ * This method blocks until the exporter has fully stopped and disconnected from the backend.
+ * It cancels any ongoing export operations and clears all its internal state, so that it can be
+ * safely restarted later if needed.
+ */
+void OTel::Stop()
+{
+ if (m_Stopped.exchange(true)) {
+ return;
+ }
+
+ std::promise<void> promise;
+ IoEngine::SpawnCoroutine(m_Strand, [this, &promise, keepAlive = ConstPtr(this)](boost::asio::yield_context& yc) {
+ m_ExportAsioCV.NotifyAll(); // Wake up the export loop if it's waiting for new export requests.
+ m_RetryExportAndConnTimer.cancel();
+
+ if (!m_Stream) {
+ promise.set_value();
+ return;
+ }
+
+ // We only wait for ongoing export operations to complete if we're currently exporting,
+ // otherwise there will be nothing that would wake us up from the `WaitForClear` sleep
+ // below, and we would end up blocking indefinitely, so we have to check the exporting
+ // state here first.
+ if (Exporting()) {
+ Timeout writerTimeout(m_Strand, boost::posix_time::seconds(5), [this] {
+ boost::system::error_code ec;
+ std::visit([&ec](auto& stream) { stream->lowest_layer().cancel(ec); }, *m_Stream);
+ });
+ while (m_Request) {
+ m_ExportAsioCV.Wait(yc);
+ }
+ }
+
+ // Check if the stream is still valid before attempting to disconnect, since the above lowest_layer.cancel()
+ // may have caused the export loop to detect a broken connection and reset the stream already.
+ if (m_Stream) {
+ if (auto* tlsStreamPtr = std::get_if<Shared<AsioTlsStream>::Ptr>(&*m_Stream); tlsStreamPtr) {
+ (*tlsStreamPtr)->GracefulDisconnect(m_Strand, yc);
+ } else if (auto* tcpStreamPtr = std::get_if<Shared<AsioTcpStream>::Ptr>(&*m_Stream); tcpStreamPtr) {
+ boost::system::error_code ec;
+ (*tcpStreamPtr)->lowest_layer().shutdown(AsioTcpStream::lowest_layer_type::shutdown_both, ec);
+ (*tcpStreamPtr)->lowest_layer().close(ec);
+ }
+ }
+
+ Log(LogInformation, "OTelExporter")
+ << "Disconnected from OpenTelemetry backend.";
+
+ m_Stream.reset();
+ promise.set_value();
+ });
+ promise.get_future().wait();
+}
+
+/**
+ * Export the given OTel metrics request to the OpenTelemetry backend.
+ *
+ * This method initiates the export of the provided OTel metrics request to the configured
+ * OpenTelemetry backend. If an export is already in progress, it waits for the previous
+ * export to complete before proceeding with the new export request (blocking the caller).
+ *
+ * @param request The OTel metrics request to export.
+ */
+void OTel::Export(std::unique_ptr<MetricsRequest>&& request)
+{
+ std::unique_lock lock(m_Mutex);
+ if (m_Exporting) {
+ Log(LogWarning, "OTelExporter")
+ << "Received export request while previous export is still in progress. Waiting for it to complete.";
+
+ m_ExportCV.wait(lock, [this] { return m_Stopped || !m_Exporting; });
+ if (m_Stopped) {
+ return;
+ }
+ }
+ m_Exporting = true;
+ lock.unlock();
+
+ // Access to m_Request is serialized via m_Strand, so we must post the actual export operation to it.
+ boost::asio::post(m_Strand, [this, keepAlive = ConstPtr(this), request = std::move(request)]() mutable {
+ m_Request = std::move(request);
+ m_ExportAsioCV.NotifyAll();
+ });
+}
+
+/**
+ * Populate the standard OTel resource attributes in the given ResourceMetrics Protobuf object.
+ *
+ * This method populates the standard OTel resource attributes as per OTel specifications[^1][^2]
+ * into the provided ResourceMetrics Protobuf object. It sets attributes such as service name,
+ * instance ID, version, and telemetry SDK information.
+ *
+ * @param rm The ResourceMetrics Protobuf object to populate.
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/semconv/resource/#telemetry-sdk
+ * [^2]: https://opentelemetry.io/docs/specs/semconv/resource/service/
+ */
+void OTel::PopulateResourceAttrs(const std::unique_ptr<v1_metrics::ResourceMetrics>& rm)
+{
+ using namespace std::string_view_literals;
+
+ rm->set_schema_url(l_OTelSchemaConv.data());
+ auto* resource = rm->mutable_resource();
+
+ auto* attr = resource->add_attributes();
+ SetAttribute(*attr, "service.name"sv, "Icinga 2"sv);
+
+ auto instanceID = Application::GetEnvironmentId();
+ if (instanceID.IsEmpty()) {
+ instanceID = "unknown";
+ }
+ attr = resource->add_attributes();
+ SetAttribute(*attr, "service.instance.id"sv, std::move(instanceID));
+
+ attr = resource->add_attributes();
+ SetAttribute(*attr, "service.version"sv, Application::GetAppVersion());
+
+ attr = resource->add_attributes();
+ // We don't actually use OTel SDKs here, but to comply with OTel specs, we need to provide these attributes anyway.
+ SetAttribute(*attr, "telemetry.sdk.language"sv, "cpp"sv);
+
+ attr = resource->add_attributes();
+ SetAttribute(*attr, "telemetry.sdk.name"sv, "Icinga 2 OTel Integration"sv);
+
+ attr = resource->add_attributes();
+ SetAttribute(*attr, "telemetry.sdk.version"sv, Application::GetAppVersion());
+
+ auto* ism = rm->add_scope_metrics();
+ ism->set_schema_url(l_OTelSchemaConv.data());
+ ism->mutable_scope()->set_name("icinga2");
+ ism->mutable_scope()->set_version(Application::GetAppVersion());
+}
+
+/**
+ * Establish a connection to the OpenTelemetry backend endpoint.
+ *
+ * In case of connection failures, it retries as per OTel spec[^1] with exponential backoff until a successful
+ * connection is established or the exporter is stopped. Therefore, @c m_Stream is not guaranteed to be valid
+ * after this method returns, so the caller must check it before using it.
+ *
+ * @param yc The Boost.Asio yield context for asynchronous operations.
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/otlp/#otlphttp-connection
+ */
+void OTel::Connect(boost::asio::yield_context& yc)
+{
+ Log(LogInformation, "OTelExporter")
+ << "Connecting to OpenTelemetry backend on host '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port << "'.";
+
+ for (uint64_t attempt = 1; !m_Stopped; ++attempt) {
+ try {
+ decltype(m_Stream) stream;
+ if (m_ConnInfo.EnableTls) {
+ stream = Shared<AsioTlsStream>::Make(m_Strand.context(), *m_TlsContext, m_ConnInfo.Host);
+ } else {
+ stream = Shared<AsioTcpStream>::Make(m_Strand.context());
+ }
+
+ Timeout timeout{m_Strand, boost::posix_time::seconds(10), [this, stream] {
+ Log(LogCritical, "OTelExporter")
+ << "Timeout while connecting to OpenTelemetry backend '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port << "', cancelling attempt.";
+
+ boost::system::error_code ec;
+ std::visit([&ec](auto& s) { s->lowest_layer().cancel(ec); }, *stream);
+ }};
+
+ std::visit([this, &yc](auto& streamArg) {
+ icinga::Connect(streamArg->lowest_layer(), m_ConnInfo.Host, std::to_string(m_ConnInfo.Port), yc);
+
+ if constexpr (std::is_same_v<std::decay_t<decltype(streamArg)>, Shared<AsioTlsStream>::Ptr>) {
+ streamArg->next_layer().async_handshake(AsioTlsStream::next_layer_type::client, yc);
+
+ if (m_ConnInfo.VerifyPeerCertificate && !streamArg->next_layer().IsVerifyOK()) {
+ BOOST_THROW_EXCEPTION(std::runtime_error(
+ "TLS certificate validation failed: " + streamArg->next_layer().GetVerifyError()
+ ));
+ }
+ }
+ }, *stream);
+
+ m_Stream = std::move(stream);
+
+ Log(LogInformation, "OTelExporter")
+ << "Successfully connected to OpenTelemetry backend.";
+ return;
+ } catch (const std::exception& ex) {
+ Log(m_Stopped ? LogDebug : LogCritical, "OTelExporter")
+ << "Cannot connect to OpenTelemetry backend '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port
+ << "' (attempt #" << attempt << "): " << ex.what();
+
+ if (!m_Stopped) {
+ boost::system::error_code ec;
+ m_RetryExportAndConnTimer.expires_after(Backoff(attempt));
+ m_RetryExportAndConnTimer.async_wait(yc[ec]);
+ }
+ }
+ }
+}
+
+/**
+ * Main export loop for exporting OTel metrics to the configured backend.
+ *
+ * This method runs in a loop, waiting for new metrics to be available for export. In case of export failures,
+ * it retries the export as per OTel spec[^1] with exponential backoff until the export succeeds or the exporter
+ * is stopped. After a successful export, it clears the exported metrics from @c m_Request to make room for new metrics.
+ *
+ * @param yc The Asio yield context for asynchronous operations.
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/otlp/#retryable-response-codes
+ */
+void OTel::ExportLoop(boost::asio::yield_context& yc)
+{
+ Defer cleanup{[this] {
+ m_Request.reset();
+ m_ExportAsioCV.NotifyAll();
+ ResetExporting(true /* notify all */);
+ }};
+
+ namespace ch = std::chrono;
+
+ while (true) {
+ // Wait for a new export request to be available. If the exporter is stopped while waiting,
+ // we will be notified without a new request, so we also check the stopped state here to
+ // avoid waiting indefinitely in that case.
+ while (!m_Request && !m_Stopped) {
+ m_ExportAsioCV.Wait(yc);
+ }
+
+ if (m_Stopped) {
+ break;
+ }
+
+ if (!m_Stream) {
+ Connect(yc);
+ }
+
+ for (uint64_t attempt = 1; m_Stream && !m_Stopped; ++attempt) {
+ try {
+ ExportImpl(yc);
+ m_Request.reset();
+ m_ExportAsioCV.NotifyAll();
+ ResetExporting(false /* notify one */);
+ break;
+ } catch (const RetryableExportError& ex) {
+ ch::milliseconds retryAfter;
+ if (auto throttle = ex.Throttle(); throttle > 0ms) {
+ retryAfter = throttle;
+ } else {
+ retryAfter = Backoff(attempt);
+ }
+
+ Log(LogWarning, "OTelExporter")
+ << "Failed to export metrics to OpenTelemetry backend (attempt #" << attempt << "). Retrying in "
+ << retryAfter.count() << "ms.";
+
+ boost::system::error_code ec;
+ m_RetryExportAndConnTimer.expires_after(retryAfter);
+ m_RetryExportAndConnTimer.async_wait(yc[ec]);
+ } catch (const std::exception& ex) {
+ LogSeverity severity = LogCritical;
+ const auto* ser{dynamic_cast<const boost::system::system_error*>(&ex)};
+ // Since we don't have a proper connection health check mechanism, we assume that certain errors
+ // indicate a broken connection and force a reconnect in those cases. For the `end_of_stream` case,
+ // we downgrade the log severity to debug level since this is a normal occurrence when using an OTEL
+ // collector compatible backend that don't honor keep-alive connections (e.g., OpenSearch Data Prepper).
+ if (m_Stopped || (ser && ser->code() == http::error::end_of_stream)) {
+ severity = LogDebug;
+ }
+ Log{severity, "OTelExporter", DiagnosticInformation(ex, false)};
+ m_Stream.reset(); // Force reconnect on next export attempt.
+ }
+ }
+ }
+}
+
+void OTel::ExportImpl(boost::asio::yield_context& yc) const
+{
+ AsioProtobufOutStream outputS{*m_Stream, m_ConnInfo, yc};
+ [[maybe_unused]] auto serialized = m_Request->SerializeToZeroCopyStream(&outputS);
+ ASSERT(serialized);
+ // Must have completed chunk writing successfully, otherwise reading the response will hang forever.
+ if (!outputS.WriterDone()) {
+ BOOST_THROW_EXCEPTION(std::runtime_error("BUG: Protobuf output stream writer did not complete successfully."));
+ }
+
+ IncomingHttpResponse responseMsg{*m_Stream};
+ responseMsg.Parse(yc);
+
+ if (auto ct = responseMsg[http::field::content_type]; ct != "application/x-protobuf") {
+ if (responseMsg.result() == http::status::ok) {
+ // Some OpenTelemetry Collector compatible backends (e.g., Prometheus OTLP Receiver) respond with 200 OK
+ // but without the expected Protobuf content type. So, don't do anything here since the request succeeded.
+ return;
+ }
+ Log(LogWarning, "OTelExporter")
+ << "Unexpected Content-Type from OpenTelemetry backend '" << ct << "' (" << responseMsg.reason() << "):\n"
+ << responseMsg.body();
+ } else if (responseMsg.result_int() >= 200 && responseMsg.result_int() <= 299) {
+ // We've got a valid Protobuf response, so we've to deserialize the body to check for partial success.
+ // See https://opentelemetry.io/docs/specs/otlp/#partial-success-1.
+ google::protobuf::Arena arena;
+ auto* response = MetricsResponse::default_instance().New(&arena);
+ [[maybe_unused]] auto deserialized = response->ParseFromString(responseMsg.body());
+ ASSERT(deserialized);
+
+ if (response->has_partial_success()) {
+ const auto& ps = response->partial_success();
+ const auto& msg = ps.error_message();
+ if (ps.rejected_data_points() > 0 || !msg.empty()) {
+ Log(LogWarning, "OTelExporter")
+ << "OpenTelemetry backend reported partial success: " << (msg.empty() ? "" : msg)
+ << " (" << ps.rejected_data_points() << " metric data points rejected).";
+ }
+ }
+ } else if (IsRetryableExportError(responseMsg.result())) {
+ uint64_t throttleSeconds = 0;
+ if (auto throttle = responseMsg[http::field::retry_after]; !throttle.empty()) {
+ try {
+ throttleSeconds = boost::lexical_cast<uint64_t>(throttle);
+ } catch (const std::exception& ex) {
+ Log(LogWarning, "OTelExporter")
+ << "Failed to parse 'Retry-After' header from OpenTelemetry backend response: " << ex.what();
+ }
+ }
+ BOOST_THROW_EXCEPTION(RetryableExportError{throttleSeconds});
+ } else {
+ Log(LogWarning, "OTelExporter")
+ << "OpenTelemetry backend responded with non-success and non-retryable status code "
+ << responseMsg.result_int() << " (" << responseMsg.reason() << ").\n" << responseMsg.body();
+ }
+}
+
+/**
+ * Reset the exporting state and notify waiters.
+ *
+ * This method resets the internal exporting state to indicate that no export is currently
+ * in progress. It then notifies either one or all waiters waiting for the export to complete,
+ * based on the @c notifyAll parameter.
+ *
+ * @param notifyAll If true, notifies all waiters; otherwise, notifies only one waiter.
+ */
+void OTel::ResetExporting(bool notifyAll)
+{
+ {
+ std::lock_guard lock(m_Mutex);
+ m_Exporting = false;
+ }
+ if (notifyAll) {
+ m_ExportCV.notify_all();
+ } else {
+ m_ExportCV.notify_one();
+ }
+}
+
+/**
+ * Validate the given OTel metric name according to OTel naming conventions[^1].
+ * Here's the ABNF definition for reference:
+ * @verbatim
+ * instrument-name = ALPHA 0*254 ("_" / "." / "-" / "/" / ALPHA / DIGIT)
+ * ALPHA = %x41-5A / %x61-7A; A-Z / a-z
+ * DIGIT = %x30-39 ; 0-9
+ * @endverbatim
+ *
+ * @param name The metric name to validate.
+ *
+ * @throws std::invalid_argument if the metric name is invalid.
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/otel/metrics/api/#instrument-name-syntax
+ */
+void OTel::ValidateName(std::string_view name)
+{
+ if (name.empty() || name.size() > 255) {
+ BOOST_THROW_EXCEPTION(std::invalid_argument("OTel instrument name must be between 1 and 255 characters long."));
+ }
+
+ auto isAlpha = [](char c) { return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); };
+ auto isDigit = [](char c) { return '0' <= c && c <= '9'; };
+ for (std::size_t i = 0; i < name.size(); ++i) {
+ auto c = name[i];
+ if (i == 0 && !isAlpha(c)) {
+ BOOST_THROW_EXCEPTION(std::invalid_argument("OTel instrument name must start with an alphabetic character."));
+ }
+ if (!isAlpha(c) && !isDigit(c) && c != '_' && c != '.' && c != '-' && c != '/') {
+ BOOST_THROW_EXCEPTION(std::invalid_argument(
+ "OTel instrument name contains invalid character '" + std::string(1, c) + "'."
+ ));
+ }
+ }
+}
+
+/**
+ * Set the given OTel attribute key-value pair in the provided @c Attribute Protobuf object.
+ *
+ * This method sets the given key-value pair in the provided KeyValue Protobuf object according to
+ * OTel specifications[^1]. While the OTel specs[^2] allows a wider range of attr value types, we
+ * only support the most common/scalar types (Boolean, Number (double), and String) for simplicity.
+ *
+ * @param attr The OTel attribute Protobuf object to set the value for.
+ * @param key The attribute key to set. Must not be empty.
+ * @param value The Value object containing the value to set in the attribute.
+ *
+ * @throws std::invalid_argument if key is empty or if @c Value represents an unsupported attribute value type.
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/otel/common/#attribute
+ * [^2]: https://opentelemetry.io/docs/specs/otel/common/#anyvalue
+ */
+template<class Key, class AttrVal>
+void OTel::SetAttribute(Attribute& attr, Key&& key, AttrVal&& value)
+{
+ if (begin(key) == end(key)) {
+ BOOST_THROW_EXCEPTION(std::invalid_argument("OTel attribute key must not be empty."));
+ }
+
+ if constexpr (std::is_rvalue_reference_v<Key&&> && std::is_same_v<std::decay_t<Key>, String>) {
+ attr.set_key(std::move(key.GetData()));
+ } else {
+ attr.set_key(std::string{std::forward<Key>(key)});
+ }
+
+ constexpr bool isRvalReference = std::is_rvalue_reference_v<AttrVal&&>;
+ if constexpr (isRvalReference && std::is_same_v<std::decay_t<AttrVal>, String>) {
+ attr.mutable_value()->set_string_value(std::move(value.GetData()));
+ } else if constexpr (std::is_constructible_v<std::string, AttrVal>) {
+ attr.mutable_value()->set_string_value(std::string{std::forward<AttrVal>(value)});
+ } else {
+ switch (value.GetType()) {
+ case ValueBoolean:
+ attr.mutable_value()->set_bool_value(value.template Get<bool>());
+ break;
+ case ValueNumber:
+ attr.mutable_value()->set_double_value(value.template Get<double>());
+ break;
+ case ValueString:
+ if (isRvalReference) {
+ attr.mutable_value()->set_string_value(std::move(value.template Get<String>().GetData()));
+ } else {
+ attr.mutable_value()->set_string_value(value.template Get<String>().GetData());
+ }
+ break;
+ default:
+ BOOST_THROW_EXCEPTION(std::invalid_argument(
+ "OTel attribute value must be of type Boolean, Number, or String, got '" + value.GetTypeName() + "'."
+ ));
+ }
+ }
+}
+
+/**
+ * Record a data point in the given OTel Gauge metric stream with the provided value, timestamps, and attributes.
+ *
+ * This method adds a new data point to the provided Gauge Protobuf object with the given value, start and end
+ * timestamps, and a set of attributes. The value can be either an int64_t or a double, depending on the type
+ * of the Gauge. The timestamps are expected to be in seconds and will be converted to nanoseconds as required
+ * by OTel specifications. The attributes are provided as a map of key-value pairs and will be set in the data
+ * point according to OTel attribute specs.
+ *
+ * @tparam T The type of the data point value, which must be either int64_t or double.
+ *
+ * @param gauge The Gauge Protobuf object to record the data point in.
+ * @param data The value of the data point to record.
+ * @param start The start timestamp of the data point in seconds.
+ * @param end The end timestamp of the data point in seconds.
+ * @param attrs A map of attribute key-value pairs to set in the data point.
+ *
+ * @return The size in bytes of the recorded data point after serialization.
+ *
+ * @throws std::invalid_argument if any attribute key is empty or has an unsupported value type.
+ */
+template<class T>
+std::size_t OTel::Record(Gauge& gauge, T data, double start, double end, AttrsMap attrs)
+{
+ namespace ch = std::chrono;
+
+ auto* dataPoint = gauge.add_data_points();
+ if constexpr (std::is_same_v<T, double>) {
+ dataPoint->set_as_double(data);
+ } else {
+ dataPoint->set_as_int(data);
+ }
+
+ dataPoint->set_start_time_unix_nano(
+ static_cast<uint64_t>(ch::duration_cast<ch::nanoseconds>(ch::duration<double>(start)).count())
+ );
+ dataPoint->set_time_unix_nano(
+ static_cast<uint64_t>(ch::duration_cast<ch::nanoseconds>(ch::duration<double>(end)).count())
+ );
+
+ while (!attrs.empty()) {
+ auto* attr = dataPoint->add_attributes();
+ auto node = attrs.extract(attrs.begin());
+ SetAttribute(*attr, std::move(node.key()), std::move(node.mapped()));
+ }
+ return dataPoint->ByteSizeLong();
+}
+
+/**
+ * Determine if the given HTTP status code represents a retryable export error as per OTel specs[^1].
+ *
+ * @param status The HTTP status code to check.
+ *
+ * @return true if the status code indicates a retryable error; false otherwise.
+ *
+ * [^1]: https://opentelemetry.io/docs/specs/otlp/#retryable-response-codes
+ */
+bool OTel::IsRetryableExportError(const http::status status)
+{
+ return status == http::status::too_many_requests
+ || status == http::status::bad_gateway
+ || status == http::status::service_unavailable
+ || status == http::status::gateway_timeout;
+}
+
+AsioProtobufOutStream::AsioProtobufOutStream(const AsioTlsOrTcpStream& stream, const OTelConnInfo& connInfo, boost::asio::yield_context yc)
+ : m_Writer{stream}, m_YieldContext{std::move(yc)}
+{
+ m_Writer.method(http::verb::post);
+ m_Writer.target(connInfo.MetricsEndpoint);
+ m_Writer.set(http::field::host, connInfo.Host + ":" + std::to_string(connInfo.Port));
+ m_Writer.set(http::field::content_type, "application/x-protobuf");
+ if (!connInfo.BasicAuth.IsEmpty()) {
+ m_Writer.set(http::field::authorization, "Basic " + connInfo.BasicAuth);
+ }
+ m_Writer.StartStreaming();
+}
+
+bool AsioProtobufOutStream::Next(void** data, int* size)
+{
+ if (m_Buffered == l_BufferSize) {
+ Flush();
+ }
+ // Prepare a new buffer segment that the Protobuf serializer can write into.
+ // The buffer size is fixed to l_BufferSize, and as seen above, we flush if the previous buffer
+ // segment was fully used (which is always the case on each Next call after the initial one), so
+ // we'll end up reusing the same memory region for each Next call because when we flush, we also
+ // consume the committed data, and that region becomes writable again.
+ auto buf = m_Writer.Prepare(l_BufferSize - m_Buffered);
+ *data = buf.data();
+ *size = static_cast<int>(l_BufferSize);
+ m_Buffered = l_BufferSize;
+ return true;
+}
+
+void AsioProtobufOutStream::BackUp(int count)
+{
+ // Make sure we've not already finalized the HTTP body because BackUp
+ // is supposed to be called only after a preceding (final) Next call.
+ ASSERT(!m_Writer.Done());
+ ASSERT(static_cast<std::size_t>(count) <= m_Buffered);
+ ASSERT(m_Buffered == l_BufferSize);
+ // If the last prepared buffer segment was not fully used, we need to adjust the buffered size,
+ // so that we don't commit unused memory regions with the below Flush() call. If count is zero,
+ // this adjustment is a no-op, and indicates that the entire buffer was used and there won't be
+ // any subsequent Next calls anymore (i.e., the Protobuf serialization is complete).
+ m_Buffered -= count;
+ Flush(true);
+}
+
+int64_t AsioProtobufOutStream::ByteCount() const
+{
+ return m_Pos + static_cast<int64_t>(m_Buffered);
+}
+
+/**
+ * Flush any buffered data to the underlying Asio stream.
+ *
+ * If the `finish` parameter is set to true, it indicates that no more data will
+ * be buffered/generated, and the HTTP body will be finalized accordingly.
+ *
+ * @param finish Whether this is the final flush operation.
+ */
+void AsioProtobufOutStream::Flush(bool finish)
+{
+ ASSERT(m_Buffered > 0 || finish);
+ m_Writer.Commit(m_Buffered);
+ m_Writer.Flush(m_YieldContext, finish);
+ m_Pos += static_cast<int64_t>(m_Buffered);
+ m_Buffered = 0;
+}
+
+/**
+ * Check if the underlying HTTP request writer has completed writing.
+ *
+ * @return true if the writer has finished writing; false otherwise.
+ */
+bool AsioProtobufOutStream::WriterDone()
+{
+ return m_Writer.Done();
+}
diff --git a/lib/otel/otel.hpp b/lib/otel/otel.hpp
new file mode 100644
index 00000000000..eb45bedffb7
--- /dev/null
+++ b/lib/otel/otel.hpp
@@ -0,0 +1,190 @@
+// SPDX-FileCopyrightText: 2026 Icinga GmbH
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include "base/io-engine.hpp"
+#include "base/tlsstream.hpp"
+#include "base/shared.hpp"
+#include "base/shared-object.hpp"
+#include "base/string.hpp"
+#include "remote/httpmessage.hpp"
+#include "otel/opentelemetry/proto/collector/metrics/v1/metrics_service.pb.h"
+#include
+#include
+#include
+#include
+#include