Files
quic_ecs_dt/substrate/src/observability.rs
2026-05-12 11:21:40 -04:00

117 lines
4.3 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! M5 — Prometheus-format `/metrics` exporter installation and counter
//! pre-registration.
//!
//! Counters and histograms are emitted from the demux path
//! ([`crate::transport::server`]) and the world systems
//! ([`crate::world::ingest_system`], [`crate::world::simulation_system`],
//! [`crate::world::export_system`]). This module's only job is:
//!
//! 1. Install the global metrics recorder + HTTP listener on the existing
//! tokio runtime, once at startup.
//! 2. Pre-register every counter at value 0 so panels render "0" rather than
//! "No data" before the first event of a given kind fires.
//!
//! ## Runtime telemetry
//!
//! - `substrate_received_total{tier=t1|t2|t3}` — counter
//! - `substrate_dropped_total{tier=t1}` — counter (T1 lossy)
//! - `substrate_decode_errors_total{tier=t1|t2|t3}` — counter
//! - `substrate_t3_no_handler_total` — counter
//! - `substrate_latency_us{tier=t1|t2|t3}` — histogram
//! - `substrate_tick_hz` — gauge
//! - `substrate_entities` — gauge
//! - `substrate_channel_depth{tier=t1|t2|t3}` — gauge
//! - `substrate_channel_capacity{tier=t1|t2|t3}` — gauge
//! - `substrate_rss_bytes` — gauge
//!
//! ## Digital-twin surface (operator dashboard)
//!
//! - `sensor_aggregate{type=…, stat=count|mean|min|max}` — gauge
//! - `substrate_threshold_crossings_total{type=…, direction=up|down}` — counter
use std::net::SocketAddr;
use bevy::prelude::*;
use metrics::counter;
use metrics_exporter_prometheus::PrometheusBuilder;
use crate::config::AppConfig;
use crate::transport::SensorType;
use crate::transport::ecs::TokioHandle;
/// Bevy plugin that installs the Prometheus metrics exporter and
/// pre-registers the crate's counters at zero.
///
/// Building the plugin panics if the [`AppConfig`] resource is missing.
/// When metrics are enabled in that config, the [`TokioHandle`] resource
/// must also already be present.
pub struct ObservabilityPlugin;
impl Plugin for ObservabilityPlugin {
    /// Installs the Prometheus recorder and its HTTP listener, then
    /// zero-registers the counters via [`pre_register_counters`].
    ///
    /// When `observability.metrics_enabled` is false in [`AppConfig`], this
    /// logs that the exporter is disabled and returns without installing
    /// anything.
    ///
    /// # Panics
    ///
    /// - if the [`AppConfig`] resource has not been inserted yet;
    /// - if `observability.metrics_listen` does not parse as a [`SocketAddr`];
    /// - if the [`TokioHandle`] resource has not been inserted yet;
    /// - if the Prometheus exporter fails to install.
    fn build(&self, app: &mut App) {
        let world = app.world();

        // Only the observability section of the config is needed here.
        let settings = &world
            .get_resource::<AppConfig>()
            .expect("AppConfig must be inserted before ObservabilityPlugin")
            .observability;

        if !settings.metrics_enabled {
            tracing::info!("metrics exporter disabled by config");
            return;
        }

        let listen: SocketAddr = settings
            .metrics_listen
            .parse()
            .expect("invalid metrics_listen address in config");

        let tokio_resource = world
            .get_resource::<TokioHandle>()
            .expect("TokioHandle must be inserted before ObservabilityPlugin (load order: transport plugin first)");
        let runtime_handle = tokio_resource.0.clone();

        // `install` spawns the HTTP listener as a tokio task, so the runtime
        // context has to be entered for the duration of the call. The block
        // scope releases the context guard as soon as the exporter is up.
        {
            let _runtime_ctx = runtime_handle.enter();
            PrometheusBuilder::new()
                .with_http_listener(listen)
                .install()
                .expect("install prometheus exporter");
        }

        tracing::info!(?listen, "metrics exporter installed");

        // Must run after the recorder is installed so the zero samples land
        // in the Prometheus registry.
        pre_register_counters();
    }
}
/// Touch every counter with `increment(0)` so each series exists from startup.
///
/// The Prometheus exporter leaves out counters that have never been
/// incremented, which makes Grafana panels show "No data" for metrics that
/// are legitimately zero (e.g. `substrate_t3_no_handler_total` during normal
/// operation). Emitting a zero increment up front makes those panels render
/// "0" instead.
fn pre_register_counters() {
    const TIERS: [&str; 3] = ["t1", "t2", "t3"];
    const DIRECTIONS: [&str; 2] = ["up", "down"];

    // Per-tier traffic and decode-error counters.
    for tier_label in TIERS {
        let received = counter!("substrate_received_total", "tier" => tier_label);
        received.increment(0);

        let decode_errors = counter!("substrate_decode_errors_total", "tier" => tier_label);
        decode_errors.increment(0);
    }

    // Drops are only reported for the lossy T1 tier.
    let dropped = counter!("substrate_dropped_total", "tier" => "t1");
    dropped.increment(0);

    let no_handler = counter!("substrate_t3_no_handler_total");
    no_handler.increment(0);

    // Threshold crossings: one series per (sensor type, direction) pair, so
    // the cardinality is bounded by the list below times two.
    // NOTE(review): assumes this list names every `SensorType` variant —
    // confirm against the enum when variants are added.
    let sensor_types = [
        SensorType::Generic,
        SensorType::Temperature,
        SensorType::Humidity,
        SensorType::Pressure,
        SensorType::Voltage,
        SensorType::Current,
    ];
    for sensor in sensor_types {
        for crossing_direction in DIRECTIONS {
            let crossings = counter!(
                "substrate_threshold_crossings_total",
                "type" => sensor.label_str(),
                "direction" => crossing_direction
            );
            crossings.increment(0);
        }
    }
}