First test kinda working
This commit is contained in:
116
substrate/src/observability.rs
Normal file
116
substrate/src/observability.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
//! M5 — Prometheus-format `/metrics` exporter installation and counter
//! pre-registration.
//!
//! Counters and histograms are emitted from the demux path
//! ([`crate::transport::server`]) and the world systems
//! ([`crate::world::ingest_system`], [`crate::world::simulation_system`],
//! [`crate::world::export_system`]). This module's only job is:
//!
//! 1. Install the global metrics recorder + HTTP listener on the existing
//!    tokio runtime, once at startup.
//! 2. Pre-register every counter at value 0 so panels render "0" rather than
//!    "No data" before the first event of a given kind fires.
//!
//! ## Runtime telemetry
//!
//! - `substrate_received_total{tier=t1|t2|t3}` — counter
//! - `substrate_dropped_total{tier=t1}` — counter (T1 lossy)
//! - `substrate_decode_errors_total{tier=t1|t2|t3}` — counter
//! - `substrate_t3_no_handler_total` — counter
//! - `substrate_latency_us{tier=t1|t2|t3}` — histogram
//! - `substrate_tick_hz` — gauge
//! - `substrate_entities` — gauge
//! - `substrate_channel_depth{tier=t1|t2|t3}` — gauge
//! - `substrate_channel_capacity{tier=t1|t2|t3}` — gauge
//! - `substrate_rss_bytes` — gauge
//!
//! ## Digital-twin surface (operator dashboard)
//!
//! - `sensor_aggregate{type=…, stat=count|mean|min|max}` — gauge
//! - `substrate_threshold_crossings_total{type, direction}` — counter
|
||||
|
||||
use std::net::SocketAddr;
|
||||
|
||||
use bevy::prelude::*;
|
||||
use metrics::counter;
|
||||
use metrics_exporter_prometheus::PrometheusBuilder;
|
||||
|
||||
use crate::config::AppConfig;
|
||||
use crate::transport::SensorType;
|
||||
use crate::transport::ecs::TokioHandle;
|
||||
|
||||
/// Bevy plugin that, when metrics are enabled in [`AppConfig`], installs the
/// Prometheus exporter on the configured listen address and pre-registers the
/// module's counters at zero.
pub struct ObservabilityPlugin;
|
||||
|
||||
impl Plugin for ObservabilityPlugin {
|
||||
fn build(&self, app: &mut App) {
|
||||
let config = app
|
||||
.world()
|
||||
.get_resource::<AppConfig>()
|
||||
.expect("AppConfig must be inserted before ObservabilityPlugin");
|
||||
|
||||
if !config.observability.metrics_enabled {
|
||||
tracing::info!("metrics exporter disabled by config");
|
||||
return;
|
||||
}
|
||||
|
||||
let listen: SocketAddr = config
|
||||
.observability
|
||||
.metrics_listen
|
||||
.parse()
|
||||
.expect("invalid metrics_listen address in config");
|
||||
|
||||
let runtime_handle = app
|
||||
.world()
|
||||
.get_resource::<TokioHandle>()
|
||||
.expect("TokioHandle must be inserted before ObservabilityPlugin (load order: transport plugin first)")
|
||||
.0
|
||||
.clone();
|
||||
|
||||
// PrometheusBuilder::install spawns the HTTP listener via tokio::spawn,
|
||||
// which requires being inside a runtime context.
|
||||
let _guard = runtime_handle.enter();
|
||||
PrometheusBuilder::new()
|
||||
.with_http_listener(listen)
|
||||
.install()
|
||||
.expect("install prometheus exporter");
|
||||
drop(_guard);
|
||||
|
||||
tracing::info!(?listen, "metrics exporter installed");
|
||||
|
||||
pre_register_counters();
|
||||
}
|
||||
}
|
||||
|
||||
/// Pre-register every counter at value 0 so Grafana sees a series to plot
|
||||
/// even before the first event of that kind. Without this, the Prometheus
|
||||
/// exporter omits any counter that has never been incremented, and panels
|
||||
/// render "No data" — confusing when the metric exists, the counter is just
|
||||
/// genuinely zero (e.g., `substrate_t3_no_handler_total` in normal operation).
|
||||
fn pre_register_counters() {
|
||||
for tier in ["t1", "t2", "t3"] {
|
||||
counter!("substrate_received_total", "tier" => tier).increment(0);
|
||||
counter!("substrate_decode_errors_total", "tier" => tier).increment(0);
|
||||
}
|
||||
counter!("substrate_dropped_total", "tier" => "t1").increment(0);
|
||||
counter!("substrate_t3_no_handler_total").increment(0);
|
||||
|
||||
// Threshold crossings — bounded `|SensorType| × 2` cardinality, all
|
||||
// pre-registered so dashboard panels show "0" instead of "No data".
|
||||
for t in [
|
||||
SensorType::Generic,
|
||||
SensorType::Temperature,
|
||||
SensorType::Humidity,
|
||||
SensorType::Pressure,
|
||||
SensorType::Voltage,
|
||||
SensorType::Current,
|
||||
] {
|
||||
for direction in ["up", "down"] {
|
||||
counter!(
|
||||
"substrate_threshold_crossings_total",
|
||||
"type" => t.label_str(),
|
||||
"direction" => direction
|
||||
)
|
||||
.increment(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user