First test kinda working

This commit is contained in:
Valère Plantevin
2026-05-12 11:21:40 -04:00
parent cac6c9ac02
commit d3f09ee062
36 changed files with 3903 additions and 102 deletions

View File

@@ -0,0 +1,116 @@
//! M5 — Prometheus-format `/metrics` exporter installation and counter
//! pre-registration.
//!
//! Counters and histograms are emitted from the demux path
//! ([`crate::transport::server`]) and the world systems
//! ([`crate::world::ingest_system`], [`crate::world::simulation_system`],
//! [`crate::world::export_system`]). This module's only job is:
//!
//! 1. Install the global metrics recorder + HTTP listener on the existing
//! tokio runtime, once at startup.
//! 2. Pre-register every counter at value 0 so panels render "0" rather than
//! "No data" before the first event of a given kind fires.
//!
//! ## Runtime telemetry
//!
//! - `substrate_received_total{tier=t1|t2|t3}` — counter
//! - `substrate_dropped_total{tier=t1}` — counter (T1 lossy)
//! - `substrate_decode_errors_total{tier=t1|t2|t3}` — counter
//! - `substrate_t3_no_handler_total` — counter
//! - `substrate_latency_us{tier=t1|t2|t3}` — histogram
//! - `substrate_tick_hz` — gauge
//! - `substrate_entities` — gauge
//! - `substrate_channel_depth{tier=t1|t2|t3}` — gauge
//! - `substrate_channel_capacity{tier=t1|t2|t3}` — gauge
//! - `substrate_rss_bytes` — gauge
//!
//! ## Digital-twin surface (operator dashboard)
//!
//! - `sensor_aggregate{type=…, stat=count|mean|min|max}` — gauge
//! - `substrate_threshold_crossings_total{type, direction}` — counter
use std::net::SocketAddr;
use bevy::prelude::*;
use metrics::counter;
use metrics_exporter_prometheus::PrometheusBuilder;
use crate::config::AppConfig;
use crate::transport::SensorType;
use crate::transport::ecs::TokioHandle;
/// Bevy plugin that installs the Prometheus `/metrics` exporter and seeds the
/// known counters at zero.
///
/// When built, the plugin reads `AppConfig` and `TokioHandle` from the world,
/// so both resources must already be inserted. If metrics are disabled in the
/// configuration, building the plugin only logs that fact and installs nothing.
pub struct ObservabilityPlugin;
impl Plugin for ObservabilityPlugin {
    /// Installs the Prometheus exporter and pre-registers the counters,
    /// unless the configuration has metrics switched off.
    ///
    /// # Panics
    ///
    /// Panics if `AppConfig` or `TokioHandle` has not been inserted into the
    /// world yet, if `observability.metrics_listen` does not parse as a
    /// socket address, or if installing the exporter fails.
    fn build(&self, app: &mut App) {
        // Only the observability section of the config is needed here.
        let settings = &app
            .world()
            .get_resource::<AppConfig>()
            .expect("AppConfig must be inserted before ObservabilityPlugin")
            .observability;

        if !settings.metrics_enabled {
            tracing::info!("metrics exporter disabled by config");
            return;
        }

        let bind_addr: SocketAddr = settings
            .metrics_listen
            .parse()
            .expect("invalid metrics_listen address in config");

        // Own a clone of the handle so nothing keeps borrowing the world
        // while the exporter is being installed.
        let tokio_handle = app
            .world()
            .get_resource::<TokioHandle>()
            .expect("TokioHandle must be inserted before ObservabilityPlugin (load order: transport plugin first)")
            .0
            .clone();

        {
            // Make the shared runtime the current one for the duration of
            // `install()`, which is where the exporter spawns its HTTP
            // listener task. The context is left again at the end of this
            // scope, before anything else runs.
            let _runtime_ctx = tokio_handle.enter();
            PrometheusBuilder::new()
                .with_http_listener(bind_addr)
                .install()
                .expect("install prometheus exporter");
        }

        tracing::info!(listen = ?bind_addr, "metrics exporter installed");
        pre_register_counters();
    }
}
/// Seeds every known counter series with a zero increment.
///
/// The Prometheus exporter leaves out any counter that has never been
/// incremented, so a dashboard panel for such a metric shows "No data" even
/// though the metric exists and its true value is simply zero (in normal
/// operation `substrate_t3_no_handler_total` is exactly that case). Touching
/// each series once with `increment(0)` makes Grafana plot "0" from startup.
fn pre_register_counters() {
    const TIERS: [&str; 3] = ["t1", "t2", "t3"];
    const DIRECTIONS: [&str; 2] = ["up", "down"];

    // Ingest-path counters: one series per tier for each metric.
    for tier_label in TIERS {
        counter!("substrate_received_total", "tier" => tier_label).increment(0);
        counter!("substrate_decode_errors_total", "tier" => tier_label).increment(0);
    }

    // Drops are seeded for the `t1` label only; the no-handler counter
    // carries no labels at all.
    counter!("substrate_dropped_total", "tier" => "t1").increment(0);
    counter!("substrate_t3_no_handler_total").increment(0);

    // Threshold crossings: one series per (sensor type, direction) pair, so
    // cardinality is bounded by `|SensorType| × 2`.
    // NOTE(review): this list is maintained by hand — confirm it still covers
    // every `SensorType` variant whenever the enum changes.
    let sensor_kinds = [
        SensorType::Generic,
        SensorType::Temperature,
        SensorType::Humidity,
        SensorType::Pressure,
        SensorType::Voltage,
        SensorType::Current,
    ];
    for kind in sensor_kinds {
        for direction in DIRECTIONS {
            counter!(
                "substrate_threshold_crossings_total",
                "type" => kind.label_str(),
                "direction" => direction
            )
            .increment(0);
        }
    }
}