Flip T3 to substrate-initiated actuator commands

This commit is contained in:
Valère Plantevin
2026-05-13 15:03:23 -04:00
parent 272d3b3c59
commit baa075fe0f
22 changed files with 1003 additions and 749 deletions

View File

@@ -25,6 +25,13 @@ pub struct QuicConfig {
pub t1_capacity: usize,
pub t2_capacity: usize,
pub t3_capacity: usize,
/// Bench-only knob. When > 0, the substrate spawns a synthetic T3
/// driver that issues toggling Relay commands to every connected device
/// at the configured rate, exercising the real outbound code path.
/// Off by default (0.0) in production. Override via env:
/// `APP_NETWORK__SYNTHETIC_T3_RATE_HZ=100`.
#[serde(default)]
pub synthetic_t3_rate_hz: f64,
}
#[derive(Debug, Serialize, Deserialize)]
@@ -47,6 +54,7 @@ impl Default for AppConfig {
t1_capacity: 1024,
t2_capacity: 512,
t3_capacity: 256,
synthetic_t3_rate_hz: 0.0,
},
simulation: SimulationConfig {
tick_rate_hz: 60,
@@ -65,7 +73,9 @@ impl AppConfig {
Figment::new()
.merge(Serialized::defaults(Self::default())) // compiled-in defaults
.merge(Toml::file(config_file)) // config file
.merge(Env::prefixed("APP_")) // env overrides, e.g. APP_NETWORK__PORT=9000
// env overrides — `__` is the nesting separator so
// `APP_NETWORK__SERVER_PORT=9001` overrides `network.server_port`.
.merge(Env::prefixed("APP_").split("__"))
.extract()
}
}

View File

@@ -6,26 +6,41 @@ use tokio::runtime::Handle;
use tokio::sync::mpsc;
use crate::config::AppConfig;
use crate::transport::{QuicMessage, T1Sender, T2Sender, T3Inbound, T3Sender};
use crate::transport::server::{accept_loop, bind_endpoint};
use crate::transport::{OutboundT3, QuicMessage, T1Sender, T2Sender, T3OutboundSender};
use crate::transport::server::{ConnectionRegistry, accept_loop, bind_endpoint, new_connection_registry};
use crate::transport::state::ServerState;
pub struct EcsQuicTransportPlugin;
/// Receive halves of the three tier channels, wrapped so they can sit in a
/// Bevy `Resource`. The `world` module's ingest system is the sole reader.
/// Receive halves of the inbound tier channels (T1 datagrams, T2 uni
/// streams). The `world` module's ingest system is the sole reader.
/// T3 is substrate-initiated and lives on the tokio side via the outbound
/// drain task — no inbound T3 receiver exists here.
#[derive(Resource)]
pub(crate) struct BridgeReceivers {
pub(crate) t1: Mutex<mpsc::Receiver<QuicMessage>>,
pub(crate) t2: Mutex<mpsc::Receiver<QuicMessage>>,
pub(crate) t3: Mutex<mpsc::Receiver<T3Inbound>>,
}
#[derive(Resource, Clone)]
pub(crate) struct BridgeSenders {
pub(crate) t1: T1Sender,
pub(crate) t2: T2Sender,
pub(crate) t3: T3Sender,
/// Outbound actuator-command sender — `automation_system` enqueues
/// `OutboundT3` items here; the tokio drain task routes them to the
/// originating device's connection.
pub(crate) t3_out: T3OutboundSender,
}
/// Holds the receiver half of the outbound-T3 channel until the listener
/// starts, plus the connection registry and a sender clone for the optional
/// synthetic T3 driver. All pass into `accept_loop` once at the
/// `Starting → Started` transition.
#[derive(Resource)]
pub(crate) struct OutboundT3Plumbing {
pub(crate) rx: Mutex<Option<mpsc::Receiver<OutboundT3>>>,
pub(crate) tx: mpsc::Sender<OutboundT3>,
pub(crate) registry: ConnectionRegistry,
}
#[derive(Resource, Clone)]
@@ -37,6 +52,7 @@ fn start_quic_server(
config: Res<AppConfig>,
senders: Res<BridgeSenders>,
runtime: Res<TokioHandle>,
outbound: Res<OutboundT3Plumbing>,
mut next: ResMut<NextState<ServerState>>,
) {
tracing::info!("entering ServerState::Starting — bringing up QUIC listener");
@@ -50,8 +66,29 @@ fn start_quic_server(
tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC listener bound");
// Move the outbound receiver into the tokio side; accept_loop owns it for
// the rest of the listener's life. The registry is cloned (it's already an
// `Arc`) so the ECS-side resource can still observe the routes if needed.
let outbound_rx = outbound
.rx
.lock()
.unwrap()
.take()
.expect("OutboundT3 receiver consumed twice");
let outbound_tx = outbound.tx.clone();
let registry = outbound.registry.clone();
let synthetic_rate = config.network.synthetic_t3_rate_hz;
let s = senders.clone();
runtime.0.spawn(accept_loop(endpoint, s.t1, s.t2, s.t3));
runtime.0.spawn(accept_loop(
endpoint,
s.t1,
s.t2,
registry,
outbound_rx,
outbound_tx,
synthetic_rate,
));
next.set(ServerState::Started);
tracing::info!("ServerState::Started");
@@ -60,11 +97,15 @@ fn start_quic_server(
impl Plugin for EcsQuicTransportPlugin {
fn build(&self, app: &mut App) {
let config = app.world_mut().resource::<AppConfig>();
// Three-tier bridge between the tokio-side QUIC accept loop and the
// Inbound bridge: T1 datagrams + T2 uni streams from devices into the
// ECS PreUpdate ingest system (in the `world` module).
let (t1_tx, t1_rx) = mpsc::channel::<QuicMessage>(config.network.t1_capacity);
let (t2_tx, t2_rx) = mpsc::channel::<QuicMessage>(config.network.t2_capacity);
let (t3_tx, t3_rx) = mpsc::channel::<T3Inbound>(config.network.t3_capacity);
// Outbound-T3: substrate → device actuator-command path. Capacity
// budget tracks automation cadence, not per-sample throughput.
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(config.network.t3_capacity);
let registry = new_connection_registry();
// Spawn a tokio runtime on a dedicated OS thread, ship its Handle back
// to the ECS, and keep the runtime alive for the lifetime of the app
@@ -96,12 +137,16 @@ impl Plugin for EcsQuicTransportPlugin {
.insert_resource(BridgeSenders {
t1: T1Sender::new(t1_tx),
t2: T2Sender::new(t2_tx),
t3: T3Sender::new(t3_tx),
t3_out: T3OutboundSender::new(t3_out_tx.clone()),
})
.insert_resource(BridgeReceivers {
t1: Mutex::new(t1_rx),
t2: Mutex::new(t2_rx),
t3: Mutex::new(t3_rx),
})
.insert_resource(OutboundT3Plumbing {
rx: Mutex::new(Some(t3_out_rx)),
tx: t3_out_tx,
registry,
})
.add_systems(OnEnter(ServerState::Starting), start_quic_server);
}

View File

@@ -2,7 +2,7 @@ pub mod ecs;
pub mod server;
pub mod state;
use tokio::sync::{mpsc, oneshot};
use tokio::sync::mpsc;
/// Logical type of a sensor reading. Travels in `QuicMessage::sensor_type`
/// so the substrate (and any downstream dashboard) knows which units / range
@@ -224,28 +224,36 @@ impl T2Sender {
}
}
/// Tier 3 — actuator command on a QUIC bidirectional stream, paired with a
/// `oneshot` channel the ECS uses to write the ack back over the same stream.
pub struct T3Inbound {
pub command: QuicMessage,
pub reply: oneshot::Sender<QuicMessage>,
/// Outbound T3 — actuator setpoint the substrate sends to a connected device.
/// The `automation_system` constructs these; the tokio-side drain task builds
/// the full `QuicMessage` (assigns timestamp + sequence) and opens a bi-stream
/// to the target device.
#[derive(Debug, Clone, Copy)]
pub struct OutboundT3 {
pub target_device: uuid::Uuid,
pub sensor_id: u16,
pub raw_value: f64,
/// `SensorType` discriminant of the actuator (typically `Relay`).
pub sensor_type: u8,
}
#[derive(Clone)]
pub struct T3Sender {
inner: mpsc::Sender<T3Inbound>,
pub struct T3OutboundSender {
inner: mpsc::Sender<OutboundT3>,
}
impl T3Sender {
pub fn new(inner: mpsc::Sender<T3Inbound>) -> Self {
impl T3OutboundSender {
pub fn new(inner: mpsc::Sender<OutboundT3>) -> Self {
Self { inner }
}
pub async fn send(
/// Non-blocking enqueue. Returns `Ok(())` on success; `Err` mirrors
/// tokio's `TrySendError` so callers can distinguish "full" from "closed".
pub fn try_send(
&self,
inbound: T3Inbound,
) -> Result<(), mpsc::error::SendError<T3Inbound>> {
self.inner.send(inbound).await
cmd: OutboundT3,
) -> Result<(), mpsc::error::TrySendError<OutboundT3>> {
self.inner.try_send(cmd)
}
pub fn depth(&self) -> usize {

View File

@@ -1,16 +1,50 @@
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::Arc;
use std::sync::{Arc, RwLock};
use std::time::Instant;
use anyhow::{Context, anyhow};
use metrics::counter;
use metrics::{counter, histogram};
use quinn::{
Connection, Endpoint, Incoming, RecvStream, SendStream, ServerConfig, StreamId, TransportConfig,
Connection, Endpoint, Incoming, RecvStream, ServerConfig, StreamId, TransportConfig,
};
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
use tokio::sync::oneshot;
use tokio::sync::mpsc;
use uuid::Uuid;
use crate::config::QuicConfig;
use crate::transport::{QuicMessage, T1Sender, T2Sender, T3Inbound, T3Sender};
use crate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender};
/// Maps each known device UUID to the QUIC `Connection` that hosts it.
/// Several UUIDs typically point at the same `Connection` (one simulator
/// process commonly represents multiple virtual devices). `quinn::Connection`
/// is internally `Arc`-backed so cloning is cheap.
///
/// Held inside an `Arc<RwLock<…>>` so the tokio readers can register on first
/// message and `drain_outbound_t3` can look up routes at automation cadence.
/// Critical sections are tiny sync map ops — no `.await` while the lock is
/// held — so `std::sync::RwLock` is the right choice over `tokio::sync::*`.
pub type ConnectionRegistry = Arc<RwLock<HashMap<Uuid, Connection>>>;
pub fn new_connection_registry() -> ConnectionRegistry {
Arc::new(RwLock::new(HashMap::new()))
}
/// Insert (device → connection) if absent. Idempotent so it can be called
/// per-message without measurable cost on the hot ingest path.
fn ensure_registered(registry: &ConnectionRegistry, device_id: Uuid, conn: &Connection) {
let need_insert = {
let guard = registry.read().unwrap();
!guard.contains_key(&device_id)
};
if need_insert {
registry
.write()
.unwrap()
.entry(device_id)
.or_insert_with(|| conn.clone());
}
}
/// Datagram receive buffer in bytes. Sized to absorb microbursts at the
/// telemetry rates.
@@ -66,22 +100,102 @@ pub fn bind_endpoint(cfg: &QuicConfig) -> anyhow::Result<Endpoint> {
Endpoint::server(server_config, addr).context("Endpoint::server bind")
}
/// Accept loop: per-connection senders are cloned from the tier handles and
/// shipped into `handle_incoming` for orchestration.
pub async fn accept_loop(endpoint: Endpoint, t1: T1Sender, t2: T2Sender, t3: T3Sender) {
/// Accept loop. Owns the outbound-T3 drain task and the connection registry,
/// then clones per-connection state into `handle_incoming` for orchestration.
///
/// The drain task is spawned exactly once for the lifetime of the listener;
/// it routes ECS-issued `OutboundT3` commands to the right connection by
/// looking up `target_device` in the registry that `handle_incoming` populates.
///
/// Tier semantics: T1 datagrams + T2 uni streams come *in* from devices;
/// T3 bi streams are server-initiated for actuator commands and go *out*
/// via `drain_outbound_t3`. Devices never open bi streams to the substrate.
///
/// If `synthetic_t3_rate_hz > 0`, a bench-only task drives toggling Relay
/// commands at that rate through the same outbound channel — used by the
/// cross-tier isolation benchmark.
pub async fn accept_loop(
endpoint: Endpoint,
t1: T1Sender,
t2: T2Sender,
registry: ConnectionRegistry,
outbound_rx: mpsc::Receiver<OutboundT3>,
outbound_tx: mpsc::Sender<OutboundT3>,
synthetic_t3_rate_hz: f64,
) {
tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC accept loop running");
tokio::spawn(drain_outbound_t3(registry.clone(), outbound_rx));
if synthetic_t3_rate_hz > 0.0 {
tracing::info!(rate_hz = synthetic_t3_rate_hz, "synthetic T3 driver enabled");
tokio::spawn(synthetic_t3_driver(
registry.clone(),
outbound_tx.clone(),
synthetic_t3_rate_hz,
));
}
drop(outbound_tx);
while let Some(incoming) = endpoint.accept().await {
let t1 = t1.clone();
let t2 = t2.clone();
let t3 = t3.clone();
tokio::spawn(handle_incoming(incoming, t1, t2, t3));
let registry = registry.clone();
tokio::spawn(handle_incoming(incoming, t1, t2, registry));
}
tracing::info!("QUIC accept loop exited");
}
/// Per-connection orchestrator. Performs the handshake and spawns one reader
/// per tier, then waits for the connection to close and joins the readers.
async fn handle_incoming(incoming: Incoming, t1: T1Sender, t2: T2Sender, t3: T3Sender) {
/// Bench-only synthetic T3 driver. Round-robins over every registered device,
/// pushing a toggling Relay setpoint through the outbound channel at the
/// configured rate. Exercises the same code path as `automation_system`, so
/// the cross-tier-isolation bench measures the real path.
async fn synthetic_t3_driver(
registry: ConnectionRegistry,
tx: mpsc::Sender<OutboundT3>,
rate_hz: f64,
) {
let period = std::time::Duration::from_nanos((1.0e9 / rate_hz) as u64);
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
let mut next_value = 1.0;
loop {
ticker.tick().await;
// Snapshot device list under read lock; release before doing async work.
let devices: Vec<Uuid> = registry.read().unwrap().keys().copied().collect();
if devices.is_empty() {
continue;
}
for device in devices {
let cmd = OutboundT3 {
target_device: device,
sensor_id: 6,
raw_value: next_value,
sensor_type: SensorType::Relay.as_u8(),
};
if tx.try_send(cmd).is_err() {
counter!("substrate_t3_outbound_dropped_total").increment(1);
}
}
// Toggle for the next round so we exercise both setpoints.
next_value = if next_value > 0.5 { 0.0 } else { 1.0 };
}
}
/// Per-connection orchestrator. Performs the handshake and spawns the T1
/// datagram + T2 uni-stream readers; T3 outbound is handled connection-wide
/// by `drain_outbound_t3`. Waits for the connection to close, then purges
/// the registry and joins the inbound readers.
async fn handle_incoming(
incoming: Incoming,
t1: T1Sender,
t2: T2Sender,
registry: ConnectionRegistry,
) {
let conn = match incoming.await {
Ok(c) => c,
Err(e) => {
@@ -90,30 +204,34 @@ async fn handle_incoming(incoming: Incoming, t1: T1Sender, t2: T2Sender, t3: T3S
}
};
let remote = conn.remote_address();
tracing::info!(?remote, "connection established");
let stable_id = conn.stable_id();
tracing::info!(?remote, stable_id, "connection established");
// One task per tier — fully wired across T1/T2/T3.
let dgram_task = tokio::spawn(read_datagrams(conn.clone(), t1));
let uni_task = tokio::spawn(read_uni_streams(conn.clone(), t2));
let bi_task = tokio::spawn(accept_bi_streams(conn.clone(), t3));
let dgram_task = tokio::spawn(read_datagrams(conn.clone(), t1, registry.clone()));
let uni_task = tokio::spawn(read_uni_streams(conn.clone(), t2, registry.clone()));
let _ = conn.closed().await;
// Purge every device UUID that pointed at this connection. Cheap: 7 entries
// for an industrial-profile simulator, occasional disconnect.
registry
.write()
.unwrap()
.retain(|_, c| c.stable_id() != stable_id);
if let Err(e) = dgram_task.await {
tracing::warn!(?remote, error = %e, "T1 datagram task ended unexpectedly");
}
if let Err(e) = uni_task.await {
tracing::warn!(?remote, error = %e, "T2 uni stream task ended unexpectedly");
}
if let Err(e) = bi_task.await {
tracing::warn!(?remote, error = %e, "T3 bi stream task ended unexpectedly");
}
tracing::info!(?remote, "connection closed");
}
/// T1 — read QUIC datagrams, decode each as a fixed-size `QuicMessage`, push
/// into the lossy T1 channel.
async fn read_datagrams(conn: Connection, t1: T1Sender) {
/// into the lossy T1 channel. Registers the sending device in the connection
/// registry on first sight so outbound T3 commands can find this connection.
async fn read_datagrams(conn: Connection, t1: T1Sender, registry: ConnectionRegistry) {
let remote = conn.remote_address();
let mut received: u64 = 0;
let mut dropped: u64 = 0;
@@ -125,6 +243,7 @@ async fn read_datagrams(conn: Connection, t1: T1Sender) {
Ok(msg) => {
received += 1;
counter!("substrate_received_total", "tier" => "t1").increment(1);
ensure_registered(&registry, msg.device_id, &conn);
if !t1.send_lossy(msg) {
dropped += 1;
counter!("substrate_dropped_total", "tier" => "t1").increment(1);
@@ -161,7 +280,7 @@ async fn read_datagrams(conn: Connection, t1: T1Sender) {
/// reading 38-byte chunks until EOF (one stream may carry one event or many).
/// Cross-stream interleaving is allowed; ordering is only guaranteed *within*
/// a stream, matching QUIC's stream semantics.
async fn read_uni_streams(conn: Connection, t2: T2Sender) {
async fn read_uni_streams(conn: Connection, t2: T2Sender, registry: ConnectionRegistry) {
let remote = conn.remote_address();
let mut streams_accepted: u64 = 0;
@@ -180,14 +299,22 @@ async fn read_uni_streams(conn: Connection, t2: T2Sender) {
};
streams_accepted += 1;
let t2 = t2.clone();
tokio::spawn(read_one_uni_stream(remote, recv, t2));
let conn = conn.clone();
let registry = registry.clone();
tokio::spawn(read_one_uni_stream(remote, recv, t2, conn, registry));
}
}
/// Per-stream worker for T2. Reads fixed-size `QuicMessage`s back-to-back,
/// awaits backpressure on the T2 channel, and resets the stream on a decode
/// failure (one corrupt stream shouldn't take down the whole connection).
async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sender) {
async fn read_one_uni_stream(
remote: SocketAddr,
mut recv: RecvStream,
t2: T2Sender,
conn: Connection,
registry: ConnectionRegistry,
) {
let stream_id: StreamId = recv.id();
let mut buf = [0u8; QuicMessage::WIRE_SIZE];
let mut count: u64 = 0;
@@ -198,6 +325,7 @@ async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sen
Ok(msg) => {
count += 1;
counter!("substrate_received_total", "tier" => "t2").increment(1);
ensure_registered(&registry, msg.device_id, &conn);
if t2.send(msg).await.is_err() {
// T2 receiver dropped (substrate shutting down).
tracing::warn!(
@@ -236,115 +364,107 @@ async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sen
}
}
/// T3 — accept bidirectional streams. Each stream is one command/ack
/// exchange, modeled per the paper's "per-command oneshot channels": the
/// reader pushes a `T3Inbound { command, reply }` to the ECS, awaits the
/// response on `reply_rx`, and writes it back on the same stream.
async fn accept_bi_streams(conn: Connection, t3: T3Sender) {
let remote = conn.remote_address();
let mut streams_accepted: u64 = 0;
/// T3 outbound drain — the substrate side of the actuator-command path.
///
/// Pops `OutboundT3` items the ECS produced, looks up the target device's
/// connection in the registry, and **spawns one tokio task per command** to
/// do the actual `open_bi() → write → finish → read_ack` round-trip. The
/// drain task itself never blocks on a per-command await, so a single stuck
/// `read_exact` (e.g. peer dropping mid-stream while Quinn's idle timeout
/// counts down) cannot stall the pipeline.
///
/// Per-stream task records `substrate_latency_us{tier="t3"}` from
/// `open_bi()` start to ack-receipt and increments
/// `substrate_received_total{tier="t3"}` on success.
///
/// Per-`(device, sensor)` sequence numbers are owned here so the wire-level
/// concerns stay out of the ECS.
async fn drain_outbound_t3(registry: ConnectionRegistry, mut rx: mpsc::Receiver<OutboundT3>) {
let mut seq_by_target: HashMap<(Uuid, u16), u32> = HashMap::new();
loop {
let (send, recv) = match conn.accept_bi().await {
Ok(s) => s,
Err(e) => {
while let Some(cmd) = rx.recv().await {
let conn = match registry.read().unwrap().get(&cmd.target_device).cloned() {
Some(c) => c,
None => {
counter!("substrate_t3_outbound_no_route_total").increment(1);
tracing::debug!(
?remote,
streams_accepted,
error = %e,
"T3 bi accept loop ended"
device = %cmd.target_device,
"outbound T3: no route, dropping"
);
return;
continue;
}
};
streams_accepted += 1;
let t3 = t3.clone();
tokio::spawn(read_one_bi_stream(remote, send, recv, t3));
let key = (cmd.target_device, cmd.sensor_id);
let seq = {
let s = seq_by_target.entry(key).or_insert(0);
let v = *s;
*s = s.wrapping_add(1);
v
};
let msg = QuicMessage {
device_id: cmd.target_device,
sensor_id: cmd.sensor_id,
raw_value: cmd.raw_value,
timestamp_us: now_us(),
sequence_number: seq,
sensor_type: cmd.sensor_type,
};
// One task per command. Concurrent in-flight bi-streams are
// first-class in QUIC, and this keeps the channel-drain loop hot.
tokio::spawn(async move {
let started = Instant::now();
match send_outbound_t3(&conn, &msg).await {
Ok(ack) => {
let elapsed_us = started.elapsed().as_micros() as f64;
histogram!("substrate_latency_us", "tier" => "t3").record(elapsed_us);
counter!("substrate_received_total", "tier" => "t3").increment(1);
tracing::trace!(
device = %msg.device_id,
sensor_id = msg.sensor_id,
raw = msg.raw_value,
ack_raw = ack.raw_value,
elapsed_us,
"outbound T3 completed"
);
}
Err(e) => {
counter!("substrate_t3_outbound_errors_total").increment(1);
tracing::warn!(
device = %msg.device_id,
sensor_id = msg.sensor_id,
error = %e,
"outbound T3 failed"
);
}
}
});
}
tracing::info!("outbound T3 drain task exited");
}
/// Per-stream worker for T3. Reads exactly one command, ships it with a
/// `oneshot::Sender` to the ECS, awaits the reply, writes it back. If the
/// ECS drops the oneshot (no handler installed), the stream is reset so the
/// client sees an explicit reset instead of a half-open stream.
async fn read_one_bi_stream(
remote: SocketAddr,
mut send: SendStream,
mut recv: RecvStream,
t3: T3Sender,
) {
let stream_id: StreamId = recv.id();
/// Single substrate-initiated T3 round-trip: open bi-stream, write command,
/// finish send half, read 39-byte ack, decode.
async fn send_outbound_t3(conn: &Connection, cmd: &QuicMessage) -> anyhow::Result<QuicMessage> {
let (mut send, mut recv) = conn.open_bi().await.context("open_bi for outbound T3")?;
send.write_all(&cmd.to_bytes())
.await
.context("write outbound T3 command")?;
send.finish().context("finish outbound T3 send half")?;
let mut buf = [0u8; QuicMessage::WIRE_SIZE];
if let Err(e) = recv.read_exact(&mut buf).await {
tracing::trace!(
?remote,
?stream_id,
error = %e,
"T3: incomplete command read; closing"
);
return;
}
let command = match QuicMessage::decode(&buf) {
Ok(m) => m,
Err(e) => {
counter!("substrate_decode_errors_total", "tier" => "t3").increment(1);
tracing::warn!(
?remote,
?stream_id,
error = %e,
"T3 command decode failed; resetting stream"
);
let _ = recv.stop(0u32.into());
let _ = send.reset(0u32.into());
return;
}
};
counter!("substrate_received_total", "tier" => "t3").increment(1);
let (reply_tx, reply_rx) = oneshot::channel::<QuicMessage>();
let inbound = T3Inbound {
command,
reply: reply_tx,
};
if t3.send(inbound).await.is_err() {
tracing::warn!(?remote, ?stream_id, "T3 channel closed; abandoning command");
let _ = send.reset(0u32.into());
return;
}
let response = match reply_rx.await {
Ok(msg) => msg,
Err(_) => {
// ECS dropped the oneshot. With M4's handler installed this
// shouldn't happen normally; if it does, the stream is reset so
// the client sees a clean signal.
counter!("substrate_t3_no_handler_total").increment(1);
tracing::debug!(
?remote,
?stream_id,
"T3: no handler for command, resetting stream"
);
let _ = send.reset(0u32.into());
return;
}
};
if let Err(e) = send.write_all(&response.to_bytes()).await {
tracing::warn!(
?remote,
?stream_id,
error = %e,
"T3 ack write failed"
);
return;
}
if let Err(e) = send.finish() {
tracing::warn!(
?remote,
?stream_id,
error = %e,
"T3 ack finish failed"
);
}
recv.read_exact(&mut buf)
.await
.context("read outbound T3 ack")?;
QuicMessage::decode(&buf).context("decode outbound T3 ack")
}
fn now_us() -> u64 {
use std::time::{SystemTime, UNIX_EPOCH};
SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_micros() as u64)
.unwrap_or(0)
}

View File

@@ -13,9 +13,10 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use bevy::prelude::*;
use metrics::{counter, gauge, histogram};
use tokio::sync::mpsc::error::TrySendError;
use crate::transport::ecs::{BridgeReceivers, BridgeSenders};
use crate::transport::{QuicMessage, SensorType};
use crate::transport::{OutboundT3, QuicMessage, SensorType};
use super::components::{
Asset, DeviceId, RawSensorData, SensorId, SensorTypeTag, SmoothedValue, threshold_for,
@@ -26,12 +27,11 @@ use super::resources::{DiagnosticsState, ExportSampleState, SensorRegistry};
/// either drains next tick or gets dropped on full (T1's contract is lossy).
const T1_INGEST_BATCH: usize = 1024;
const T2_INGEST_BATCH: usize = 512;
const T3_INGEST_BATCH: usize = 256;
/// Drain the three tier channels into ECS state.
///
/// T1: bounded batch (lossy); T2: full drain (reliable); T3: full drain, with
/// each command answered by an ack carrying the device's current sensor value.
/// Drain the two inbound tier channels (T1 datagrams, T2 uni streams) into
/// ECS state. T1 is bounded-batch and lossy; T2 is fully drained per tick.
/// T3 is *outbound* (substrate → device, actuator commands) and lives in
/// the tokio runtime — see `transport::server::drain_outbound_t3`.
pub(super) fn ingest_system(
bridge: Res<BridgeReceivers>,
mut registry: ResMut<SensorRegistry>,
@@ -69,39 +69,6 @@ pub(super) fn ingest_system(
}
}
}
// T3 — bidirectional commands. Reply with the device's most recent
// sensor value (NaN if we've never seen this (device, sensor) before).
{
let mut t3 = bridge.t3.lock().unwrap();
for _ in 0..T3_INGEST_BATCH {
match t3.try_recv() {
Ok(inbound) => {
histogram!("substrate_latency_us", "tier" => "t3")
.record(now.saturating_sub(inbound.command.timestamp_us) as f64);
let key = (inbound.command.device_id, inbound.command.sensor_id);
let current_value = registry
.map
.get(&key)
.and_then(|&e| q.get(e).ok())
.map(|d| d.raw_value)
.unwrap_or(f64::NAN);
let ack = QuicMessage {
device_id: inbound.command.device_id,
sensor_id: inbound.command.sensor_id,
raw_value: current_value,
timestamp_us: now_us(),
sequence_number: inbound.command.sequence_number,
sensor_type: inbound.command.sensor_type,
};
// Ignore send errors: the demux task may have given up if the
// connection died while we were processing.
let _ = inbound.reply.send(ack);
}
Err(_) => break,
}
}
}
}
fn upsert_reading(
@@ -144,8 +111,17 @@ fn upsert_reading(
registry.map.insert(key, entity);
}
/// Closed-loop automation triggered by T1/T2 sensor data, affecting a T3 actuator.
/// Closed-loop automation: Presence threshold crossings trigger a T3 actuator
/// command going *out* to the originating device (substrate → simulator), and
/// a parallel local Relay-entity update so the operator dashboard reflects the
/// dispatched setpoint immediately (Grafana panels read the local ECS state).
///
/// The Relay actuator id is fixed at `6` in the industrial profile — see
/// `simulator/src/profile.rs::build_slots`.
const RELAY_SENSOR_ID: u16 = 6;
pub(super) fn automation_system(
senders: Res<BridgeSenders>,
mut registry: ResMut<SensorRegistry>,
mut commands: Commands,
mut p: ParamSet<(
@@ -156,7 +132,8 @@ pub(super) fn automation_system(
let mut triggers = Vec::new();
for (dev_id, tag, data) in p.p0().iter() {
if tag.0 == SensorType::Presence {
// Trigger threshold: 1.0 seconds
// Presence > 1.0 s ⇒ no occupancy detected ⇒ motor may run (relay 0).
// Presence < 1.0 s ⇒ occupancy detected ⇒ stop motor (relay 1).
let relay_state = if data.raw_value < 1.0 { 1.0 } else { 0.0 };
triggers.push((dev_id.0, relay_state));
}
@@ -164,15 +141,36 @@ pub(super) fn automation_system(
let mut q = p.p1();
for (device_id, relay_state) in triggers {
let msg = QuicMessage {
// 1) Dispatch the real actuator command to the device over T3.
let cmd = OutboundT3 {
target_device: device_id,
sensor_id: RELAY_SENSOR_ID,
raw_value: relay_state,
sensor_type: SensorType::Relay.as_u8(),
};
match senders.t3_out.try_send(cmd) {
Ok(()) => {}
Err(TrySendError::Full(_)) => {
counter!("substrate_t3_outbound_dropped_total").increment(1);
tracing::warn!(device = %device_id, "outbound T3 channel full; setpoint dropped");
}
Err(TrySendError::Closed(_)) => {
// Drain task is gone — substrate shutting down. Quiet log.
tracing::debug!("outbound T3 channel closed");
}
}
// 2) Mirror the setpoint into the local Relay entity so the dashboard
// sees automation activity without waiting for the device ack.
let mirror = QuicMessage {
device_id,
sensor_id: 6, // Relay is always 6 in our industrial profile
sensor_id: RELAY_SENSOR_ID,
raw_value: relay_state,
timestamp_us: now_us(),
sequence_number: 0,
sensor_type: SensorType::Relay.as_u8(),
};
upsert_reading(&mut registry, &mut commands, &mut q, msg);
upsert_reading(&mut registry, &mut commands, &mut q, mirror);
}
}
@@ -222,11 +220,11 @@ pub(super) fn export_system(
gauge!("substrate_channel_depth", "tier" => "t1").set(senders.t1.depth() as f64);
gauge!("substrate_channel_depth", "tier" => "t2").set(senders.t2.depth() as f64);
gauge!("substrate_channel_depth", "tier" => "t3").set(senders.t3.depth() as f64);
gauge!("substrate_channel_depth", "tier" => "t3").set(senders.t3_out.depth() as f64);
gauge!("substrate_channel_capacity", "tier" => "t1").set(senders.t1.capacity() as f64);
gauge!("substrate_channel_capacity", "tier" => "t2").set(senders.t2.capacity() as f64);
gauge!("substrate_channel_capacity", "tier" => "t3").set(senders.t3.capacity() as f64);
gauge!("substrate_channel_capacity", "tier" => "t3").set(senders.t3_out.capacity() as f64);
if let Some(stats) = memory_stats::memory_stats() {
gauge!("substrate_rss_bytes").set(stats.physical_mem as f64);

View File

@@ -8,12 +8,12 @@ use std::sync::Mutex;
use bevy::prelude::*;
use bevy::state::app::StatesPlugin;
use tokio::sync::{mpsc, oneshot};
use tokio::sync::mpsc;
use uuid::Uuid;
use crate::transport::ecs::{BridgeReceivers, BridgeSenders};
use crate::transport::state::ServerState;
use crate::transport::{QuicMessage, SensorType, T1Sender, T2Sender, T3Inbound, T3Sender};
use crate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender, T3OutboundSender};
use super::WorldPlugin;
use super::components::{RawSensorData, SMOOTHED_WINDOW, SmoothedValue, threshold_for};
@@ -21,20 +21,22 @@ use super::resources::SensorRegistry;
/// Build a Bevy app with just enough plugins/resources to run the world
/// systems against test-owned channels. No QUIC, no tokio runtime.
///
/// Returns the app plus the T1/T2 send halves and the outbound-T3 receive
/// half — the latter so tests can observe `automation_system` dispatching.
fn make_test_app() -> (
App,
mpsc::Sender<QuicMessage>,
mpsc::Sender<QuicMessage>,
mpsc::Sender<T3Inbound>,
mpsc::Receiver<OutboundT3>,
) {
let (t1_tx, t1_rx) = mpsc::channel::<QuicMessage>(64);
let (t2_tx, t2_rx) = mpsc::channel::<QuicMessage>(64);
let (t3_tx, t3_rx) = mpsc::channel::<T3Inbound>(64);
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(64);
let bridge = BridgeReceivers {
t1: Mutex::new(t1_rx),
t2: Mutex::new(t2_rx),
t3: Mutex::new(t3_rx),
};
// export_system samples channel depth/capacity from the senders; it
// requires the resource even when the test pushes via the raw senders
@@ -42,7 +44,7 @@ fn make_test_app() -> (
let senders = BridgeSenders {
t1: T1Sender::new(t1_tx.clone()),
t2: T2Sender::new(t2_tx.clone()),
t3: T3Sender::new(t3_tx.clone()),
t3_out: T3OutboundSender::new(t3_out_tx),
};
let mut app = App::new();
@@ -60,14 +62,14 @@ fn make_test_app() -> (
// Process the state transition before tests push messages.
app.update();
(app, t1_tx, t2_tx, t3_tx)
(app, t1_tx, t2_tx, t3_out_rx)
}
// ---- ingest_system: entity lifecycle and T3 ack semantics ----
// ---- ingest_system: entity lifecycle ----
#[test]
fn ingest_t1_creates_entity_and_writes_raw_data() {
let (mut app, t1_tx, _t2_tx, _t3_tx) = make_test_app();
let (mut app, t1_tx, _t2_tx, _t3_out_rx) = make_test_app();
let device = Uuid::from_u128(0xa1a2_a3a4_a5a6_a7a8_a9aa_abac_adae_afb0);
let msg = QuicMessage {
@@ -103,7 +105,7 @@ fn ingest_t1_creates_entity_and_writes_raw_data() {
#[test]
fn ingest_t1_repeated_messages_update_in_place() {
let (mut app, t1_tx, _t2_tx, _t3_tx) = make_test_app();
let (mut app, t1_tx, _t2_tx, _t3_out_rx) = make_test_app();
let device = Uuid::new_v4();
// First reading.
@@ -143,54 +145,46 @@ fn ingest_t1_repeated_messages_update_in_place() {
}
#[test]
fn ingest_t3_replies_with_current_sensor_value() {
let (mut app, t1_tx, _t2_tx, t3_tx) = make_test_app();
fn automation_dispatches_relay_stop_when_presence_drops() {
// The automation_system runs after simulation_system, which only emits a
// crossing when the *smoothed* mean transitions; for this test we just
// confirm that a Presence reading below threshold ends up enqueued as an
// OutboundT3 Relay=stop command. Repeated below-threshold pushes prime
// the rolling mean.
let (mut app, t1_tx, _t2_tx, mut t3_out_rx) = make_test_app();
let device = Uuid::new_v4();
// Seed a T1 reading so the (device, sensor) entity exists.
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 9,
raw_value: 42.0,
timestamp_us: 1,
sequence_number: 1,
sensor_type: SensorType::Temperature.as_u8(),
})
.unwrap();
app.update();
app.update();
// Send a T3 command and capture the ack via the oneshot.
let (reply_tx, reply_rx) = oneshot::channel();
t3_tx
.try_send(T3Inbound {
command: QuicMessage {
for seq in 0..SMOOTHED_WINDOW as u32 {
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 9,
raw_value: 0.0,
timestamp_us: 0,
sequence_number: 7,
sensor_type: SensorType::Temperature.as_u8(),
},
reply: reply_tx,
})
.unwrap();
app.update();
sensor_id: 5,
raw_value: 0.5, // below the 1.0 s threshold
timestamp_us: u64::from(seq),
sequence_number: seq,
sensor_type: SensorType::Presence.as_u8(),
})
.unwrap();
app.update();
app.update();
}
let ack = reply_rx
.blocking_recv()
.expect("ECS handler should have replied");
assert_eq!(ack.device_id, device);
assert_eq!(ack.sensor_id, 9);
assert_eq!(ack.sequence_number, 7, "ack preserves correlation id");
assert_eq!(ack.raw_value, 42.0, "ack carries the latest sensor reading");
assert_eq!(
ack.typ(),
SensorType::Temperature,
"ack preserves sensor type"
// Drain whatever automation dispatched. We expect at least one Relay=stop
// command targeting the device.
let mut saw_stop = false;
while let Ok(cmd) = t3_out_rx.try_recv() {
if cmd.target_device == device
&& cmd.sensor_type == SensorType::Relay.as_u8()
&& cmd.raw_value > 0.5
{
saw_stop = true;
}
}
assert!(
saw_stop,
"automation_system should have enqueued an outbound Relay=stop \
command for {device} after sustained sub-threshold Presence readings"
);
assert!(ack.timestamp_us > 0, "ack stamped with server time");
}
// ---- SmoothedValue unit tests ----
@@ -240,7 +234,7 @@ fn smoothed_value_ignores_nonfinite() {
#[test]
fn simulation_smoothes_and_detects_threshold_crossing() {
let (mut app, t1_tx, _t2_tx, _t3_tx) = make_test_app();
let (mut app, t1_tx, _t2_tx, _t3_out_rx) = make_test_app();
let device = Uuid::new_v4();
let threshold = threshold_for(SensorType::Temperature); // 22.0 °C