Flip T3 to substrate-initiated actuator commands
This commit is contained in:
@@ -1,16 +1,50 @@
|
||||
use std::collections::HashMap;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use metrics::counter;
|
||||
use metrics::{counter, histogram};
|
||||
use quinn::{
|
||||
Connection, Endpoint, Incoming, RecvStream, SendStream, ServerConfig, StreamId, TransportConfig,
|
||||
Connection, Endpoint, Incoming, RecvStream, ServerConfig, StreamId, TransportConfig,
|
||||
};
|
||||
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
|
||||
use tokio::sync::oneshot;
|
||||
use tokio::sync::mpsc;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::config::QuicConfig;
|
||||
use crate::transport::{QuicMessage, T1Sender, T2Sender, T3Inbound, T3Sender};
|
||||
use crate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender};
|
||||
|
||||
/// Maps each known device UUID to the QUIC `Connection` that hosts it.
|
||||
/// Several UUIDs typically point at the same `Connection` (one simulator
|
||||
/// process commonly represents multiple virtual devices). `quinn::Connection`
|
||||
/// is internally `Arc`-backed so cloning is cheap.
|
||||
///
|
||||
/// Held inside an `Arc<RwLock<…>>` so the tokio readers can register on first
|
||||
/// message and `drain_outbound_t3` can look up routes at automation cadence.
|
||||
/// Critical sections are tiny sync map ops — no `.await` while the lock is
|
||||
/// held — so `std::sync::RwLock` is the right choice over `tokio::sync::*`.
|
||||
pub type ConnectionRegistry = Arc<RwLock<HashMap<Uuid, Connection>>>;
|
||||
|
||||
pub fn new_connection_registry() -> ConnectionRegistry {
|
||||
Arc::new(RwLock::new(HashMap::new()))
|
||||
}
|
||||
|
||||
/// Insert (device → connection) if absent. Idempotent so it can be called
|
||||
/// per-message without measurable cost on the hot ingest path.
|
||||
fn ensure_registered(registry: &ConnectionRegistry, device_id: Uuid, conn: &Connection) {
|
||||
let need_insert = {
|
||||
let guard = registry.read().unwrap();
|
||||
!guard.contains_key(&device_id)
|
||||
};
|
||||
if need_insert {
|
||||
registry
|
||||
.write()
|
||||
.unwrap()
|
||||
.entry(device_id)
|
||||
.or_insert_with(|| conn.clone());
|
||||
}
|
||||
}
|
||||
|
||||
/// Datagram receive buffer in bytes. Sized to absorb microbursts at the
|
||||
/// telemetry rates.
|
||||
@@ -66,22 +100,102 @@ pub fn bind_endpoint(cfg: &QuicConfig) -> anyhow::Result<Endpoint> {
|
||||
Endpoint::server(server_config, addr).context("Endpoint::server bind")
|
||||
}
|
||||
|
||||
/// Accept loop: per-connection senders are cloned from the tier handles and
|
||||
/// shipped into `handle_incoming` for orchestration.
|
||||
pub async fn accept_loop(endpoint: Endpoint, t1: T1Sender, t2: T2Sender, t3: T3Sender) {
|
||||
/// Accept loop. Owns the outbound-T3 drain task and the connection registry,
|
||||
/// then clones per-connection state into `handle_incoming` for orchestration.
|
||||
///
|
||||
/// The drain task is spawned exactly once for the lifetime of the listener;
|
||||
/// it routes ECS-issued `OutboundT3` commands to the right connection by
|
||||
/// looking up `target_device` in the registry that `handle_incoming` populates.
|
||||
///
|
||||
/// Tier semantics: T1 datagrams + T2 uni streams come *in* from devices;
|
||||
/// T3 bi streams are server-initiated for actuator commands and go *out*
|
||||
/// via `drain_outbound_t3`. Devices never open bi streams to the substrate.
|
||||
///
|
||||
/// If `synthetic_t3_rate_hz > 0`, a bench-only task drives toggling Relay
|
||||
/// commands at that rate through the same outbound channel — used by the
|
||||
/// cross-tier isolation benchmark.
|
||||
pub async fn accept_loop(
|
||||
endpoint: Endpoint,
|
||||
t1: T1Sender,
|
||||
t2: T2Sender,
|
||||
registry: ConnectionRegistry,
|
||||
outbound_rx: mpsc::Receiver<OutboundT3>,
|
||||
outbound_tx: mpsc::Sender<OutboundT3>,
|
||||
synthetic_t3_rate_hz: f64,
|
||||
) {
|
||||
tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC accept loop running");
|
||||
|
||||
tokio::spawn(drain_outbound_t3(registry.clone(), outbound_rx));
|
||||
|
||||
if synthetic_t3_rate_hz > 0.0 {
|
||||
tracing::info!(rate_hz = synthetic_t3_rate_hz, "synthetic T3 driver enabled");
|
||||
tokio::spawn(synthetic_t3_driver(
|
||||
registry.clone(),
|
||||
outbound_tx.clone(),
|
||||
synthetic_t3_rate_hz,
|
||||
));
|
||||
}
|
||||
drop(outbound_tx);
|
||||
|
||||
while let Some(incoming) = endpoint.accept().await {
|
||||
let t1 = t1.clone();
|
||||
let t2 = t2.clone();
|
||||
let t3 = t3.clone();
|
||||
tokio::spawn(handle_incoming(incoming, t1, t2, t3));
|
||||
let registry = registry.clone();
|
||||
tokio::spawn(handle_incoming(incoming, t1, t2, registry));
|
||||
}
|
||||
tracing::info!("QUIC accept loop exited");
|
||||
}
|
||||
|
||||
/// Per-connection orchestrator. Performs the handshake and spawns one reader
|
||||
/// per tier, then waits for the connection to close and joins the readers.
|
||||
async fn handle_incoming(incoming: Incoming, t1: T1Sender, t2: T2Sender, t3: T3Sender) {
|
||||
/// Bench-only synthetic T3 driver. Round-robins over every registered device,
|
||||
/// pushing a toggling Relay setpoint through the outbound channel at the
|
||||
/// configured rate. Exercises the same code path as `automation_system`, so
|
||||
/// the cross-tier-isolation bench measures the real path.
|
||||
async fn synthetic_t3_driver(
|
||||
registry: ConnectionRegistry,
|
||||
tx: mpsc::Sender<OutboundT3>,
|
||||
rate_hz: f64,
|
||||
) {
|
||||
let period = std::time::Duration::from_nanos((1.0e9 / rate_hz) as u64);
|
||||
let mut ticker = tokio::time::interval(period);
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
|
||||
let mut next_value = 1.0;
|
||||
loop {
|
||||
ticker.tick().await;
|
||||
|
||||
// Snapshot device list under read lock; release before doing async work.
|
||||
let devices: Vec<Uuid> = registry.read().unwrap().keys().copied().collect();
|
||||
if devices.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
for device in devices {
|
||||
let cmd = OutboundT3 {
|
||||
target_device: device,
|
||||
sensor_id: 6,
|
||||
raw_value: next_value,
|
||||
sensor_type: SensorType::Relay.as_u8(),
|
||||
};
|
||||
if tx.try_send(cmd).is_err() {
|
||||
counter!("substrate_t3_outbound_dropped_total").increment(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Toggle for the next round so we exercise both setpoints.
|
||||
next_value = if next_value > 0.5 { 0.0 } else { 1.0 };
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-connection orchestrator. Performs the handshake and spawns the T1
|
||||
/// datagram + T2 uni-stream readers; T3 outbound is handled connection-wide
|
||||
/// by `drain_outbound_t3`. Waits for the connection to close, then purges
|
||||
/// the registry and joins the inbound readers.
|
||||
async fn handle_incoming(
|
||||
incoming: Incoming,
|
||||
t1: T1Sender,
|
||||
t2: T2Sender,
|
||||
registry: ConnectionRegistry,
|
||||
) {
|
||||
let conn = match incoming.await {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
@@ -90,30 +204,34 @@ async fn handle_incoming(incoming: Incoming, t1: T1Sender, t2: T2Sender, t3: T3S
|
||||
}
|
||||
};
|
||||
let remote = conn.remote_address();
|
||||
tracing::info!(?remote, "connection established");
|
||||
let stable_id = conn.stable_id();
|
||||
tracing::info!(?remote, stable_id, "connection established");
|
||||
|
||||
// One task per tier — fully wired across T1/T2/T3.
|
||||
let dgram_task = tokio::spawn(read_datagrams(conn.clone(), t1));
|
||||
let uni_task = tokio::spawn(read_uni_streams(conn.clone(), t2));
|
||||
let bi_task = tokio::spawn(accept_bi_streams(conn.clone(), t3));
|
||||
let dgram_task = tokio::spawn(read_datagrams(conn.clone(), t1, registry.clone()));
|
||||
let uni_task = tokio::spawn(read_uni_streams(conn.clone(), t2, registry.clone()));
|
||||
|
||||
let _ = conn.closed().await;
|
||||
|
||||
// Purge every device UUID that pointed at this connection. Cheap: 7 entries
|
||||
// for an industrial-profile simulator, occasional disconnect.
|
||||
registry
|
||||
.write()
|
||||
.unwrap()
|
||||
.retain(|_, c| c.stable_id() != stable_id);
|
||||
|
||||
if let Err(e) = dgram_task.await {
|
||||
tracing::warn!(?remote, error = %e, "T1 datagram task ended unexpectedly");
|
||||
}
|
||||
if let Err(e) = uni_task.await {
|
||||
tracing::warn!(?remote, error = %e, "T2 uni stream task ended unexpectedly");
|
||||
}
|
||||
if let Err(e) = bi_task.await {
|
||||
tracing::warn!(?remote, error = %e, "T3 bi stream task ended unexpectedly");
|
||||
}
|
||||
tracing::info!(?remote, "connection closed");
|
||||
}
|
||||
|
||||
/// T1 — read QUIC datagrams, decode each as a fixed-size `QuicMessage`, push
|
||||
/// into the lossy T1 channel.
|
||||
async fn read_datagrams(conn: Connection, t1: T1Sender) {
|
||||
/// into the lossy T1 channel. Registers the sending device in the connection
|
||||
/// registry on first sight so outbound T3 commands can find this connection.
|
||||
async fn read_datagrams(conn: Connection, t1: T1Sender, registry: ConnectionRegistry) {
|
||||
let remote = conn.remote_address();
|
||||
let mut received: u64 = 0;
|
||||
let mut dropped: u64 = 0;
|
||||
@@ -125,6 +243,7 @@ async fn read_datagrams(conn: Connection, t1: T1Sender) {
|
||||
Ok(msg) => {
|
||||
received += 1;
|
||||
counter!("substrate_received_total", "tier" => "t1").increment(1);
|
||||
ensure_registered(®istry, msg.device_id, &conn);
|
||||
if !t1.send_lossy(msg) {
|
||||
dropped += 1;
|
||||
counter!("substrate_dropped_total", "tier" => "t1").increment(1);
|
||||
@@ -161,7 +280,7 @@ async fn read_datagrams(conn: Connection, t1: T1Sender) {
|
||||
/// reading 38-byte chunks until EOF (one stream may carry one event or many).
|
||||
/// Cross-stream interleaving is allowed; ordering is only guaranteed *within*
|
||||
/// a stream, matching QUIC's stream semantics.
|
||||
async fn read_uni_streams(conn: Connection, t2: T2Sender) {
|
||||
async fn read_uni_streams(conn: Connection, t2: T2Sender, registry: ConnectionRegistry) {
|
||||
let remote = conn.remote_address();
|
||||
let mut streams_accepted: u64 = 0;
|
||||
|
||||
@@ -180,14 +299,22 @@ async fn read_uni_streams(conn: Connection, t2: T2Sender) {
|
||||
};
|
||||
streams_accepted += 1;
|
||||
let t2 = t2.clone();
|
||||
tokio::spawn(read_one_uni_stream(remote, recv, t2));
|
||||
let conn = conn.clone();
|
||||
let registry = registry.clone();
|
||||
tokio::spawn(read_one_uni_stream(remote, recv, t2, conn, registry));
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-stream worker for T2. Reads fixed-size `QuicMessage`s back-to-back,
|
||||
/// awaits backpressure on the T2 channel, and resets the stream on a decode
|
||||
/// failure (one corrupt stream shouldn't take down the whole connection).
|
||||
async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sender) {
|
||||
async fn read_one_uni_stream(
|
||||
remote: SocketAddr,
|
||||
mut recv: RecvStream,
|
||||
t2: T2Sender,
|
||||
conn: Connection,
|
||||
registry: ConnectionRegistry,
|
||||
) {
|
||||
let stream_id: StreamId = recv.id();
|
||||
let mut buf = [0u8; QuicMessage::WIRE_SIZE];
|
||||
let mut count: u64 = 0;
|
||||
@@ -198,6 +325,7 @@ async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sen
|
||||
Ok(msg) => {
|
||||
count += 1;
|
||||
counter!("substrate_received_total", "tier" => "t2").increment(1);
|
||||
ensure_registered(®istry, msg.device_id, &conn);
|
||||
if t2.send(msg).await.is_err() {
|
||||
// T2 receiver dropped (substrate shutting down).
|
||||
tracing::warn!(
|
||||
@@ -236,115 +364,107 @@ async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sen
|
||||
}
|
||||
}
|
||||
|
||||
/// T3 — accept bidirectional streams. Each stream is one command/ack
|
||||
/// exchange, modeled per the paper's "per-command oneshot channels": the
|
||||
/// reader pushes a `T3Inbound { command, reply }` to the ECS, awaits the
|
||||
/// response on `reply_rx`, and writes it back on the same stream.
|
||||
async fn accept_bi_streams(conn: Connection, t3: T3Sender) {
|
||||
let remote = conn.remote_address();
|
||||
let mut streams_accepted: u64 = 0;
|
||||
/// T3 outbound drain — the substrate side of the actuator-command path.
|
||||
///
|
||||
/// Pops `OutboundT3` items the ECS produced, looks up the target device's
|
||||
/// connection in the registry, and **spawns one tokio task per command** to
|
||||
/// do the actual `open_bi() → write → finish → read_ack` round-trip. The
|
||||
/// drain task itself never blocks on a per-command await, so a single stuck
|
||||
/// `read_exact` (e.g. peer dropping mid-stream while Quinn's idle timeout
|
||||
/// counts down) cannot stall the pipeline.
|
||||
///
|
||||
/// Per-stream task records `substrate_latency_us{tier="t3"}` from
|
||||
/// `open_bi()` start to ack-receipt and increments
|
||||
/// `substrate_received_total{tier="t3"}` on success.
|
||||
///
|
||||
/// Per-`(device, sensor)` sequence numbers are owned here so the wire-level
|
||||
/// concerns stay out of the ECS.
|
||||
async fn drain_outbound_t3(registry: ConnectionRegistry, mut rx: mpsc::Receiver<OutboundT3>) {
|
||||
let mut seq_by_target: HashMap<(Uuid, u16), u32> = HashMap::new();
|
||||
|
||||
loop {
|
||||
let (send, recv) = match conn.accept_bi().await {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
while let Some(cmd) = rx.recv().await {
|
||||
let conn = match registry.read().unwrap().get(&cmd.target_device).cloned() {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
counter!("substrate_t3_outbound_no_route_total").increment(1);
|
||||
tracing::debug!(
|
||||
?remote,
|
||||
streams_accepted,
|
||||
error = %e,
|
||||
"T3 bi accept loop ended"
|
||||
device = %cmd.target_device,
|
||||
"outbound T3: no route, dropping"
|
||||
);
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
streams_accepted += 1;
|
||||
let t3 = t3.clone();
|
||||
tokio::spawn(read_one_bi_stream(remote, send, recv, t3));
|
||||
|
||||
let key = (cmd.target_device, cmd.sensor_id);
|
||||
let seq = {
|
||||
let s = seq_by_target.entry(key).or_insert(0);
|
||||
let v = *s;
|
||||
*s = s.wrapping_add(1);
|
||||
v
|
||||
};
|
||||
|
||||
let msg = QuicMessage {
|
||||
device_id: cmd.target_device,
|
||||
sensor_id: cmd.sensor_id,
|
||||
raw_value: cmd.raw_value,
|
||||
timestamp_us: now_us(),
|
||||
sequence_number: seq,
|
||||
sensor_type: cmd.sensor_type,
|
||||
};
|
||||
|
||||
// One task per command. Concurrent in-flight bi-streams are
|
||||
// first-class in QUIC, and this keeps the channel-drain loop hot.
|
||||
tokio::spawn(async move {
|
||||
let started = Instant::now();
|
||||
match send_outbound_t3(&conn, &msg).await {
|
||||
Ok(ack) => {
|
||||
let elapsed_us = started.elapsed().as_micros() as f64;
|
||||
histogram!("substrate_latency_us", "tier" => "t3").record(elapsed_us);
|
||||
counter!("substrate_received_total", "tier" => "t3").increment(1);
|
||||
tracing::trace!(
|
||||
device = %msg.device_id,
|
||||
sensor_id = msg.sensor_id,
|
||||
raw = msg.raw_value,
|
||||
ack_raw = ack.raw_value,
|
||||
elapsed_us,
|
||||
"outbound T3 completed"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
counter!("substrate_t3_outbound_errors_total").increment(1);
|
||||
tracing::warn!(
|
||||
device = %msg.device_id,
|
||||
sensor_id = msg.sensor_id,
|
||||
error = %e,
|
||||
"outbound T3 failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
tracing::info!("outbound T3 drain task exited");
|
||||
}
|
||||
|
||||
/// Per-stream worker for T3. Reads exactly one command, ships it with a
|
||||
/// `oneshot::Sender` to the ECS, awaits the reply, writes it back. If the
|
||||
/// ECS drops the oneshot (no handler installed), the stream is reset so the
|
||||
/// client sees an explicit reset instead of a half-open stream.
|
||||
async fn read_one_bi_stream(
|
||||
remote: SocketAddr,
|
||||
mut send: SendStream,
|
||||
mut recv: RecvStream,
|
||||
t3: T3Sender,
|
||||
) {
|
||||
let stream_id: StreamId = recv.id();
|
||||
/// Single substrate-initiated T3 round-trip: open bi-stream, write command,
|
||||
/// finish send half, read 39-byte ack, decode.
|
||||
async fn send_outbound_t3(conn: &Connection, cmd: &QuicMessage) -> anyhow::Result<QuicMessage> {
|
||||
let (mut send, mut recv) = conn.open_bi().await.context("open_bi for outbound T3")?;
|
||||
send.write_all(&cmd.to_bytes())
|
||||
.await
|
||||
.context("write outbound T3 command")?;
|
||||
send.finish().context("finish outbound T3 send half")?;
|
||||
|
||||
let mut buf = [0u8; QuicMessage::WIRE_SIZE];
|
||||
if let Err(e) = recv.read_exact(&mut buf).await {
|
||||
tracing::trace!(
|
||||
?remote,
|
||||
?stream_id,
|
||||
error = %e,
|
||||
"T3: incomplete command read; closing"
|
||||
);
|
||||
return;
|
||||
}
|
||||
let command = match QuicMessage::decode(&buf) {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
counter!("substrate_decode_errors_total", "tier" => "t3").increment(1);
|
||||
tracing::warn!(
|
||||
?remote,
|
||||
?stream_id,
|
||||
error = %e,
|
||||
"T3 command decode failed; resetting stream"
|
||||
);
|
||||
let _ = recv.stop(0u32.into());
|
||||
let _ = send.reset(0u32.into());
|
||||
return;
|
||||
}
|
||||
};
|
||||
counter!("substrate_received_total", "tier" => "t3").increment(1);
|
||||
|
||||
let (reply_tx, reply_rx) = oneshot::channel::<QuicMessage>();
|
||||
let inbound = T3Inbound {
|
||||
command,
|
||||
reply: reply_tx,
|
||||
};
|
||||
if t3.send(inbound).await.is_err() {
|
||||
tracing::warn!(?remote, ?stream_id, "T3 channel closed; abandoning command");
|
||||
let _ = send.reset(0u32.into());
|
||||
return;
|
||||
}
|
||||
|
||||
let response = match reply_rx.await {
|
||||
Ok(msg) => msg,
|
||||
Err(_) => {
|
||||
// ECS dropped the oneshot. With M4's handler installed this
|
||||
// shouldn't happen normally; if it does, the stream is reset so
|
||||
// the client sees a clean signal.
|
||||
counter!("substrate_t3_no_handler_total").increment(1);
|
||||
tracing::debug!(
|
||||
?remote,
|
||||
?stream_id,
|
||||
"T3: no handler for command, resetting stream"
|
||||
);
|
||||
let _ = send.reset(0u32.into());
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = send.write_all(&response.to_bytes()).await {
|
||||
tracing::warn!(
|
||||
?remote,
|
||||
?stream_id,
|
||||
error = %e,
|
||||
"T3 ack write failed"
|
||||
);
|
||||
return;
|
||||
}
|
||||
if let Err(e) = send.finish() {
|
||||
tracing::warn!(
|
||||
?remote,
|
||||
?stream_id,
|
||||
error = %e,
|
||||
"T3 ack finish failed"
|
||||
);
|
||||
}
|
||||
recv.read_exact(&mut buf)
|
||||
.await
|
||||
.context("read outbound T3 ack")?;
|
||||
QuicMessage::decode(&buf).context("decode outbound T3 ack")
|
||||
}
|
||||
|
||||
fn now_us() -> u64 {
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.map(|d| d.as_micros() as u64)
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user