From d3f09ee0627791b796dfebef38669c62e5d70054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Val=C3=A8re=20Plantevin?= Date: Tue, 12 May 2026 11:21:40 -0400 Subject: [PATCH] First test kinda working --- CLAUDE.md | 83 +++- Makefile | 36 +- config.toml | 4 + dashboards/runtime.json | 148 ++++++++ dashboards/sensors.json | 272 +++++++++++++ data/local/cross_tier.csv | 2 + data/local/scaling.csv | 10 + monitoring/docker-compose.yml | 56 +++ .../provisioning/dashboards/provider.yml | 13 + .../provisioning/datasources/datasource.yml | 13 + monitoring/victoria-metrics/scrape.yml | 14 + scripts/bench-scaling.sh | 248 ++++++++++++ scripts/demo.sh | 222 +++++++++++ simulator/Cargo.toml | 16 +- simulator/src/client.rs | 189 ++++++++++ simulator/src/emitters.rs | 147 ++++++++ simulator/src/lib.rs | 12 + simulator/src/main.rs | 321 +++++++++++++++- simulator/src/profile.rs | 88 +++++ simulator/tests/end_to_end_t1.rs | 139 +++++++ simulator/tests/end_to_end_t2.rs | 163 ++++++++ simulator/tests/end_to_end_t3.rs | 152 ++++++++ substrate/Cargo.toml | 7 +- substrate/src/config.rs | 14 + substrate/src/lib.rs | 4 + substrate/src/main.rs | 21 +- substrate/src/observability.rs | 116 ++++++ substrate/src/transport/ecs.rs | 141 ++++--- substrate/src/transport/mod.rs | 216 ++++++++++- substrate/src/transport/server.rs | 356 +++++++++++++++++- substrate/src/transport/state.rs | 13 + substrate/src/world/components.rs | 97 +++++ substrate/src/world/mod.rs | 52 +++ substrate/src/world/resources.rs | 48 +++ substrate/src/world/systems.rs | 278 ++++++++++++++ substrate/src/world/tests.rs | 294 +++++++++++++++ 36 files changed, 3903 insertions(+), 102 deletions(-) create mode 100644 dashboards/runtime.json create mode 100644 dashboards/sensors.json create mode 100644 data/local/cross_tier.csv create mode 100644 data/local/scaling.csv create mode 100644 monitoring/docker-compose.yml create mode 100644 monitoring/grafana/provisioning/dashboards/provider.yml create mode 100644 
monitoring/grafana/provisioning/datasources/datasource.yml create mode 100644 monitoring/victoria-metrics/scrape.yml create mode 100755 scripts/bench-scaling.sh create mode 100755 scripts/demo.sh create mode 100644 simulator/src/client.rs create mode 100644 simulator/src/emitters.rs create mode 100644 simulator/src/lib.rs create mode 100644 simulator/src/profile.rs create mode 100644 simulator/tests/end_to_end_t1.rs create mode 100644 simulator/tests/end_to_end_t2.rs create mode 100644 simulator/tests/end_to_end_t3.rs create mode 100644 substrate/src/lib.rs create mode 100644 substrate/src/observability.rs create mode 100644 substrate/src/transport/state.rs create mode 100644 substrate/src/world/components.rs create mode 100644 substrate/src/world/mod.rs create mode 100644 substrate/src/world/resources.rs create mode 100644 substrate/src/world/systems.rs create mode 100644 substrate/src/world/tests.rs diff --git a/CLAUDE.md b/CLAUDE.md index 47d7690..ed4a4af 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,13 +13,15 @@ Source repo for **"QUIC + ECS as Complementary Transport and Runtime Substrates Three-tier QUIC ↔ ECS bridge, headless Bevy runtime: -| Tier | QUIC primitive | Use case | Channel cap | -|------|----------------|----------|-------------| -| T1 | Unreliable datagrams (RFC 9221) | High-freq ephemeral telemetry; drops OK | 1024, lossy backpressure | -| T2 | Unidirectional streams | Ordered threshold events; reliable | 512, fully drained | -| T3 | Bidirectional streams | Actuator commands w/ ACK | 256, fully drained | +| Tier | QUIC primitive | Use case | Channel cap | Tx newtype | +|------|----------------|----------|-------------|------------| +| T1 | Unreliable datagrams (RFC 9221) | High-freq ephemeral telemetry; drops OK | 1024 | `T1Sender::send_lossy` (try_send, drop on full) | +| T2 | Unidirectional streams | Ordered threshold events; reliable | 512 | `T2Sender::send` (await, backpressure) | +| T3 | Bidirectional streams | Actuator commands w/ ACK; 
per-command oneshot reply | 256 | `T3Sender::send` of `T3Inbound { command, reply }` | -QUIC server runs on a dedicated OS thread with a Tokio multi-thread runtime; pushes decoded `QuicMessage` (UUID + stream_id + f64 + ts + seq) into `tokio::sync::mpsc` per tier; Bevy `IngestSystem` drains in `PreUpdate`. Pattern is in [substrate/src/transport/ecs.rs](substrate/src/transport/ecs.rs). +QUIC server runs on a dedicated OS thread with a Tokio multi-thread runtime; pushes decoded `QuicMessage` (UUID + sensor_id + f64 + ts + seq, plus a `sensor_type: u8` tag; 39 B fixed LE) into `tokio::sync::mpsc` per tier via the `T1Sender / T2Sender / T3Sender` newtypes (in [substrate/src/transport/mod.rs](substrate/src/transport/mod.rs)) so misuse is a type error. Bevy `ingest_system` drains in `PreUpdate`, gated by `run_if(in_state(ServerState::Started))`. Pattern is in [substrate/src/transport/ecs.rs](substrate/src/transport/ecs.rs). + +**T3 ack protocol.** A device opens a bi-stream and writes one `QuicMessage` (the command). The demux task reads it, builds a `T3Inbound { command, reply: oneshot::Sender }`, and sends it on the T3 mpsc. The ECS handler writes the ack into `reply`; the demux task awaits `reply_rx` and writes the resulting `QuicMessage` back on the bi-stream. Dropping the oneshot signals "no handler" and propagates as a stream close — used by the placeholder ingest until M4 installs real handlers. **Target hardware:** CM5 (BCM2712, Cortex-A76, 4 GB) as DT runtime; M4 Max as traffic generator; 1 Gbps direct Ethernet. Both rigs are in hand. 
@@ -49,13 +51,18 @@ quic_ecs_dt/ | `AppConfig` figment loader (defaults → TOML → env) | Done — [substrate/src/config.rs:42](substrate/src/config.rs#L42) | | 3-tier MPSC bridge scaffolding (Tokio thread + Bevy plugin) | Done — [substrate/src/transport/ecs.rs](substrate/src/transport/ecs.rs) | | `QuicMessage` struct (no codec yet) | Defined — [substrate/src/transport/mod.rs:4](substrate/src/transport/mod.rs#L4) | -| Quinn server (accept loop, demux, decode) | **Empty stub** — [substrate/src/transport/server.rs:4](substrate/src/transport/server.rs#L4) | -| TLS / self-signed cert | Done (M1) — `certs/server.{crt,key}` via `make certs`, gitignored | -| Wire codec for `QuicMessage` (38 B fixed LE) | Done (M1) — [substrate/src/transport/mod.rs:35](substrate/src/transport/mod.rs#L35); 4 unit tests passing | +| Quinn server lifecycle | Listener up — `ServerState{Starting,Started}` in [substrate/src/transport/state.rs](substrate/src/transport/state.rs); `OnEnter(Starting)` → bind + accept loop in [substrate/src/transport/ecs.rs](substrate/src/transport/ecs.rs). Explicit `TransportConfig` w/ tuned datagram recv buffer (256 KiB) in [substrate/src/transport/server.rs](substrate/src/transport/server.rs). 
Per-tier sender newtypes (`T1Sender::send_lossy`, `T2Sender::send`, `T3Sender::send`) in [substrate/src/transport/mod.rs](substrate/src/transport/mod.rs) | +| T1 demux (datagrams → ECS) | Done — `handle_incoming` orchestrator + `read_datagrams` reader in [substrate/src/transport/server.rs](substrate/src/transport/server.rs); decode errors logged but non-fatal; channel-full drops silent at trace; received/dropped/decode_errors counters in the end-of-stream debug line | +| T2 demux (uni streams → ECS) | Done — `read_uni_streams` accepts streams in [substrate/src/transport/server.rs](substrate/src/transport/server.rs), spawns one task per stream that reads 39 B chunks until EOF; decode failure resets the stream via `recv.stop(0)` (one bad stream doesn't kill the connection); `t2.send().await` honours backpressure | +| T3 demux (bi streams ↔ ECS) | Done — `accept_bi_streams` + `read_one_bi_stream` in [substrate/src/transport/server.rs](substrate/src/transport/server.rs); reads 39 B command, ships `T3Inbound { command, reply: oneshot::Sender }` to the ECS, awaits the reply, writes 39 B ack and finishes. If the ECS drops the oneshot (no handler installed yet — the M4 placeholder) `send.reset(0)` gives the client a clean signal instead of a half-open stream. `handle_incoming` joins all three readers on close | +| TLS / self-signed cert | Done (M1) — `certs/server.{crt,key}` via `make certs`, gitignored. PEM loader in [substrate/src/transport/server.rs:15](substrate/src/transport/server.rs#L15); rustls `aws-lc-rs` default provider installed in [substrate/src/main.rs](substrate/src/main.rs) | +| Wire codec for `QuicMessage` (39 B fixed LE, incl. `sensor_type: u8`) | Done — [substrate/src/transport/mod.rs](substrate/src/transport/mod.rs); 5 unit tests passing. 
`SensorType` enum: `Generic / Temperature / Humidity / Pressure / Voltage / Current` | | `tracing-subscriber` init w/ `RUST_LOG` | Done (M1) — [substrate/src/main.rs:8-12](substrate/src/main.rs#L8-L12) | -| ECS components (`RawSensorData`) + 5 systems (Ingest/Sim/Export/FaultInjection/Diagnostics) | Missing — placeholder at [substrate/src/transport/ecs.rs:26](substrate/src/transport/ecs.rs#L26) | -| VictoriaMetrics + Grafana export | Missing | -| Simulator (Quinn client + sensor generators) | `Hello, world!` — [simulator/src/main.rs](simulator/src/main.rs) | +| ECS components (`RawSensorData`, `SmoothedValue`) + 5 systems (Ingest/Sim/Export/FaultInjection/Diagnostics) | Done — entities = `(DeviceId, SensorId, SensorTypeTag, RawSensorData, SmoothedValue, Asset)` per (device, sensor); `SensorRegistry` upserts via `HashMap<(Uuid, u16), Entity>` in [substrate/src/world/](substrate/src/world/). `IngestSystem` drains all three tiers; T3 ack preserves command's `sensor_type` and returns the device's most recent `raw_value`. `SimulationSystem` maintains a 16-sample rolling mean per entity and emits `substrate_threshold_crossings_total{type, direction}` when the smoothed mean crosses a per-type threshold (`Changed` query so cost scales with ingress, not fleet size). `ExportSystem` samples `substrate_{entities,channel_depth,channel_capacity,rss_bytes}` + `sensor_aggregate{type, stat}` once per second. `FaultInjection` is still a stub awaiting M6. `Diagnostics` logs `tick_hz` once per second | +| Schedule rate-gating | Done (M4) — `MinimalPlugins.set(ScheduleRunnerPlugin::run_loop(1/tick_rate_hz))` in [substrate/src/main.rs](substrate/src/main.rs); replaces the default busy-loop with the configured period | +| Prometheus exporter + Grafana dashboards | Done (M5) — `ObservabilityPlugin` in [substrate/src/observability.rs](substrate/src/observability.rs) installs `metrics-exporter-prometheus` on the existing tokio runtime. 
**Runtime surface** (paper §Evaluation): counters `substrate_received_total{tier}`, `dropped_total{tier=t1}`, `decode_errors_total{tier}`, `t3_no_handler_total`; latency histograms `substrate_latency_us{tier}`; gauges `substrate_tick_hz`, `substrate_entities`, `substrate_channel_depth{tier}`, `substrate_channel_capacity{tier}`, `substrate_rss_bytes`. **Sensor data surface** (operator dashboard): per-type aggregates `sensor_aggregate{type, stat=count|mean|min|max}` computed once per second over the live world, cardinality bounded by `\|SensorType\| × 4` so it scales to thousands of sensors. Two dashboards: [dashboards/runtime.json](dashboards/runtime.json) and [dashboards/sensors.json](dashboards/sensors.json) (thermometer/gauge/stat panels per type) | +| Simulator (Quinn client + sensor generators) | `SimulatorClient` lib in [simulator/src/client.rs](simulator/src/client.rs) — connects, trusts the substrate's PEM cert via custom `ServerCertVerifier` (sidesteps `CaUsedAsEndEntity`); `send_datagram(QuicMessage)` for T1, `send_uni_stream(&[QuicMessage])` for T2, `request(&QuicMessage) -> QuicMessage` for T3. CLI driver in [simulator/src/main.rs](simulator/src/main.rs) with clap flags (`--addr`, `--rate-hz`, `--t2-rate-hz`, `--t3-rate-hz`, `--t3-timeout-ms`, `--count`, `--devices`, `--sensor-id`, `--sensor-type`, `--profile`, `--cert`, `--server-name`); parallel T1+T2+T3 emitters, per-(device,sensor) sequence counters, type-appropriate waveform generators (sin/cos curves centred on realistic sensor ranges), 1-Hz combined progress logs, Ctrl-C drain. `--profile industrial` fans out to 5 sensors per device (Temperature/Humidity/Pressure/Voltage/Current). 
Bevy-driven sensor generator still pending | +| End-to-end test harness | Six integration tests across [simulator/tests/end_to_end_t1.rs](simulator/tests/end_to_end_t1.rs), [simulator/tests/end_to_end_t2.rs](simulator/tests/end_to_end_t2.rs), [simulator/tests/end_to_end_t3.rs](simulator/tests/end_to_end_t3.rs): T1 single-datagram round-trip + 32-msg burst order; T2 single-stream order-preservation + 4-stream concurrent per-device ordering; T3 round-trip with fake-ECS handler + no-handler stream-reset. Each test calls `bind_endpoint` + `accept_loop` in-process with channels owned by the test | | `config.toml` at repo root | Done (M1) — [config.toml](config.toml); loaded by [substrate/src/main.rs:9](substrate/src/main.rs#L9) | | Benchmark harness (sweep + CSV writer) | Missing | | CM5 cross-compile / deploy | Wired in [Makefile:30](Makefile#L30); not exercised | @@ -67,10 +74,14 @@ quic_ecs_dt/ Each milestone has one verification gate. Update Status here as we go. - **M1 — Wire codec & root config.** ✅ Done 2026-05-04. Hand-rolled little-endian codec on `QuicMessage` (38 B fixed: 16 UUID + 2 stream_id + 8 f64 + 8 ts_us + 4 seq) with roundtrip + layout + length-error tests; `config.toml` at repo root; dev TLS via `make certs`; structured `tracing-subscriber` init reads `RUST_LOG` (default `info`). -- **M2 — Quinn server + self-signed TLS.** Fill [substrate/src/transport/server.rs](substrate/src/transport/server.rs): `Endpoint::server`, accept loop, demux T1=datagrams / T2=uni / T3=bi, push into matching `mpsc::Sender`. Use `rcgen` for a dev cert at boot. *Verify:* a Quinn smoke client connects, server logs handshake. -- **M3 — Simulator client.** Replace [simulator/src/main.rs](simulator/src/main.rs) with a Bevy app: Quinn client, N synthetic devices, configurable per-tier rates. *Verify:* end-to-end loopback drains messages on all three tiers. 
-- **M4 — ECS world.** Define `RawSensorData` and the 5 systems the paper names (`FaultInjectionSystem`, `IngestSystem`, `SimulationSystem`, `ExportSystem`, `DiagnosticsSystem`). Wire `IngestSystem` into the existing `PreUpdate` slot. *Verify:* with 10k simulated devices, entity count stabilizes; `DiagnosticsSystem` logs steady tick rate. -- **M5 — Observability (VictoriaMetrics + Grafana).** Substrate exposes Prometheus-format `/metrics` (use `metrics` + `metrics-exporter-prometheus`): tick rate, RSS, per-tier P50/P99/P999, channel depth, drop count. Commit a Grafana dashboard JSON. *Verify:* `curl :PORT/metrics` returns labeled samples; dashboard renders against VM. +- **M2 — Quinn server + self-signed TLS.** ✅ Done 2026-05-06. Listener up under `ServerState::Starting/Started`; type-system tier semantics + T3 oneshot ack protocol; per-connection `handle_incoming` orchestrator joining T1 datagram, T2 uni-stream, and T3 bi-stream readers. T1 has dropped/decoded counters; T2 resets a stream on decode failure without killing the connection; T3 ships `T3Inbound { command, reply }` to the ECS and resets the stream when no handler answers. End-to-end coverage: 6 integration tests in [simulator/tests/](simulator/tests/) plus 4 codec unit tests, all green. +- **M3 — Simulator client.** Replace [simulator/src/main.rs](simulator/src/main.rs) with a Bevy app: Quinn client, N synthetic devices, configurable per-tier rates. *Verify:* end-to-end loopback drains messages on all three tiers. **Status (2026-05-05):** simulator made into a lib + bin; `SimulatorClient::{connect,send_datagram,close}` plus a manual smoke runner in `simulator/src/main.rs`. Two integration tests in `simulator/tests/end_to_end_t1.rs` exercise the full T1 path against an in-process substrate. Bevy-driven generator + T2/T3 helpers + load profiles still pending. +- **M4 — ECS world.** ✅ Done. 
`Asset` + `DeviceId` + `SensorId` + `SensorTypeTag` + `RawSensorData` + `SmoothedValue` components in [substrate/src/world/](substrate/src/world/); `SensorRegistry` resource for O(1) `(Uuid, u16) → Entity`. `IngestSystem` drains all three tiers (T1 batched, T2/T3 fully); T3 handler returns the latest sensor value as ack. `SimulationSystem` runs a per-entity 16-sample rolling mean and emits `substrate_threshold_crossings_total{type, direction}` on per-type threshold crossings — gives the ECS observable digital-twin work, not just write-through ingest. `ExportSystem` samples `substrate_{entities,channel_depth,channel_capacity,rss_bytes}` + `sensor_aggregate{type, stat}` once per second. `FaultInjection` still a stub (M6). `DiagnosticsSystem` logs tick rate once per second. Schedule rate-gated via `ScheduleRunnerPlugin::run_loop(1/tick_rate_hz)`. 8 unit tests passing (entity create, in-place update, T3 ack, SmoothedValue push/window/non-finite/full-roll, threshold-crossing transition). +- **M5 — Observability (VictoriaMetrics + Grafana).** ✅ Done. Wire format extended to carry `sensor_type: u8` (38 → 39 B, decoded into `SensorType` enum). Two metric surfaces over `metrics-exporter-prometheus`: + - **Runtime** (paper §Evaluation): `substrate_received_total{tier}`, `dropped_total{tier=t1}`, `decode_errors_total{tier}`, `t3_no_handler_total`, `latency_us{tier}` histograms, `tick_hz` / `entities` / `channel_depth{tier}` / `rss_bytes` gauges. + - **Sensor data** (operator surface): `sensor_aggregate{type, stat=count|mean|min|max}` aggregated per second across the live ECS world. Cardinality bounded to `\|SensorType\| × 4` series independent of physical sensor count. + - Dashboards: [dashboards/runtime.json](dashboards/runtime.json) + [dashboards/sensors.json](dashboards/sensors.json). + - Verified: `--profile industrial --devices 2 --count 200` yields 10 entities and all 5 type aggregates with realistic values (T=20.5°C, RH=51%, P=1018 hPa, V=230.2 V, I=12 A). 
- **M6 — Benchmark harness.** Sweep `entity_count ∈ {10k, 50k, 100k, 200k}` × `loss_rate ∈ {0%, 1%, 5%}` with 2k warmup + 5k measurement ticks. Loss via `tc netem` or in-app injection. Writes `data/loopback/final_table.csv`. *Verify:* one full sweep on M4 Max produces a CSV the Quarto figures consume. - **M7 — CM5 cross-compile & deploy.** Exercise [Makefile:30](Makefile#L30) (`build-cm5`, `deploy-cm5`); set real `CM5_HOST`. *Verify:* binary runs on CM5 with a feed from M4 Max over 1 Gbps Ethernet. - **M8 — Two-machine run + paper render.** Sweep with simulator on M4 Max → substrate on CM5; populate `data/two_machine/final_table.csv`; `make render` produces a PDF. **Update §Evaluation prose to reflect actual numbers.** Current paper figures (241 Hz, 64 µs / 15.8 ms P99, 2.6 µs jitter, 1.02 MB/1k, R²=0.9999) are **aspirational placeholders** — they may move and the conclusions may shift; that's expected. @@ -85,6 +96,17 @@ Each milestone has one verification gate. Update Status here as we go. - **Paper:** Quarto + LNCS template ([paper/_extensions/template.tex](paper/_extensions/template.tex), [paper/_quarto.yml](paper/_quarto.yml)). **Never commit `llncs.cls` or `splncs04.bst`** — CTAN licensing; download per [README.md:25-34](README.md#L25-L34). - **Data:** raw CSVs under `data/` are committed; `*_processed.csv` is gitignored. Paper figures consume `data/loopback/final_table.csv` and `data/two_machine/final_table.csv`. - **Build artifacts:** `target/`, `paper/_output/`, `paper/figures/`, `paper/.quarto/`, `paper/index.tex` all gitignored. +- **Errors:** `anyhow` (with `.context()`) for internal startup paths where the error type is uninteresting; `thiserror` for boundary types we want to match against (e.g. `WireError` in the codec). +- **Warnings:** let real warnings show. No `#[allow(dead_code)]`, `_var` blanket suppression, or `PhantomData` shims to silence the compiler — warnings are honest TODO markers and disappear when the consuming code lands. 
See [feedback memory](../../.claude/projects/-Users-vplantevin-Projects-Research-quic-ecs-dt/memory/feedback_no_warning_hacks.md). + +## Known deferrals + +- **Channel ownership is per-host, not per-connection.** All connections share the same three mpsc channels. Fairness under N-device load relies on tokio scheduling. Acceptable for the "one ECS world per host" model the paper describes; revisit if many-device benchmarks show starvation. +- **No graceful shutdown.** The `quic-runtime` thread is parked on `pending()`; spawned tasks (accept loop, per-conn demux) are orphaned at process exit. Fine for research runs; we'll need an `OnExit(Started)` (or a `Stopping` state) when M5 observability needs clean drain or M8 wants finalised CSV writes. +- **Bind failure is fatal.** `OnEnter(Starting)` panics if `bind_endpoint` fails. A `ServerState::Failed` variant joins when we wire proper error surfacing. +- **T3 ack semantics are minimal.** The current handler echoes the device's most recent `raw_value` with a server timestamp — adequate for "read sensor" commands, not for actuator-write semantics. A future iteration may introduce an `ActuatorState` component and a setpoint-apply path; for now T3 is best framed as "reliable read/query RPC" in the paper. +- **`FaultInjectionSystem` is still empty.** Runs on schedule but does nothing. M6 fills it with rate-controlled in-app drop so loss sweeps don't depend on external `tc netem`. +- **Schedule rate-gating is approximate.** `ScheduleRunnerPlugin::run_loop(period)` honours `period` as a minimum; observed `tick_hz` runs ~85% of target on macOS dev (target 60 → ~50). Should be tighter on the CM5; revisit if M6 sweeps depend on a steady tick. ## Run / verify @@ -100,6 +122,35 @@ make clean # cargo clean + drop generated paper outputs `certs/` is gitignored; `make build` regenerates the dev cert if missing. From the repo root: `cargo run -p substrate` boots, prints the loaded `AppConfig`, and idles. 
`config.toml` and cert paths are resolved relative to the cwd — always launch from the repo root. +**Tests.** `cargo test --workspace` runs the codec unit tests in `substrate` plus the end-to-end integration tests in [simulator/tests/](simulator/tests/). Each integration test calls `bind_endpoint` + `accept_loop` in-process on `127.0.0.1:0` (OS-assigned port), connects a `SimulatorClient` against it, and asserts what arrives on the test-owned T1 receiver. Add a new `simulator/tests/end_to_end_*.rs` for each new wire path (T2 uni, T3 bi) as the substrate-side demux lands. + +**Metrics scrape.** With `metrics_enabled = true` (default), the substrate exposes a Prometheus-format endpoint: + +```bash +curl http://127.0.0.1:9100/metrics +``` + +A docker-compose stack under [monitoring/](monitoring/) brings up VictoriaMetrics + Grafana auto-provisioned: `make monitoring-up` then Grafana at http://localhost:3000 (admin / admin), both dashboards under the `quic_ecs_dt` folder. The compose mounts [dashboards/](dashboards/) directly so any edit to the JSON files re-imports within 10 s. + +Two Grafana dashboards under [dashboards/](dashboards/): + +- [`runtime.json`](dashboards/runtime.json) — tick rate, RSS, per-tier received/dropped/latency, channel depth (paper §Evaluation surface). +- [`sensors.json`](dashboards/sensors.json) — thermometer + gauges + stat panels per `SensorType`, driven by `sensor_aggregate{type, stat}` (operator-facing surface). + +Both use the `${datasource}` template variable so you can point them at any Prometheus-compatible source. 
+ +**Manual two-process run.** From the repo root, in two shells: + +```bash +# shell 1 — server (use RUST_LOG=substrate=debug to see the per-conn summary) +cargo run -p substrate + +# shell 2 — client; --help shows all flags +cargo run -p simulator -- --rate-hz 100 --count 0 --devices 4 +``` + +Simulator flags (see `cargo run -p simulator -- --help`): `--addr`, `--server-name`, `--cert`, `--rate-hz` (T1 datagram rate; `0` disables T1), `--t2-rate-hz` / `--t3-rate-hz` (per-tier event rate; `0` disables), `--t3-timeout-ms` (T3 ack wait, default `2000`), `--count` (T1 count; `0` = until Ctrl-C), `--devices`, `--sensor-id`, `--sensor-type` (one of `generic|temperature|humidity|pressure|voltage|current`), `--profile` (`single` or `industrial` — 5 sensors per device on ids 0..4 covering all types). The client logs a one-second `progress` line with `t1_sent`/`t2_sent`/`t3_sent`/`t3_timeouts`/per-tier observed Hz, and a final `simulator done` line with elapsed time on exit. + ## Key references - Prior self-citations: `plantevin2026ecs`, `plantevin2026quic` (both IEEE SWC 2026, "to appear"). 
diff --git a/Makefile b/Makefile index 6a02b68..7439ec4 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,19 @@ # ============================================================ # quic_ecs_dt — top-level Makefile # Targets: -# make render — build the paper PDF -# make preview — live-reload preview in browser -# make build — cargo build --release (native) -# make build-cm5 — cargo build --release (aarch64 cross) -# make clean — remove generated outputs +# make demo — one-shot: certs → build → VM+Grafana → +# substrate → simulator (Ctrl-C cleans up) +# make render — build the paper PDF +# make preview — live-reload preview in browser +# make build — cargo build --release (native) +# make build-cm5 — cargo build --release (aarch64 cross) +# make monitoring-up — start VictoriaMetrics + Grafana (docker) +# make monitoring-down — stop them +# make monitoring-logs — tail the monitoring stack +# make clean — remove generated outputs # ============================================================ -.PHONY: render preview build build-cm5 clean certs +.PHONY: render preview build build-cm5 clean certs monitoring-up monitoring-down monitoring-logs demo VENV := $(HOME)/.venv/quic_ecs PYTHON := $(VENV)/bin/python @@ -53,6 +58,25 @@ deploy-cm5: build-cm5 scp target/aarch64-unknown-linux-gnu/release/quic_ecs_dt \ $(CM5_USER)@$(CM5_HOST):$(CM5_BIN_DIR)/ +# One-shot demo runner — see scripts/demo.sh +demo: + @./scripts/demo.sh + +# Monitoring (VictoriaMetrics + Grafana, auto-provisioned) +monitoring-up: + docker compose -f monitoring/docker-compose.yml up -d + @echo "" + @echo "Grafana: http://localhost:3000 (admin / admin, or anonymous Admin)" + @echo " • runtime dashboard: quic_ecs_dt → quic_ecs_dt — substrate runtime" + @echo " • sensors dashboard: quic_ecs_dt → quic_ecs_dt — sensors" + @echo "VictoriaMetrics: http://localhost:8428" + +monitoring-down: + docker compose -f monitoring/docker-compose.yml down + +monitoring-logs: + docker compose -f monitoring/docker-compose.yml logs -f + 
# Clean clean: cargo clean diff --git a/config.toml b/config.toml index cc8b0ad..1d4a3b9 100644 --- a/config.toml +++ b/config.toml @@ -16,3 +16,7 @@ server_key = "certs/server.key" [simulation] tick_rate_hz = 60 max_entities = 10000 + +[observability] +metrics_enabled = true +metrics_listen = "0.0.0.0:9100" diff --git a/dashboards/runtime.json b/dashboards/runtime.json new file mode 100644 index 0000000..8ced9b2 --- /dev/null +++ b/dashboards/runtime.json @@ -0,0 +1,148 @@ +{ + "title": "quic_ecs_dt — substrate runtime", + "uid": "quic-ecs-dt-runtime", + "schemaVersion": 39, + "version": 1, + "timezone": "", + "refresh": "5s", + "time": { "from": "now-15m", "to": "now" }, + "tags": ["quic_ecs_dt", "ucami2026", "substrate"], + "templating": { + "list": [ + { + "name": "datasource", + "label": "Data source", + "type": "datasource", + "query": "prometheus", + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0 + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Tick rate (Hz)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "hertz", "decimals": 1 } }, + "targets": [ + { "expr": "substrate_tick_hz", "refId": "A", "legendFormat": "tick_hz" } + ] + }, + { + "id": 2, + "title": "Entities", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "targets": [ + { "expr": "substrate_entities", "refId": "A", "legendFormat": "entities" } + ] + }, + { + "id": 3, + "title": "RSS", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "bytes", "decimals": 1 } }, + "targets": [ + { "expr": "substrate_rss_bytes", "refId": "A", "legendFormat": "rss" } + ] + }, + { + 
"id": 4, + "title": "T3 — no handler events (cumulative)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "targets": [ + { "expr": "substrate_t3_no_handler_total", "refId": "A", "legendFormat": "no_handler" } + ] + }, + { + "id": 5, + "title": "Per-tier receive rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "cps" } }, + "targets": [ + { + "expr": "rate(substrate_received_total[1m])", + "refId": "A", + "legendFormat": "received {{tier}}" + }, + { + "expr": "rate(substrate_dropped_total[1m])", + "refId": "B", + "legendFormat": "dropped {{tier}}" + } + ] + }, + { + "id": 6, + "title": "Per-tier latency (µs)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "µs", "decimals": 0 } }, + "targets": [ + { + "expr": "substrate_latency_us{quantile=\"0.5\"}", + "refId": "A", + "legendFormat": "p50 {{tier}}" + }, + { + "expr": "substrate_latency_us{quantile=\"0.99\"}", + "refId": "B", + "legendFormat": "p99 {{tier}}" + }, + { + "expr": "substrate_latency_us{quantile=\"0.999\"}", + "refId": "C", + "legendFormat": "p999 {{tier}}" + } + ] + }, + { + "id": 7, + "title": "Channel depth (vs. 
capacity)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "targets": [ + { + "expr": "substrate_channel_depth", + "refId": "A", + "legendFormat": "depth {{tier}}" + }, + { + "expr": "substrate_channel_capacity", + "refId": "B", + "legendFormat": "capacity {{tier}}" + } + ] + }, + { + "id": 8, + "title": "Decode errors (rate)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "cps" } }, + "targets": [ + { + "expr": "rate(substrate_decode_errors_total[1m])", + "refId": "A", + "legendFormat": "decode_errors {{tier}}" + } + ] + } + ] +} diff --git a/dashboards/sensors.json b/dashboards/sensors.json new file mode 100644 index 0000000..057cc20 --- /dev/null +++ b/dashboards/sensors.json @@ -0,0 +1,272 @@ +{ + "title": "quic_ecs_dt — sensors", + "uid": "quic-ecs-dt-sensors", + "schemaVersion": 39, + "version": 1, + "timezone": "", + "refresh": "1s", + "time": { "from": "now-5m", "to": "now" }, + "tags": ["quic_ecs_dt", "ucami2026", "sensors"], + "templating": { + "list": [ + { + "name": "datasource", + "label": "Data source", + "type": "datasource", + "query": "prometheus", + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0 + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Temperature — mean (thermometer)", + "type": "gauge", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "options": { + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "vertical" + }, + "fieldConfig": { + "defaults": { + "unit": "celsius", + "decimals": 1, + "min": -20, + "max": 80, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "blue", "value": null }, + { 
"color": "green", "value": 10 }, + { "color": "yellow", "value": 30 }, + { "color": "orange", "value": 50 }, + { "color": "red", "value": 70 } + ] + } + } + }, + "targets": [ + { + "expr": "sensor_aggregate{type=\"temperature\", stat=\"mean\"}", + "refId": "A", + "legendFormat": "T mean" + } + ] + }, + { + "id": 2, + "title": "Humidity — mean", + "type": "gauge", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "options": { "showThresholdMarkers": true, "orientation": "vertical" }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "decimals": 1, + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "orange", "value": null }, + { "color": "green", "value": 30 }, + { "color": "blue", "value": 70 } + ] + } + } + }, + "targets": [ + { + "expr": "sensor_aggregate{type=\"humidity\", stat=\"mean\"}", + "refId": "A", + "legendFormat": "RH mean" + } + ] + }, + { + "id": 3, + "title": "Pressure — mean", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "options": { "graphMode": "area", "colorMode": "value" }, + "fieldConfig": { + "defaults": { + "unit": "pressurehpa", + "decimals": 1, + "min": 980, + "max": 1040, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "blue", "value": null }, + { "color": "green", "value": 1000 }, + { "color": "yellow", "value": 1025 } + ] + } + } + }, + "targets": [ + { + "expr": "sensor_aggregate{type=\"pressure\", stat=\"mean\"}", + "refId": "A", + "legendFormat": "P mean" + } + ] + }, + { + "id": 4, + "title": "Voltage — mean", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "options": { "graphMode": "area", "colorMode": "value" }, + "fieldConfig": { + "defaults": { + "unit": "volt", + "decimals": 2, + "min": 220, + "max": 240, + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { "color": "yellow", "value": null }, + { "color": "green", "value": 225 }, + { "color": "yellow", "value": 235 } + ] + } + } + }, + "targets": [ + { + "expr": "sensor_aggregate{type=\"voltage\", stat=\"mean\"}", + "refId": "A", + "legendFormat": "V mean" + } + ] + }, + { + "id": 5, + "title": "Current — mean", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "options": { "graphMode": "area", "colorMode": "value" }, + "fieldConfig": { + "defaults": { + "unit": "amp", + "decimals": 2, + "min": 0, + "max": 30, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 20 }, + { "color": "red", "value": 25 } + ] + } + } + }, + "targets": [ + { + "expr": "sensor_aggregate{type=\"current\", stat=\"mean\"}", + "refId": "A", + "legendFormat": "I mean" + } + ] + }, + { + "id": 6, + "title": "Sensor count by type", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "options": { "colorMode": "value", "graphMode": "none" }, + "targets": [ + { + "expr": "sensor_aggregate{stat=\"count\"}", + "refId": "A", + "legendFormat": "{{type}}" + } + ] + }, + { + "id": 7, + "title": "Temperature — min / mean / max over time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "celsius", "decimals": 1 } }, + "targets": [ + { + "expr": "sensor_aggregate{type=\"temperature\", stat=\"min\"}", + "refId": "A", + "legendFormat": "min" + }, + { + "expr": "sensor_aggregate{type=\"temperature\", stat=\"mean\"}", + "refId": "B", + "legendFormat": "mean" + }, + { + "expr": "sensor_aggregate{type=\"temperature\", stat=\"max\"}", + "refId": "C", + 
"legendFormat": "max" + } + ] + }, + { + "id": 8, + "title": "All sensor types — mean over time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short", "decimals": 2 } }, + "targets": [ + { + "expr": "sensor_aggregate{stat=\"mean\"}", + "refId": "A", + "legendFormat": "{{type}}" + } + ] + }, + { + "id": 9, + "title": "Threshold crossings (cumulative) — per type / direction", + "description": "Each time a sensor's smoothed mean crosses its per-type threshold, simulation_system increments the counter. up = rising through threshold; down = falling through. The counter being non-zero is the load-bearing evidence that the ECS runs the digital-twin transform — not just write-through ingest.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "targets": [ + { + "expr": "substrate_threshold_crossings_total", + "refId": "A", + "legendFormat": "{{type}} {{direction}}" + } + ] + }, + { + "id": 10, + "title": "Threshold crossings — rate (events/min)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "cpm" } }, + "targets": [ + { + "expr": "60 * rate(substrate_threshold_crossings_total[1m])", + "refId": "A", + "legendFormat": "{{type}} {{direction}}" + } + ] + } + ] +} diff --git a/data/local/cross_tier.csv b/data/local/cross_tier.csv new file mode 100644 index 0000000..a731270 --- /dev/null +++ b/data/local/cross_tier.csv @@ -0,0 +1,2 @@ +rate_hz,t3_rate_hz,devices,tick_rate_hz,window_s,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t3_received,t3_no_handler,t3_p50_us,t3_p99_us,t3_p999_us,tick_hz,rss_mb,channel_depth_max 
+100,100,100,0,25,2646,0,118.99720565324648,202.0065277946852,245.99224556720532,2646,0,120.98904580793433,199.99652925270829,238.0069829199846,15833.3,28.0,0 diff --git a/data/local/scaling.csv b/data/local/scaling.csv new file mode 100644 index 0000000..bdbd3c8 --- /dev/null +++ b/data/local/scaling.csv @@ -0,0 +1,10 @@ +rate_hz,devices,tick_rate_hz,window_s,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,tick_hz,rss_mb,channel_depth_max +100,100,0,25,2715,0,10287.656173771804,20683.6751522136,20899.90783549675,52.1,28.2,1 +500,100,0,25,13595,0,9945.744255905174,20441.042134756957,20879.018374063122,51.0,29.8,1 +1000,100,0,25,27324,0,9858.605678238058,20371.66060670275,20862.321838812768,51.6,30.3,2 +5000,100,0,25,136305,0,9700.182954474827,20144.770960915914,20803.98904149668,52.2,31.4,10 +10000,100,0,25,273443,0,9680.801975940145,20164.925807687836,20874.842987926906,51.9,31.9,10 +25000,100,0,25,685150,0,9466.362697231909,19813.128013911944,20766.575543347255,51.6,33.2,50 +50000,100,0,25,1371659,4515,9349.704574533685,19635.60989099387,20477.86914508828,51.5,33.3,100 +100000,100,0,25,2740689,1266351,13177.946960597013,20502.4573381096,28455.593524841766,53.0,35.2,200 +250000,100,0,25,6826035,5353528,16234.599694958577,20696.089081152582,22046.299162128806,53.2,35.6,747 diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000..48d4850 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,56 @@ +# VictoriaMetrics + Grafana for `quic_ecs_dt` local demos. +# +# Run from the repo root (or via `make monitoring-up`). The substrate runs on +# the host and exposes /metrics on :9100; VM scrapes it via +# `host.docker.internal`, which works on Docker Desktop (mac/Windows) and on +# recent Docker Engine on Linux thanks to the `extra_hosts` mapping below. 
+# +# Grafana auto-provisions: +# • a Prometheus-typed data source pointing at VM +# • both dashboards from ../dashboards (runtime + sensors) +# +# Endpoints: +# • Grafana http://localhost:3000 (anonymous Admin) +# • VictoriaMetrics http://localhost:8428 +# • Substrate /metrics http://localhost:9100/metrics (on the host) + +services: + victoria-metrics: + image: victoriametrics/victoria-metrics:v1.115.0 + container_name: quic_ecs_dt_vm + ports: + - "8428:8428" + command: + - "-promscrape.config=/etc/vm/scrape.yml" + - "-retentionPeriod=1d" + - "-storageDataPath=/storage" + volumes: + - ./victoria-metrics/scrape.yml:/etc/vm/scrape.yml:ro + - vm-data:/storage + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + + grafana: + image: grafana/grafana:11.4.0 + container_name: quic_ecs_dt_grafana + ports: + - "3000:3000" + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=false + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_DEFAULT_THEME=dark + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ../dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + depends_on: + - victoria-metrics + restart: unless-stopped + +volumes: + vm-data: + grafana-data: diff --git a/monitoring/grafana/provisioning/dashboards/provider.yml b/monitoring/grafana/provisioning/dashboards/provider.yml new file mode 100644 index 0000000..7eaf73a --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/provider.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: quic_ecs_dt + orgId: 1 + folder: "quic_ecs_dt" + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml new file mode 
100644 index 0000000..569db64 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://victoria-metrics:8428 + isDefault: true + editable: true + jsonData: + timeInterval: "1s" + httpMethod: "POST" diff --git a/monitoring/victoria-metrics/scrape.yml b/monitoring/victoria-metrics/scrape.yml new file mode 100644 index 0000000..9c29ff9 --- /dev/null +++ b/monitoring/victoria-metrics/scrape.yml @@ -0,0 +1,14 @@ +# VictoriaMetrics scrape config — uses Prometheus-compatible syntax. +# 1-second interval gives Grafana something to redraw every refresh tick. + +global: + scrape_interval: 1s + scrape_timeout: 800ms + +scrape_configs: + - job_name: substrate + static_configs: + - targets: + - "host.docker.internal:9100" + labels: + instance: "substrate-local" diff --git a/scripts/bench-scaling.sh b/scripts/bench-scaling.sh new file mode 100755 index 0000000..9fe9447 --- /dev/null +++ b/scripts/bench-scaling.sh @@ -0,0 +1,248 @@ +#!/usr/bin/env bash +# scripts/bench-scaling.sh — M6-lite: sweep T1 rate at fixed entity count, +# record tick_hz / P99 latency / drops / RSS into a CSV the paper can plot. +# +# Two modes: +# +# 1. Scaling sweep (default). Just T1 traffic. Tells you the substrate's +# throughput ceiling on this host and where the lossy-tier kicks in. +# Output: data/local/scaling.csv +# +# 2. Cross-tier isolation. Set T3_RATE_HZ= to run a constant T3 baseline +# in parallel with the T1 sweep. The CSV gains substrate-side T3 latency +# columns. If T3 P99 stays flat as T1 climbs orders of magnitude, the +# paper's composition thesis is supported. 
+# Output: data/local/cross_tier.csv +# +# Holds: +# - tick_rate_hz $TICK_RATE_HZ (default 1000; set 0 for busy-loop) +# - device count $DEVICES (default 100, single-sensor profile) +# - window $WINDOW_S (default 20s steady-state per rate) +# - T3 baseline $T3_RATE_HZ (default 0 = disabled) +# - T3 timeout $T3_TIMEOUT_MS (default 2000ms) +# - build profile $BUILD (release | debug; default release) +# +# Sweeps: +# T1 rate over the positional arguments, or these defaults: +# 100 500 1000 5000 10000 25000 50000 +# +# Examples: +# # Pure T1 scaling sweep. +# ./scripts/bench-scaling.sh +# +# # Cross-tier isolation: hold T3 at 100 Hz, sweep T1. +# T3_RATE_HZ=100 ./scripts/bench-scaling.sh +# +# # Custom sweep, longer windows. +# DEVICES=1000 WINDOW_S=30 ./scripts/bench-scaling.sh 1000 5000 20000 + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$ROOT" + +# --- knobs --- +DEVICES="${DEVICES:-100}" +TICK_RATE_HZ="${TICK_RATE_HZ:-1000}" +WARMUP_S="${WARMUP_S:-3}" +WINDOW_S="${WINDOW_S:-20}" +T3_RATE_HZ="${T3_RATE_HZ:-0}" +T3_TIMEOUT_MS="${T3_TIMEOUT_MS:-2000}" +BUILD="${BUILD:-release}" +RATES=("${@}") +if [[ ${#RATES[@]} -eq 0 ]]; then + RATES=(100 500 1000 5000 10000 25000 50000) +fi + +# Pick default output path based on mode so the two CSVs don't clobber. +CROSS_TIER=$(awk -v r="$T3_RATE_HZ" 'BEGIN { print (r+0 > 0) ? 
"1" : "0" }') +if [[ "$CROSS_TIER" == "1" ]]; then + OUT_CSV="${OUT_CSV:-data/local/cross_tier.csv}" +else + OUT_CSV="${OUT_CSV:-data/local/scaling.csv}" +fi + +# --- pretty logging --- +if [[ -t 1 ]]; then + BOLD=$'\033[1m'; DIM=$'\033[2m'; GREEN=$'\033[32m'; RED=$'\033[31m'; RESET=$'\033[0m' +else BOLD=; DIM=; GREEN=; RED=; RESET=; fi +step() { printf '%s» %s%s\n' "$BOLD" "$1" "$RESET"; } +ok() { printf '%s ✓ %s%s\n' "$GREEN" "$1" "$RESET"; } +fail() { printf '%s ✗ %s%s\n' "$RED" "$1" "$RESET"; } + +# --- prereqs --- +for cmd in cargo curl lsof awk; do + command -v "$cmd" >/dev/null || { fail "missing: $cmd"; exit 1; } +done +for port in 9000 9100; do + if lsof -nP -iUDP:$port -iTCP:$port -sTCP:LISTEN 2>/dev/null | grep -q LISTEN; then + fail "port $port in use — kill the running substrate first" + exit 1 + fi +done +[[ -f certs/server.crt ]] || make certs >/dev/null + +# --- build --- +step "Building ($BUILD)" +if [[ "$BUILD" == "release" ]]; then + cargo build --release -p substrate -p simulator >/dev/null + SUBSTRATE="$ROOT/target/release/substrate" + SIMULATOR="$ROOT/target/release/simulator" +else + cargo build -p substrate -p simulator >/dev/null + SUBSTRATE="$ROOT/target/debug/substrate" + SIMULATOR="$ROOT/target/debug/simulator" +fi + +# --- start substrate with high tick rate --- +LOG_DIR="/tmp/quic_ecs_dt_bench" +mkdir -p "$LOG_DIR" +SUB_LOG="$LOG_DIR/substrate.log" +: > "$SUB_LOG" + +step "Starting substrate (tick_rate_hz=$TICK_RATE_HZ, log: $SUB_LOG)" +APP_SIMULATION__TICK_RATE_HZ="$TICK_RATE_HZ" RUST_LOG=warn "$SUBSTRATE" >"$SUB_LOG" 2>&1 & +SUBSTRATE_PID=$! 
+ +# Wait for /metrics +for i in $(seq 1 40); do + if curl -sf http://localhost:9100/metrics >/dev/null 2>&1; then + ok "substrate /metrics ready"; break + fi + sleep 0.25 + if [[ $i -eq 40 ]]; then fail "substrate didn't start"; tail -20 "$SUB_LOG"; exit 1; fi +done + +cleanup() { + [[ -n "${SIM_PID:-}" ]] && kill -TERM "$SIM_PID" 2>/dev/null || true + [[ -n "${SUBSTRATE_PID:-}" ]] && kill -TERM "$SUBSTRATE_PID" 2>/dev/null || true + wait 2>/dev/null || true +} +trap cleanup EXIT INT TERM + +# --- helpers to scrape a single value out of /metrics text --- +snapshot_to() { + curl -s http://localhost:9100/metrics > "$1" +} +get_value() { + # $1: snapshot file, $2: full metric name (regex-anchored at line start) + awk -v pat="$2" '$0 ~ "^" pat " " { print $NF; exit }' "$1" +} + +# --- sweep --- +mkdir -p "$(dirname "$OUT_CSV")" +echo "rate_hz,t3_rate_hz,devices,tick_rate_hz,window_s,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t3_received,t3_no_handler,t3_p50_us,t3_p99_us,t3_p999_us,tick_hz,rss_mb,channel_depth_max" > "$OUT_CSV" + +if [[ "$CROSS_TIER" == "1" ]]; then + step "Sweeping T1 + holding T3 at ${T3_RATE_HZ} Hz (warmup ${WARMUP_S}s, window ${WINDOW_S}s, devices=$DEVICES)" +else + step "Sweeping T1 rate (warmup ${WARMUP_S}s, window ${WINDOW_S}s, devices=$DEVICES)" +fi +printf '%s' "$BOLD" +if [[ "$CROSS_TIER" == "1" ]]; then + printf '%-8s %-9s %-9s %-10s %-10s %-8s %-9s %-10s %-10s %-8s %-7s\n' \ + "rate" "t1_recv" "t1_drop" "t1_p50" "t1_p99" "t3_recv" "t3_p50" "t3_p99" "t3_p999" "tick_hz" "rss_mb" +else + printf '%-8s %-9s %-9s %-10s %-10s %-10s %-8s %-7s\n' \ + "rate" "received" "dropped" "p50_us" "p99_us" "p999_us" "tick_hz" "rss_mb" +fi +printf '%s' "$RESET" + +# Snapshot file paths +BEFORE="$LOG_DIR/before.txt" +AFTER="$LOG_DIR/after.txt" + +# Peak-tracker for channel depth: tail /metrics at 4 Hz during the window +peak_depth() { + local label="$1" # "t1" or "t2" or "t3" + local max=0 + local val + for _ in $(seq 1 $(( WINDOW_S * 4 ))); do + 
val=$(curl -s http://localhost:9100/metrics 2>/dev/null \ + | awk -v pat="^substrate_channel_depth\\\\{tier=\"$label\"\\\\}" '$0 ~ pat {print $NF; exit}') + if [[ -n "$val" && "$val" != "0" ]]; then + # Compare numerically; bash can do integer compare via [[ ]] + int_val="${val%.*}" + if (( int_val > max )); then max=$int_val; fi + fi + sleep 0.25 + done + echo "$max" +} + +for rate in "${RATES[@]}"; do + # Launch simulator in background. In cross-tier mode it drives both T1 + # and T3 on the same connection; otherwise just T1. + sim_args=( + --profile single + --sensor-type generic + --rate-hz "$rate" + --count 0 + --devices "$DEVICES" + ) + if [[ "$CROSS_TIER" == "1" ]]; then + sim_args+=(--t3-rate-hz "$T3_RATE_HZ" --t3-timeout-ms "$T3_TIMEOUT_MS") + fi + RUST_LOG=warn "$SIMULATOR" "${sim_args[@]}" >"$LOG_DIR/sim_${rate}.log" 2>&1 & + SIM_PID=$! + + # Warmup, then snapshot counters at the start of the *measurement* window. + sleep "$WARMUP_S" + snapshot_to "$BEFORE" + rec_before=$(get_value "$BEFORE" 'substrate_received_total\{tier="t1"\}') + drop_before=$(get_value "$BEFORE" 'substrate_dropped_total\{tier="t1"\}') + t3_rec_before=$(get_value "$BEFORE" 'substrate_received_total\{tier="t3"\}') + t3_nh_before=$(get_value "$BEFORE" 'substrate_t3_no_handler_total') + + depth_max=$(peak_depth t1) + + snapshot_to "$AFTER" + kill -TERM "$SIM_PID" 2>/dev/null || true + wait "$SIM_PID" 2>/dev/null || true + SIM_PID="" + + rec_after=$(get_value "$AFTER" 'substrate_received_total\{tier="t1"\}') + drop_after=$(get_value "$AFTER" 'substrate_dropped_total\{tier="t1"\}') + p50=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.5"\}') + p99=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.99"\}') + p999=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.999"\}') + + t3_rec_after=$(get_value "$AFTER" 'substrate_received_total\{tier="t3"\}') + t3_nh_after=$(get_value "$AFTER" 'substrate_t3_no_handler_total') + t3_p50=$(get_value 
"$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.5"\}') + t3_p99=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.99"\}') + t3_p999=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.999"\}') + + tick_hz=$(get_value "$AFTER" 'substrate_tick_hz') + rss=$(get_value "$AFTER" 'substrate_rss_bytes') + + # Compute deltas + format. Use awk for floating math. + received=$(awk -v a="$rec_after" -v b="$rec_before" 'BEGIN { printf "%d", a-b }') + dropped=$(awk -v a="$drop_after" -v b="$drop_before" 'BEGIN { printf "%d", a-b }') + t3_received=$(awk -v a="$t3_rec_after" -v b="$t3_rec_before" 'BEGIN { printf "%d", a-b }') + t3_no_handler=$(awk -v a="$t3_nh_after" -v b="$t3_nh_before" 'BEGIN { printf "%d", a-b }') + rss_mb=$(awk -v r="$rss" 'BEGIN { printf "%.1f", r/1048576 }') + tick_hz_fmt=$(awk -v t="$tick_hz" 'BEGIN { printf "%.1f", t }') + + if [[ "$CROSS_TIER" == "1" ]]; then + printf '%-8s %-9s %-9s %-10.0f %-10.0f %-8s %-9.0f %-10.0f %-10.0f %-8s %-7s\n' \ + "$rate" "$received" "$dropped" \ + "${p50:-0}" "${p99:-0}" \ + "$t3_received" "${t3_p50:-0}" "${t3_p99:-0}" "${t3_p999:-0}" \ + "$tick_hz_fmt" "$rss_mb" + else + printf '%-8s %-9s %-9s %-10.0f %-10.0f %-10.0f %-8s %-7s\n' \ + "$rate" "$received" "$dropped" "${p50:-0}" "${p99:-0}" "${p999:-0}" \ + "$tick_hz_fmt" "$rss_mb" + fi + + echo "$rate,$T3_RATE_HZ,$DEVICES,$TICK_RATE_HZ,$WINDOW_S,$received,$dropped,${p50:-0},${p99:-0},${p999:-0},$t3_received,$t3_no_handler,${t3_p50:-0},${t3_p99:-0},${t3_p999:-0},$tick_hz_fmt,$rss_mb,$depth_max" >> "$OUT_CSV" + + # Tiny breather between rate points so the substrate's summary window + # doesn't carry over. 
+ sleep 1 +done + +printf '\n%sCSV written to:%s %s\n' "$DIM" "$RESET" "$OUT_CSV" +printf '%sSubstrate log:%s %s\n' "$DIM" "$RESET" "$SUB_LOG" diff --git a/scripts/demo.sh b/scripts/demo.sh new file mode 100755 index 0000000..52c5aab --- /dev/null +++ b/scripts/demo.sh @@ -0,0 +1,222 @@ +#!/usr/bin/env bash +# scripts/demo.sh — bring the whole stack up: certs → build → VM+Grafana → +# substrate → simulator. Tails simulator progress in the foreground. Ctrl-C +# cleans everything up. +# +# Overridable via env vars: +# PROFILE single | industrial (default: industrial) +# RATE_HZ T1 datagram rate (default: 500) +# T2_RATE_HZ T2 uni stream rate (default: 5) +# T3_RATE_HZ T3 bi stream rate (default: 2) +# DEVICES number of devices (default: 5) +# BUILD release | debug (default: release) +# KEEP_MONITORING if 1, don't `docker compose down` on exit (default: 0) +# +# Example: +# ./scripts/demo.sh +# PROFILE=single RATE_HZ=100 DEVICES=20 ./scripts/demo.sh +# KEEP_MONITORING=1 ./scripts/demo.sh + +set -euo pipefail + +# --- locate repo root --- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$ROOT" + +# --- defaults --- +PROFILE="${PROFILE:-industrial}" +RATE_HZ="${RATE_HZ:-500}" +T2_RATE_HZ="${T2_RATE_HZ:-5}" +T3_RATE_HZ="${T3_RATE_HZ:-2}" +DEVICES="${DEVICES:-5}" +BUILD="${BUILD:-release}" +KEEP_MONITORING="${KEEP_MONITORING:-0}" +LOG_DIR="${LOG_DIR:-/tmp/quic_ecs_dt}" + +# --- pretty logging --- +if [[ -t 1 ]]; then + BOLD=$'\033[1m'; DIM=$'\033[2m'; GREEN=$'\033[32m' + YELLOW=$'\033[33m'; RED=$'\033[31m'; CYAN=$'\033[36m'; RESET=$'\033[0m' +else + BOLD=; DIM=; GREEN=; YELLOW=; RED=; CYAN=; RESET= +fi +step() { printf '%s» %s%s\n' "$BOLD" "$1" "$RESET"; } +ok() { printf '%s ✓ %s%s\n' "$GREEN" "$1" "$RESET"; } +warn() { printf '%s ! 
%s%s\n' "$YELLOW" "$1" "$RESET"; } +fail() { printf '%s ✗ %s%s\n' "$RED" "$1" "$RESET"; } + +# --- prereq check --- +step "Checking prerequisites" +for cmd in cargo docker openssl curl lsof; do + if ! command -v "$cmd" >/dev/null 2>&1; then + fail "missing required command: $cmd" + exit 1 + fi +done +if ! docker compose version >/dev/null 2>&1; then + fail "docker compose plugin not available (try 'docker compose version')" + exit 1 +fi +ok "cargo, docker, openssl, curl, lsof present" + +# --- port collision check (substrate runs on 9000 udp + 9100 tcp) --- +for port in 9000 9100; do + if lsof -nP -iUDP:$port -iTCP:$port -sTCP:LISTEN 2>/dev/null | grep -q LISTEN; then + fail "port $port appears to be in use — another substrate or process is running" + lsof -nP -iUDP:$port -iTCP:$port -sTCP:LISTEN 2>/dev/null | head -5 + exit 1 + fi +done +ok "ports 9000 (QUIC) and 9100 (/metrics) are free" + +# --- certs --- +step "Ensuring dev TLS cert exists" +if [[ ! -f certs/server.crt || ! -f certs/server.key ]]; then + make certs >/dev/null + ok "generated certs/server.{crt,key}" +else + ok "certs/server.{crt,key} already present" +fi + +# --- build --- +step "Building substrate + simulator ($BUILD profile)" +if [[ "$BUILD" == "release" ]]; then + cargo build --release -p substrate -p simulator + SUBSTRATE_BIN="$ROOT/target/release/substrate" + SIMULATOR_BIN="$ROOT/target/release/simulator" +else + cargo build -p substrate -p simulator + SUBSTRATE_BIN="$ROOT/target/debug/substrate" + SIMULATOR_BIN="$ROOT/target/debug/simulator" +fi +ok "binaries: $SUBSTRATE_BIN, $SIMULATOR_BIN" + +# --- monitoring --- +step "Bringing up VictoriaMetrics + Grafana (docker compose)" +docker compose -f monitoring/docker-compose.yml up -d >/dev/null +ok "containers started" + +printf '%s ⏳ waiting for VictoriaMetrics on :8428' "$DIM" +for i in $(seq 1 40); do + if curl -sf http://localhost:8428/health >/dev/null 2>&1; then + printf ' ready%s\n' "$RESET"; break + fi + printf '.'; sleep 0.5 + if [[ 
$i -eq 40 ]]; then printf ' TIMEOUT%s\n' "$RESET"; exit 1; fi +done + +printf '%s ⏳ waiting for Grafana on :3000' "$DIM" +for i in $(seq 1 40); do + if curl -sf http://localhost:3000/api/health >/dev/null 2>&1; then + printf ' ready%s\n' "$RESET"; break + fi + printf '.'; sleep 0.5 + if [[ $i -eq 40 ]]; then printf ' TIMEOUT%s\n' "$RESET"; exit 1; fi +done + +# --- substrate --- +mkdir -p "$LOG_DIR" +SUB_LOG="$LOG_DIR/substrate.log" +SIM_LOG="$LOG_DIR/simulator.log" +: >"$SUB_LOG" +: >"$SIM_LOG" + +step "Starting substrate (log: $SUB_LOG)" +RUST_LOG=info "$SUBSTRATE_BIN" >"$SUB_LOG" 2>&1 & +SUBSTRATE_PID=$! + +printf '%s ⏳ waiting for /metrics on :9100' "$DIM" +for i in $(seq 1 40); do + if curl -sf http://localhost:9100/metrics >/dev/null 2>&1; then + printf ' ready%s\n' "$RESET"; break + fi + printf '.'; sleep 0.25 + if [[ $i -eq 40 ]]; then + printf ' TIMEOUT%s\n' "$RESET" + warn "substrate failed to start; tail of $SUB_LOG:" + tail -30 "$SUB_LOG" + kill "$SUBSTRATE_PID" 2>/dev/null || true + exit 1 + fi +done + +# --- simulator --- +TOTAL_SLOTS=$DEVICES +if [[ "$PROFILE" == "industrial" ]]; then + TOTAL_SLOTS=$((DEVICES * 5)) +fi + +step "Starting simulator (log: $SIM_LOG)" +RUST_LOG=info "$SIMULATOR_BIN" \ + --profile "$PROFILE" \ + --rate-hz "$RATE_HZ" \ + --t2-rate-hz "$T2_RATE_HZ" \ + --t3-rate-hz "$T3_RATE_HZ" \ + --count 0 \ + --devices "$DEVICES" \ + >"$SIM_LOG" 2>&1 & +SIMULATOR_PID=$! +sleep 0.5 +if ! 
kill -0 "$SIMULATOR_PID" 2>/dev/null; then + fail "simulator exited immediately; tail of $SIM_LOG:" + tail -20 "$SIM_LOG" + kill "$SUBSTRATE_PID" 2>/dev/null || true + exit 1 +fi +ok "simulator PID $SIMULATOR_PID" + +# --- cleanup trap --- +cleanup() { + printf '\n%s» Cleaning up%s\n' "$BOLD" "$RESET" + if [[ -n "${SIMULATOR_PID:-}" ]]; then + kill -TERM "$SIMULATOR_PID" 2>/dev/null || true + wait "$SIMULATOR_PID" 2>/dev/null || true + ok "simulator stopped" + fi + if [[ -n "${SUBSTRATE_PID:-}" ]]; then + kill -TERM "$SUBSTRATE_PID" 2>/dev/null || true + wait "$SUBSTRATE_PID" 2>/dev/null || true + ok "substrate stopped" + fi + if [[ "$KEEP_MONITORING" == "1" ]]; then + warn "leaving monitoring stack up (KEEP_MONITORING=1) — 'make monitoring-down' to stop" + else + docker compose -f monitoring/docker-compose.yml down >/dev/null 2>&1 || true + ok "monitoring stack stopped" + fi + printf '%sLogs preserved at:%s %s\n' "$DIM" "$RESET" "$LOG_DIR" +} +trap cleanup EXIT INT TERM + +# --- summary --- +cat <, + ) -> anyhow::Result { + let cert_path = cert_path.as_ref(); + let cert_pem = std::fs::read(cert_path) + .with_context(|| format!("read trust cert at {}", cert_path.display()))?; + + let parsed: Vec> = rustls_pemfile::certs(&mut cert_pem.as_slice()) + .collect::>() + .with_context(|| format!("parse PEM certs at {}", cert_path.display()))?; + let expected = parsed + .into_iter() + .next() + .ok_or_else(|| anyhow!("no certificates found in {}", cert_path.display()))?; + + // Reuse the process-wide rustls provider that `install_crypto_provider` + // (or substrate's main) already installed. Failing to find one here + // means nobody installed a default — caller error. + let provider = rustls::crypto::CryptoProvider::get_default() + .ok_or_else(|| anyhow!("no rustls default crypto provider installed"))? 
+ .clone(); + + let verifier = Arc::new(TrustExactCert { + expected, + provider: provider.clone(), + }); + + let rustls_cfg = rustls::ClientConfig::builder_with_provider(provider) + .with_safe_default_protocol_versions() + .context("rustls client builder")? + .dangerous() + .with_custom_certificate_verifier(verifier) + .with_no_client_auth(); + + let quic_cfg = quinn::crypto::rustls::QuicClientConfig::try_from(rustls_cfg) + .context("wrap rustls config for QUIC")?; + let client_cfg = ClientConfig::new(Arc::new(quic_cfg)); + + let bind: SocketAddr = if server_addr.is_ipv6() { + "[::]:0".parse().unwrap() + } else { + "0.0.0.0:0".parse().unwrap() + }; + let mut endpoint = Endpoint::client(bind).context("Endpoint::client bind")?; + endpoint.set_default_client_config(client_cfg); + + let connecting = endpoint + .connect(server_addr, server_name) + .with_context(|| format!("client connect to {server_addr} as {server_name}"))?; + let conn = connecting.await.context("client TLS handshake")?; + + tracing::info!(remote = %conn.remote_address(), "simulator client connected"); + Ok(Self { endpoint, conn }) + } + + /// T1 — send one `QuicMessage` over a QUIC datagram (38 B fixed). + pub fn send_datagram(&self, msg: &QuicMessage) -> anyhow::Result<()> { + let bytes = bytes::Bytes::copy_from_slice(&msg.to_bytes()); + self.conn.send_datagram(bytes).context("send_datagram")?; + Ok(()) + } + + /// T2 — open a unidirectional stream, write each message as 38 B back-to-back, + /// then `finish()` the stream. The substrate sees one or many events per + /// stream, ordered within the stream. 
+    pub async fn send_uni_stream(&self, msgs: &[QuicMessage]) -> anyhow::Result<()> {
+        let mut send = self.conn.open_uni().await.context("open_uni")?;
+        for msg in msgs {
+            send.write_all(&msg.to_bytes())
+                .await
+                .context("write QuicMessage to uni stream")?;
+        }
+        send.finish().context("finish uni stream")?;
+        Ok(())
+    }
+
+    /// T3 — open a bidirectional stream, write the command (38 B), finish the
+    /// send half, then read the substrate's ack (38 B). Errors if the
+    /// substrate resets the stream (e.g. no handler installed yet) or if the
+    /// connection drops mid-exchange.
+    pub async fn request(&self, command: &QuicMessage) -> anyhow::Result<QuicMessage> {
+        let (mut send, mut recv) = self.conn.open_bi().await.context("open_bi")?;
+        send.write_all(&command.to_bytes())
+            .await
+            .context("write T3 command")?;
+        send.finish().context("finish T3 send half")?;
+
+        let mut buf = [0u8; QuicMessage::WIRE_SIZE];
+        recv.read_exact(&mut buf)
+            .await
+            .context("read T3 ack")?;
+        let ack = QuicMessage::decode(&buf).context("decode T3 ack")?;
+        Ok(ack)
+    }
+
+    /// Close the connection gracefully. Use before dropping in tests so the
+    /// peer's `conn.closed()` resolves cleanly instead of via timeout.
+    pub async fn close(&self) {
+        self.conn.close(0u32.into(), b"client done");
+        self.endpoint.wait_idle().await;
+    }
+}
+
+/// `ServerCertVerifier` that accepts exactly one specific cert by byte
+/// equality. Signature verification still runs through the default provider —
+/// only the chain-validity check is replaced.
+#[derive(Debug)] +struct TrustExactCert { + expected: CertificateDer<'static>, + provider: Arc, +} + +impl ServerCertVerifier for TrustExactCert { + fn verify_server_cert( + &self, + end_entity: &CertificateDer<'_>, + _intermediates: &[CertificateDer<'_>], + _server_name: &ServerName<'_>, + _ocsp_response: &[u8], + _now: UnixTime, + ) -> Result { + if end_entity.as_ref() == self.expected.as_ref() { + Ok(ServerCertVerified::assertion()) + } else { + Err(rustls::Error::General( + "server cert does not match trusted dev cert".into(), + )) + } + } + + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &CertificateDer<'_>, + dss: &DigitallySignedStruct, + ) -> Result { + rustls::crypto::verify_tls12_signature( + message, + cert, + dss, + &self.provider.signature_verification_algorithms, + ) + } + + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &CertificateDer<'_>, + dss: &DigitallySignedStruct, + ) -> Result { + rustls::crypto::verify_tls13_signature( + message, + cert, + dss, + &self.provider.signature_verification_algorithms, + ) + } + + fn supported_verify_schemes(&self) -> Vec { + self.provider.signature_verification_algorithms.supported_schemes() + } +} diff --git a/simulator/src/emitters.rs b/simulator/src/emitters.rs new file mode 100644 index 0000000..861d5ec --- /dev/null +++ b/simulator/src/emitters.rs @@ -0,0 +1,147 @@ +//! Async emitter tasks for T2 (uni streams) and T3 (bi streams + ack). +//! +//! Each emitter ticks at its own rate, opens a fresh stream per event, and +//! shares a `Connection` with the rest of the simulator. T1 (datagrams) is +//! driven inline by the main loop so the foreground task owns the progress +//! reporting; the reliable tiers run as `tokio::spawn`ed background tasks. 
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+use anyhow::Context;
+use substrate::transport::QuicMessage;
+use tokio::time::MissedTickBehavior;
+
+use crate::profile::{SensorSlot, generate_value};
+
+/// UNIX-epoch microseconds — the wall-clock timestamp the simulator stamps
+/// into every outgoing `QuicMessage`. Substrate-side latency is computed as
+/// `substrate_now_us - msg.timestamp_us`, so this needs to be a real wall
+/// clock both ends share (NTP for two-machine; loopback otherwise).
+///
+/// Returns `0` if the system clock reads earlier than the UNIX epoch.
+pub fn now_us() -> u64 {
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_micros() as u64)
+        .unwrap_or(0)
+}
+
+/// Tick period for an emitter running at `rate_hz`.
+///
+/// The float-to-integer cast saturates and maps NaN to zero, so rates above
+/// 1 GHz, `inf` and NaN would all produce a zero period. The result is
+/// clamped to 1 ns because `tokio::time::interval` panics on a zero period.
+fn tick_period(rate_hz: f64) -> Duration {
+    let nanos = (1.0e9 / rate_hz) as u64;
+    Duration::from_nanos(nanos.max(1))
+}
+
+/// Build the next message for `slot` (value from the slot's waveform, current
+/// wall-clock timestamp) and advance the slot's wrapping sequence counter.
+fn next_message(slot: &mut SensorSlot) -> QuicMessage {
+    let msg = QuicMessage {
+        device_id: slot.device_id,
+        sensor_id: slot.sensor_id,
+        raw_value: generate_value(slot.sensor_type, slot.seq),
+        timestamp_us: now_us(),
+        sequence_number: slot.seq,
+        sensor_type: slot.sensor_type.as_u8(),
+    };
+    slot.seq = slot.seq.wrapping_add(1);
+    msg
+}
+
+/// T2 emitter — opens a fresh uni stream per event, writes one
+/// `QuicMessage`, and `finish`es.
+///
+/// Runs until `interrupted` is raised or `open_uni` fails. A failed write or
+/// `finish` is logged and skipped; the sequence number is still consumed.
+/// `counter` mirrors the running total so other tasks can report progress.
+/// Returns the number of events that were written and finished.
+pub async fn run_t2_emitter(
+    conn: quinn::Connection,
+    mut slot: SensorSlot,
+    rate_hz: f64,
+    interrupted: Arc<AtomicBool>,
+    counter: Arc<AtomicU64>,
+) -> u64 {
+    let mut ticker = tokio::time::interval(tick_period(rate_hz));
+    ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
+    let mut sent: u64 = 0;
+
+    loop {
+        ticker.tick().await;
+        if interrupted.load(Ordering::SeqCst) {
+            break;
+        }
+
+        let msg = next_message(&mut slot);
+
+        match conn.open_uni().await {
+            Ok(mut send) => {
+                if let Err(e) = send.write_all(&msg.to_bytes()).await {
+                    tracing::warn!(error = %e, "T2 write_all failed");
+                    continue;
+                }
+                if let Err(e) = send.finish() {
+                    tracing::warn!(error = %e, "T2 finish failed");
+                    continue;
+                }
+                sent += 1;
+                counter.store(sent, Ordering::Relaxed);
+            }
+            Err(e) => {
+                tracing::warn!(error = %e, "T2 open_uni failed; emitter exiting");
+                break;
+            }
+        }
+    }
+    sent
+}
+
+/// T3 emitter — opens a fresh bi-stream per command, writes the command,
+/// awaits the ack with a bounded timeout. Returns `(acks_received, timeouts)`.
+///
+/// Runs until `interrupted` is raised or a request fails on a connection that
+/// has already closed. `sent_counter` and `timeout_counter` mirror the two
+/// running totals so other tasks can report progress.
+pub async fn run_t3_emitter(
+    conn: quinn::Connection,
+    mut slot: SensorSlot,
+    rate_hz: f64,
+    timeout: Duration,
+    interrupted: Arc<AtomicBool>,
+    sent_counter: Arc<AtomicU64>,
+    timeout_counter: Arc<AtomicU64>,
+) -> (u64, u64) {
+    let mut ticker = tokio::time::interval(tick_period(rate_hz));
+    ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
+    let mut sent: u64 = 0;
+    let mut timeouts: u64 = 0;
+
+    loop {
+        ticker.tick().await;
+        if interrupted.load(Ordering::SeqCst) {
+            break;
+        }
+
+        let cmd = next_message(&mut slot);
+
+        match tokio::time::timeout(timeout, t3_one_request(&conn, &cmd)).await {
+            Ok(Ok(_ack)) => {
+                sent += 1;
+                sent_counter.store(sent, Ordering::Relaxed);
+            }
+            Ok(Err(e)) => {
+                // Once the connection is closed every further request fails
+                // the same way; stop instead of logging at `rate_hz` forever,
+                // matching the T2 emitter's exit on `open_uni` failure.
+                if conn.close_reason().is_some() {
+                    tracing::warn!(error = %e, "T3 request failed; connection closed, emitter exiting");
+                    break;
+                }
+                tracing::warn!(error = %e, "T3 request failed");
+            }
+            Err(_) => {
+                timeouts += 1;
+                timeout_counter.store(timeouts, Ordering::Relaxed);
+                tracing::warn!(?timeout, "T3 ack timed out");
+            }
+        }
+    }
+    (sent, timeouts)
+}
+
+/// Single T3 round-trip: open bi-stream, write 38 B command, `finish` the
+/// send half, read 38 B ack. Used by `run_t3_emitter`.
+async fn t3_one_request(
+    conn: &quinn::Connection,
+    cmd: &QuicMessage,
+) -> anyhow::Result<QuicMessage> {
+    let (mut send, mut recv) = conn.open_bi().await.context("T3 open_bi")?;
+    send.write_all(&cmd.to_bytes())
+        .await
+        .context("T3 write command")?;
+    send.finish().context("T3 finish send half")?;
+    let mut buf = [0u8; QuicMessage::WIRE_SIZE];
+    recv.read_exact(&mut buf).await.context("T3 read ack")?;
+    QuicMessage::decode(&buf).context("T3 decode ack")
+}
diff --git a/simulator/src/lib.rs b/simulator/src/lib.rs
new file mode 100644
index 0000000..af27f96
--- /dev/null
+++ b/simulator/src/lib.rs
@@ -0,0 +1,12 @@
+pub mod client;
+pub mod emitters;
+pub mod profile;
+
+/// Install rustls' default crypto provider. Idempotent: safe to call from
+/// every test, every binary entry, and the substrate process. The `aws_lc_rs`
+/// provider matches what the substrate installs in `main.rs`.
+pub fn install_crypto_provider() {
+    // Returns Err if a provider is already installed; that's the expected
+    // case in any process that's already booted substrate or a sibling test.
+    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
+}
diff --git a/simulator/src/main.rs b/simulator/src/main.rs
index e7a11a9..0a232ec 100644
--- a/simulator/src/main.rs
+++ b/simulator/src/main.rs
@@ -1,3 +1,320 @@
-fn main() {
-    println!("Hello, world!");
+//! Manual smoke runner / load driver for the substrate.
+//!
+//! Parses the CLI, builds the per-device sensor layout, then drives T1
+//! datagrams in the foreground while T2 and T3 emitters run as background
+//! tokio tasks. Helpers live in the simulator library:
+//!
+//! - `simulator::profile` — `SensorProfile`, `SensorSlot`, waveform generator
+//! - `simulator::emitters` — `run_t2_emitter`, `run_t3_emitter`, `now_us`
+//! - `simulator::client` — Quinn client + TLS trust-by-cert verifier
+
+use std::net::SocketAddr;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::time::{Duration, Instant};
+
+use anyhow::{Context, anyhow};
+use clap::{Parser, ValueEnum};
+use simulator::client::SimulatorClient;
+use simulator::emitters::{now_us, run_t2_emitter, run_t3_emitter};
+use simulator::profile::{SensorProfile, build_slots, generate_value};
+use substrate::transport::{QuicMessage, SensorType};
+use tokio::time::MissedTickBehavior;
+use tracing_subscriber::EnvFilter;
+
+#[derive(Parser, Debug)]
+#[command(name = "simulator", about, long_about = None)]
+struct Cli {
+    /// Substrate address (host:port).
+    #[arg(long, default_value = "127.0.0.1:9000")]
+    addr: SocketAddr,
+
+    /// SNI name presented during the TLS handshake.
+    #[arg(long, default_value = "localhost")]
+    server_name: String,
+
+    /// Path to the substrate's PEM cert; used as the exact-match trust anchor.
+    #[arg(long, default_value = "certs/server.crt")]
+    cert: PathBuf,
+
+    /// Sensor mix per device.
+    ///
+    /// - `single` (default): one sensor per device of `--sensor-type`, on
+    ///   `--sensor-id`. Lowest-cardinality, easiest to reason about.
+    /// - `industrial`: five sensors per device on ids 0..4 — Temperature,
+    ///   Humidity, Pressure, Voltage, Current. Lights up every dashboard
+    ///   panel.
+    #[arg(long, value_enum, default_value_t = SensorProfile::Single)]
+    profile: SensorProfile,
+
+    /// Sensor type for the `single` profile. Ignored by `industrial`.
+    #[arg(long, value_enum, default_value_t = CliSensorType::Generic)]
+    sensor_type: CliSensorType,
+
+    /// T1 datagram rate across all (device, sensor) slots (Hz). `0` disables T1.
+    #[arg(long, default_value_t = 20.0)]
+    rate_hz: f64,
+
+    /// T2 uni-stream event rate (Hz). `0` disables T2 (default).
+    #[arg(long, default_value_t = 0.0)]
+    t2_rate_hz: f64,
+
+    /// T3 bidirectional command rate (Hz). `0` disables T3 (default).
+    #[arg(long, default_value_t = 0.0)]
+    t3_rate_hz: f64,
+
+    /// Per-command timeout for T3 ack waits (milliseconds).
+    #[arg(long, default_value_t = 2000)]
+    t3_timeout_ms: u64,
+
+    /// Number of T1 datagrams to send. `0` runs until Ctrl-C.
+    #[arg(long, default_value_t = 10)]
+    count: u64,
+
+    /// Number of distinct device UUIDs to round-robin.
+    #[arg(long, default_value_t = 1)]
+    devices: u32,
+
+    /// Sensor index for the `single` profile. Ignored by `industrial`.
+    #[arg(long, default_value_t = 0)]
+    sensor_id: u16,
+}
+
+#[derive(ValueEnum, Clone, Copy, Debug, Default)]
+enum CliSensorType {
+    #[default]
+    Generic,
+    Temperature,
+    Humidity,
+    Pressure,
+    Voltage,
+    Current,
+}
+
+impl From<CliSensorType> for SensorType {
+    fn from(c: CliSensorType) -> Self {
+        match c {
+            CliSensorType::Generic => SensorType::Generic,
+            CliSensorType::Temperature => SensorType::Temperature,
+            CliSensorType::Humidity => SensorType::Humidity,
+            CliSensorType::Pressure => SensorType::Pressure,
+            CliSensorType::Voltage => SensorType::Voltage,
+            CliSensorType::Current => SensorType::Current,
+        }
+    }
+}
+
+/// Reject CLI combinations the run loop cannot honour: a rate that is
+/// negative or not a finite number, no tier enabled at all, or zero devices.
+fn validate(cli: &Cli) -> anyhow::Result<()> {
+    // `f64` parsing accepts `nan` and `inf`. NaN slips past a plain `< 0.0`
+    // comparison, and an infinite rate turns into a zero tick period.
+    for (flag, rate) in [
+        ("--rate-hz", cli.rate_hz),
+        ("--t2-rate-hz", cli.t2_rate_hz),
+        ("--t3-rate-hz", cli.t3_rate_hz),
+    ] {
+        if !rate.is_finite() {
+            return Err(anyhow!("{flag} must be a finite number"));
+        }
+        if rate < 0.0 {
+            return Err(anyhow!("{flag} must be >= 0"));
+        }
+    }
+    if cli.rate_hz == 0.0 && cli.t2_rate_hz == 0.0 && cli.t3_rate_hz == 0.0 {
+        return Err(anyhow!(
+            "at least one of --rate-hz / --t2-rate-hz / --t3-rate-hz must be > 0"
+        ));
+    }
+    if cli.devices == 0 {
+        return Err(anyhow!("--devices must be >= 1"));
+    }
+    Ok(())
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
+        )
+        .init();
+
+    let cli = Cli::parse();
+    validate(&cli)?;
+
+    simulator::install_crypto_provider();
+
+    let mut slots = build_slots(
+        cli.profile,
+        cli.devices,
+        cli.sensor_type.into(),
+        cli.sensor_id,
+    );
+
+    tracing::info!(
+        ?cli.addr,
+        rate_hz = cli.rate_hz,
+        t2_rate_hz = cli.t2_rate_hz,
+        t3_rate_hz = cli.t3_rate_hz,
+        count = cli.count,
+        devices = cli.devices,
+        slots = slots.len(),
+        profile = ?cli.profile,
+        "simulator launching"
+    );
+
+    let client = SimulatorClient::connect(cli.addr, &cli.server_name, &cli.cert)
+        .await
+        .context("connect to substrate")?;
+
+    let interrupted = Arc::new(AtomicBool::new(false));
+    {
+        let flag = interrupted.clone();
+        tokio::spawn(async move {
+            let _ = tokio::signal::ctrl_c().await;
+            tracing::info!("Ctrl-C received, draining…");
+            flag.store(true, Ordering::SeqCst);
+        });
+    }
+
+    // T2 / T3 emitters target slot[0] for their device/sensor identity.
+    // `validate` guarantees `--devices >= 1`, so `slots` is never empty here.
+    let t2_slot = slots[0].clone();
+    let t3_slot = slots[0].clone();
+
+    let t2_sent = Arc::new(AtomicU64::new(0));
+    let t2_handle = if cli.t2_rate_hz > 0.0 {
+        let conn = client.conn.clone();
+        let rate = cli.t2_rate_hz;
+        let interrupted = interrupted.clone();
+        let counter = t2_sent.clone();
+        Some(tokio::spawn(async move {
+            run_t2_emitter(conn, t2_slot, rate, interrupted, counter).await
+        }))
+    } else {
+        None
+    };
+
+    let t3_sent = Arc::new(AtomicU64::new(0));
+    let t3_timeouts = Arc::new(AtomicU64::new(0));
+    let t3_handle = if cli.t3_rate_hz > 0.0 {
+        let conn = client.conn.clone();
+        let rate = cli.t3_rate_hz;
+        let timeout = Duration::from_millis(cli.t3_timeout_ms);
+        let interrupted = interrupted.clone();
+        let sent_counter = t3_sent.clone();
+        let to_counter = t3_timeouts.clone();
+        Some(tokio::spawn(async move {
+            run_t3_emitter(
+                conn,
+                t3_slot,
+                rate,
+                timeout,
+                interrupted,
+                sent_counter,
+                to_counter,
+            )
+            .await
+        }))
+    } else {
+        None
+    };
+
+    let started = Instant::now();
+    let mut t1_sent: u64 = 0;
+    let mut send_errors: u64 = 0;
+
+    if cli.rate_hz > 0.0 {
+        // Clamp to 1 ns: rates above 1 GHz truncate to a zero period, and
+        // `tokio::time::interval` panics on a zero period.
+        let period = Duration::from_nanos(((1.0e9 / cli.rate_hz) as u64).max(1));
+        let mut ticker = tokio::time::interval(period);
+        ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
+
+        let unlimited = cli.count == 0;
+        let mut last_progress = started;
+
+        loop {
+            ticker.tick().await;
+            if interrupted.load(Ordering::SeqCst) {
+                break;
+            }
+            if !unlimited && t1_sent >= cli.count {
+                break;
+            }
+
+            // Round-robin over the slots; `t1_sent` counts attempts, and
+            // failed sends are tallied separately in `send_errors`.
+            let slot_idx = (t1_sent as usize) % slots.len();
+            let slot = &mut slots[slot_idx];
+            let msg = QuicMessage {
+                device_id: slot.device_id,
+                sensor_id: slot.sensor_id,
+                raw_value: generate_value(slot.sensor_type, slot.seq),
+                timestamp_us: now_us(),
+                sequence_number: slot.seq,
+                sensor_type: slot.sensor_type.as_u8(),
+            };
+            slot.seq = slot.seq.wrapping_add(1);
+            t1_sent += 1;
+
+            if let Err(e) = client.send_datagram(&msg) {
+                send_errors += 1;
+                tracing::warn!(error = %e, "send_datagram failed");
+            }
+
+            let now = Instant::now();
+            if now.duration_since(last_progress) >= Duration::from_secs(1) {
+                let elapsed = now.duration_since(started).as_secs_f64();
+                let t1_hz = (t1_sent as f64) / elapsed.max(1e-9);
+                let t2_now = t2_sent.load(Ordering::Relaxed);
+                let t2_hz = (t2_now as f64) / elapsed.max(1e-9);
+                let t3_now = t3_sent.load(Ordering::Relaxed);
+                let t3_hz = (t3_now as f64) / elapsed.max(1e-9);
+                let t3_to = t3_timeouts.load(Ordering::Relaxed);
+                tracing::info!(
+                    t1_sent,
+                    t2_sent = t2_now,
+                    t3_sent = t3_now,
+                    t3_timeouts = t3_to,
+                    send_errors,
+                    t1_hz = format_args!("{:.1}", t1_hz),
+                    t2_hz = format_args!("{:.1}", t2_hz),
+                    t3_hz = format_args!("{:.1}", t3_hz),
+                    "progress"
+                );
+                last_progress = now;
+            }
+        }
+    } else {
+        // T1 disabled: idle until Ctrl-C while the background emitters run.
+        while !interrupted.load(Ordering::SeqCst) {
+            tokio::time::sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    // Tell the background emitters to stop, then collect their totals.
+    interrupted.store(true, Ordering::SeqCst);
+    let t2_total: u64 = match t2_handle {
+        Some(h) => h.await.unwrap_or_else(|e| {
+            tracing::warn!(error = %e, "T2 emitter task ended unexpectedly");
+            0
+        }),
+        None => 0,
+    };
+    let (t3_total, t3_timeouts_total): (u64, u64) = match t3_handle {
+        Some(h) => h.await.unwrap_or_else(|e| {
+            tracing::warn!(error = %e, "T3 emitter task ended unexpectedly");
+            (0, 0)
+        }),
+        None => (0, 0),
+    };
+
+    let elapsed = started.elapsed().as_secs_f64();
+    let t1_hz = (t1_sent as f64) / elapsed.max(1e-9);
+    let t2_hz = (t2_total as f64) / elapsed.max(1e-9);
+    let t3_hz = (t3_total as f64) / elapsed.max(1e-9);
+    tracing::info!(
+        t1_sent,
+        t2_sent = t2_total,
+        t3_sent = t3_total,
+        t3_timeouts = t3_timeouts_total,
+        send_errors,
+        elapsed_s = format_args!("{:.3}", elapsed),
+        t1_observed_hz = format_args!("{:.1}", t1_hz),
+        t2_observed_hz = format_args!("{:.1}", t2_hz),
+        t3_observed_hz = format_args!("{:.1}", t3_hz),
+        "simulator done"
+    );
+
+    client.close().await;
+    Ok(())
 }
diff --git a/simulator/src/profile.rs b/simulator/src/profile.rs
new file mode 100644
index 0000000..5503ea3
--- /dev/null
+++ b/simulator/src/profile.rs
@@ -0,0 +1,88 @@
+//! Per-device sensor layout (the `--profile` CLI flag's runtime form) and the
+//! type-appropriate waveform generators that feed the substrate's Grafana
+//! dashboard with believable numbers.
+
+use clap::ValueEnum;
+use substrate::transport::SensorType;
+use uuid::Uuid;
+
+/// Per-device sensor layout selected by the `--profile` CLI flag.
+///
+/// - `Single`: one sensor per device of a chosen `SensorType`. Lowest
+///   cardinality; the right pick for throughput / latency benchmarks.
+/// - `Industrial`: five sensors per device on ids 0..4 — Temperature,
+///   Humidity, Pressure, Voltage, Current. Lights up every sensor-type
+///   panel in the operator dashboard.
+#[derive(ValueEnum, Clone, Copy, Debug)]
+pub enum SensorProfile {
+    Single,
+    Industrial,
+}
+
+/// A single emitter slot: the `(device, sensor, type)` triple plus the
+/// per-slot monotonic sequence counter that the simulator advances on every
+/// outgoing message.
+#[derive(Clone, Debug)]
+pub struct SensorSlot {
+    /// Device this slot reports for.
+    pub device_id: Uuid,
+    /// Sensor index on that device.
+    pub sensor_id: u16,
+    /// Kind of sensor; selects the waveform in `generate_value`.
+    pub sensor_type: SensorType,
+    /// Sequence number to stamp on the slot's next message; starts at 0.
+    pub seq: u32,
+}
+
+/// Expand a `(profile, num_devices)` choice into the flat list of slots
+/// the T1 emitter rotates through. Each device gets a fresh UUID.
+///
+/// `default_type` and `default_sensor_id` are used only by the `Single`
+/// profile; `Industrial` always emits its fixed five-sensor layout. The
+/// result is empty when `num_devices` is 0.
+pub fn build_slots(
+    profile: SensorProfile,
+    num_devices: u32,
+    default_type: SensorType,
+    default_sensor_id: u16,
+) -> Vec<SensorSlot> {
+    // Fixed `(sensor_id, sensor_type)` layout of one industrial device.
+    const INDUSTRIAL_LAYOUT: [(u16, SensorType); 5] = [
+        (0, SensorType::Temperature),
+        (1, SensorType::Humidity),
+        (2, SensorType::Pressure),
+        (3, SensorType::Voltage),
+        (4, SensorType::Current),
+    ];
+
+    // The final length is known up front, so allocate once.
+    let per_device = match profile {
+        SensorProfile::Single => 1,
+        SensorProfile::Industrial => INDUSTRIAL_LAYOUT.len(),
+    };
+    let mut slots = Vec::with_capacity((num_devices as usize).saturating_mul(per_device));
+
+    for _ in 0..num_devices {
+        let device_id = Uuid::new_v4();
+        match profile {
+            SensorProfile::Single => {
+                slots.push(SensorSlot {
+                    device_id,
+                    sensor_id: default_sensor_id,
+                    sensor_type: default_type,
+                    seq: 0,
+                });
+            }
+            SensorProfile::Industrial => {
+                slots.extend(INDUSTRIAL_LAYOUT.iter().map(|&(sensor_id, sensor_type)| {
+                    SensorSlot {
+                        device_id,
+                        sensor_id,
+                        sensor_type,
+                        seq: 0,
+                    }
+                }));
+            }
+        }
+    }
+    slots
+}
+
+/// Type-appropriate waveform so the dashboard has something believable to
+/// render. `seq` is the sample index — multiplying by 0.05 gives a
+/// "seconds-like" wall-clock pacing inside the trig functions regardless of
+/// the actual send rate, so panels animate over the same visible period.
+///
+/// Purely a function of `(t, seq)`: the same inputs always give the same
+/// value.
+pub fn generate_value(t: SensorType, seq: u32) -> f64 {
+    let t_phase = (seq as f64) * 0.05;
+    match t {
+        SensorType::Temperature => 20.0 + 5.0 * (t_phase / 10.0).sin(),
+        SensorType::Humidity => 50.0 + 20.0 * (t_phase / 15.0).sin(),
+        SensorType::Pressure => 1013.0 + 5.0 * (t_phase / 20.0).cos(),
+        SensorType::Voltage => 230.0 + 0.5 * (t_phase / 3.0).sin(),
+        SensorType::Current => 10.0 + 2.0 * (t_phase / 5.0).cos(),
+        SensorType::Generic => t_phase.sin(),
+    }
+}
diff --git a/simulator/tests/end_to_end_t1.rs b/simulator/tests/end_to_end_t1.rs
new file mode 100644
index 0000000..c64209f
--- /dev/null
+++ b/simulator/tests/end_to_end_t1.rs
@@ -0,0 +1,139 @@
+//! 
End-to-end T1 datagram test: spin up substrate's listener in-process with +//! channels the test owns, drive a `SimulatorClient` against it, and assert +//! the datagram lands in the T1 receiver decoded. +//! +//! Run with `cargo test -p simulator`. + +use std::net::SocketAddr; +use std::path::PathBuf; +use std::time::Duration; + +use anyhow::Result; +use simulator::client::SimulatorClient; +use substrate::config::QuicConfig; +use substrate::transport::server::{accept_loop, bind_endpoint}; +use substrate::transport::{QuicMessage, SensorType, T1Sender, T2Sender, T3Sender}; +use tokio::sync::mpsc; +use uuid::Uuid; + +fn cert_path(name: &str) -> PathBuf { + [env!("CARGO_MANIFEST_DIR"), "..", "certs", name].iter().collect() +} + +fn loopback_config(cert: PathBuf, key: PathBuf) -> QuicConfig { + QuicConfig { + // Port 0 lets the OS pick a free ephemeral port — tests can run in + // parallel without colliding on a fixed bind. + server_port: 0, + server_interface: "127.0.0.1".to_string(), + server_cert: cert.to_string_lossy().into_owned(), + server_key: key.to_string_lossy().into_owned(), + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn t1_datagram_decoded_into_ecs_channel() -> Result<()> { + simulator::install_crypto_provider(); + + let cert = cert_path("server.crt"); + let key = cert_path("server.key"); + let cfg = loopback_config(cert.clone(), key); + + // Bind the substrate's listener on an ephemeral port. + let endpoint = bind_endpoint(&cfg)?; + let server_addr: SocketAddr = endpoint.local_addr()?; + + // Channels the test owns — gives us direct visibility into what the T1 + // demux pushes into the ECS bridge. + let (t1_tx, mut t1_rx) = mpsc::channel(64); + let (t2_tx, _t2_rx) = mpsc::channel(64); + let (t3_tx, _t3_rx) = mpsc::channel(64); + + let server_task = tokio::spawn(accept_loop( + endpoint, + T1Sender::new(t1_tx), + T2Sender::new(t2_tx), + T3Sender::new(t3_tx), + )); + + // Connect a client and send one datagram. 
+ let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?; + + let sent = QuicMessage { + device_id: Uuid::from_u128(0xdead_beef_cafe_f00d_1234_5678_90ab_cdef), + sensor_id: 7, + raw_value: 42.0, + timestamp_us: 1_700_000_000_000_001, + sequence_number: 1, + sensor_type: SensorType::Temperature.as_u8(), + }; + client.send_datagram(&sent)?; + + // Wait for the substrate's read_datagrams reader to push it into T1. + let received = tokio::time::timeout(Duration::from_secs(2), t1_rx.recv()) + .await + .expect("did not observe T1 datagram within 2s") + .expect("T1 channel closed unexpectedly"); + + assert_eq!(received, sent); + + client.close().await; + server_task.abort(); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn t1_burst_preserves_order_and_count() -> Result<()> { + simulator::install_crypto_provider(); + + let cert = cert_path("server.crt"); + let key = cert_path("server.key"); + let cfg = loopback_config(cert.clone(), key); + + let endpoint = bind_endpoint(&cfg)?; + let server_addr: SocketAddr = endpoint.local_addr()?; + + // T1 capacity 64 ≥ burst size 32 so nothing is dropped under loopback. + let (t1_tx, mut t1_rx) = mpsc::channel(64); + let (t2_tx, _t2_rx) = mpsc::channel(8); + let (t3_tx, _t3_rx) = mpsc::channel(8); + + let server_task = tokio::spawn(accept_loop( + endpoint, + T1Sender::new(t1_tx), + T2Sender::new(t2_tx), + T3Sender::new(t3_tx), + )); + + let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?; + + let device = Uuid::from_u128(0xa1a2_a3a4_b5b6_b7b8_c9ca_cbcc_cdce_cfd0); + const BURST: u32 = 32; + for seq in 0..BURST { + let msg = QuicMessage { + device_id: device, + sensor_id: 0, + raw_value: f64::from(seq), + timestamp_us: 1_700_000_000_000_000 + u64::from(seq), + sequence_number: seq, + sensor_type: SensorType::Generic.as_u8(), + }; + client.send_datagram(&msg)?; + } + + // Drain BURST messages with a per-message timeout. 
Loopback shouldn't + // reorder QUIC datagrams within a single connection. + for expected_seq in 0..BURST { + let msg = tokio::time::timeout(Duration::from_secs(2), t1_rx.recv()) + .await + .unwrap_or_else(|_| panic!("missed datagram seq={expected_seq}")) + .expect("T1 channel closed"); + assert_eq!(msg.sequence_number, expected_seq); + assert_eq!(msg.device_id, device); + assert_eq!(msg.raw_value, f64::from(expected_seq)); + } + + client.close().await; + server_task.abort(); + Ok(()) +} diff --git a/simulator/tests/end_to_end_t2.rs b/simulator/tests/end_to_end_t2.rs new file mode 100644 index 0000000..881dabe --- /dev/null +++ b/simulator/tests/end_to_end_t2.rs @@ -0,0 +1,163 @@ +//! End-to-end T2 (unidirectional stream) tests. Mirrors the T1 harness: +//! spin up substrate's listener with channels owned by the test, drive a +//! `SimulatorClient` against it, assert what arrives on the T2 receiver. +//! +//! Run with `cargo test -p simulator`. + +use std::collections::HashMap; +use std::net::SocketAddr; +use std::path::PathBuf; +use std::time::Duration; + +use anyhow::Result; +use simulator::client::SimulatorClient; +use substrate::config::QuicConfig; +use substrate::transport::server::{accept_loop, bind_endpoint}; +use substrate::transport::{QuicMessage, SensorType, T1Sender, T2Sender, T3Sender}; +use tokio::sync::mpsc; +use uuid::Uuid; + +fn cert_path(name: &str) -> PathBuf { + [env!("CARGO_MANIFEST_DIR"), "..", "certs", name].iter().collect() +} + +fn loopback_config(cert: PathBuf, key: PathBuf) -> QuicConfig { + QuicConfig { + server_port: 0, + server_interface: "127.0.0.1".to_string(), + server_cert: cert.to_string_lossy().into_owned(), + server_key: key.to_string_lossy().into_owned(), + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn t2_single_stream_preserves_order() -> Result<()> { + simulator::install_crypto_provider(); + + let cert = cert_path("server.crt"); + let key = cert_path("server.key"); + let cfg = 
loopback_config(cert.clone(), key); + + let endpoint = bind_endpoint(&cfg)?; + let server_addr: SocketAddr = endpoint.local_addr()?; + + let (t1_tx, _t1_rx) = mpsc::channel(64); + let (t2_tx, mut t2_rx) = mpsc::channel(64); + let (t3_tx, _t3_rx) = mpsc::channel(64); + + let server_task = tokio::spawn(accept_loop( + endpoint, + T1Sender::new(t1_tx), + T2Sender::new(t2_tx), + T3Sender::new(t3_tx), + )); + + let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?; + + let device = Uuid::from_u128(0x0011_2233_4455_6677_8899_aabb_ccdd_eeff); + const N: u32 = 10; + let msgs: Vec = (0..N) + .map(|i| QuicMessage { + device_id: device, + sensor_id: 1, + raw_value: f64::from(i), + timestamp_us: 1_700_000_000_000_000 + u64::from(i), + sequence_number: i, + sensor_type: SensorType::Pressure.as_u8(), + }) + .collect(); + + client.send_uni_stream(&msgs).await?; + + for expected in &msgs { + let received = tokio::time::timeout(Duration::from_secs(2), t2_rx.recv()) + .await + .expect("missed T2 message") + .expect("T2 channel closed unexpectedly"); + assert_eq!(received, *expected); + } + + client.close().await; + server_task.abort(); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn t2_concurrent_streams_each_internally_ordered() -> Result<()> { + simulator::install_crypto_provider(); + + let cert = cert_path("server.crt"); + let key = cert_path("server.key"); + let cfg = loopback_config(cert.clone(), key); + + let endpoint = bind_endpoint(&cfg)?; + let server_addr: SocketAddr = endpoint.local_addr()?; + + let (t1_tx, _t1_rx) = mpsc::channel(64); + let (t2_tx, mut t2_rx) = mpsc::channel(256); + let (t3_tx, _t3_rx) = mpsc::channel(64); + + let server_task = tokio::spawn(accept_loop( + endpoint, + T1Sender::new(t1_tx), + T2Sender::new(t2_tx), + T3Sender::new(t3_tx), + )); + + let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?; + + // 4 devices × 8 messages each on independent uni streams. 
Cross-stream + // ordering may interleave; per-stream ordering must be strict. + const DEVICES: usize = 4; + const PER_DEVICE: u32 = 8; + let device_ids: Vec = (0..DEVICES).map(|_| Uuid::new_v4()).collect(); + + let mut handles = Vec::with_capacity(DEVICES); + for &device in &device_ids { + let conn = client.conn.clone(); + handles.push(tokio::spawn(async move { + let msgs: Vec = (0..PER_DEVICE) + .map(|i| QuicMessage { + device_id: device, + sensor_id: 0, + raw_value: f64::from(i), + timestamp_us: 1_700_000_000_000_000 + u64::from(i), + sequence_number: i, + sensor_type: SensorType::Generic.as_u8(), + }) + .collect(); + // Use the connection directly so each task owns its own stream + // — same wire pattern as `SimulatorClient::send_uni_stream`. + let mut send = conn.open_uni().await.expect("open_uni"); + for m in &msgs { + send.write_all(&m.to_bytes()).await.expect("write_all"); + } + send.finish().expect("finish"); + })); + } + for h in handles { + h.await?; + } + + // Drain DEVICES × PER_DEVICE messages, group by device, assert per-device + // sequence numbers are strictly increasing from 0. + let total = DEVICES * PER_DEVICE as usize; + let mut by_device: HashMap> = HashMap::new(); + for _ in 0..total { + let msg = tokio::time::timeout(Duration::from_secs(2), t2_rx.recv()) + .await + .expect("missed T2 message") + .expect("T2 channel closed unexpectedly"); + by_device.entry(msg.device_id).or_default().push(msg.sequence_number); + } + + assert_eq!(by_device.len(), DEVICES, "expected one entry per device"); + for (dev, seqs) in &by_device { + let expected: Vec = (0..PER_DEVICE).collect(); + assert_eq!(seqs, &expected, "out-of-order or missing sequence for {dev}"); + } + + client.close().await; + server_task.abort(); + Ok(()) +} diff --git a/simulator/tests/end_to_end_t3.rs b/simulator/tests/end_to_end_t3.rs new file mode 100644 index 0000000..d86e6e0 --- /dev/null +++ b/simulator/tests/end_to_end_t3.rs @@ -0,0 +1,152 @@ +//! 
End-to-end T3 (bidirectional stream + oneshot ack) tests. Same shape as +//! the T1/T2 harnesses: spin up substrate's listener with channels owned by +//! the test, run a "fake ECS" task that drains the T3 receiver and either +//! replies or drops the oneshot, and assert the client observes the right +//! behaviour. +//! +//! Run with `cargo test -p simulator`. + +use std::net::SocketAddr; +use std::path::PathBuf; +use std::time::Duration; + +use anyhow::Result; +use simulator::client::SimulatorClient; +use substrate::config::QuicConfig; +use substrate::transport::server::{accept_loop, bind_endpoint}; +use substrate::transport::{QuicMessage, SensorType, T1Sender, T2Sender, T3Sender}; +use tokio::sync::mpsc; +use uuid::Uuid; + +fn cert_path(name: &str) -> PathBuf { + [env!("CARGO_MANIFEST_DIR"), "..", "certs", name].iter().collect() +} + +fn loopback_config(cert: PathBuf, key: PathBuf) -> QuicConfig { + QuicConfig { + server_port: 0, + server_interface: "127.0.0.1".to_string(), + server_cert: cert.to_string_lossy().into_owned(), + server_key: key.to_string_lossy().into_owned(), + } +} + +/// Marker `timestamp_us` the fake ECS stamps onto every ack so the test can +/// distinguish a real reply from any echo of the command's own timestamp. 
+const ACK_MARKER_TS: u64 = 999_999_999_999; + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn t3_round_trip_with_fake_handler() -> Result<()> { + simulator::install_crypto_provider(); + + let cert = cert_path("server.crt"); + let key = cert_path("server.key"); + let cfg = loopback_config(cert.clone(), key); + + let endpoint = bind_endpoint(&cfg)?; + let server_addr: SocketAddr = endpoint.local_addr()?; + + let (t1_tx, _t1_rx) = mpsc::channel(64); + let (t2_tx, _t2_rx) = mpsc::channel(64); + let (t3_tx, mut t3_rx) = mpsc::channel(64); + + let server_task = tokio::spawn(accept_loop( + endpoint, + T1Sender::new(t1_tx), + T2Sender::new(t2_tx), + T3Sender::new(t3_tx), + )); + + // Fake ECS handler: drain T3 inbounds, mark the timestamp, send back. + let handler = tokio::spawn(async move { + while let Some(inbound) = t3_rx.recv().await { + let mut ack = inbound.command; + ack.timestamp_us = ACK_MARKER_TS; + // Ignore send error (client may have disconnected before listening). 
+ let _ = inbound.reply.send(ack); + } + }); + + let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?; + + let cmd = QuicMessage { + device_id: Uuid::from_u128(0xa5a5_a5a5_5a5a_5a5a_a5a5_5a5a_a5a5_5a5a), + sensor_id: 3, + raw_value: 1.5, + timestamp_us: 1_700_000_000_000_000, + sequence_number: 7, + sensor_type: SensorType::Voltage.as_u8(), + }; + + let ack = tokio::time::timeout(Duration::from_secs(2), client.request(&cmd)) + .await + .expect("T3 ack timed out")?; + + assert_eq!(ack.device_id, cmd.device_id, "ack should preserve device_id"); + assert_eq!(ack.sensor_id, cmd.sensor_id, "ack should preserve sensor_id"); + assert_eq!( + ack.sequence_number, cmd.sequence_number, + "ack should preserve sequence_number for correlation" + ); + assert_eq!(ack.timestamp_us, ACK_MARKER_TS, "fake ECS should stamp the marker"); + + client.close().await; + handler.abort(); + server_task.abort(); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn t3_no_handler_resets_stream() -> Result<()> { + simulator::install_crypto_provider(); + + let cert = cert_path("server.crt"); + let key = cert_path("server.key"); + let cfg = loopback_config(cert.clone(), key); + + let endpoint = bind_endpoint(&cfg)?; + let server_addr: SocketAddr = endpoint.local_addr()?; + + let (t1_tx, _t1_rx) = mpsc::channel(64); + let (t2_tx, _t2_rx) = mpsc::channel(64); + let (t3_tx, mut t3_rx) = mpsc::channel(64); + + let server_task = tokio::spawn(accept_loop( + endpoint, + T1Sender::new(t1_tx), + T2Sender::new(t2_tx), + T3Sender::new(t3_tx), + )); + + // Fake ECS that *drops* every oneshot — simulates "no handler installed", + // which is the placeholder state in `ingest_system` until M4 lands. 
+ let handler = tokio::spawn(async move { + while let Some(inbound) = t3_rx.recv().await { + drop(inbound); + } + }); + + let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?; + + let cmd = QuicMessage { + device_id: Uuid::new_v4(), + sensor_id: 0, + raw_value: 0.0, + timestamp_us: 0, + sequence_number: 0, + sensor_type: SensorType::Generic.as_u8(), + }; + + let result = tokio::time::timeout(Duration::from_secs(2), client.request(&cmd)).await; + let inner = result.expect("client.request should not hang when stream is reset"); + assert!( + inner.is_err(), + "expected request to fail when substrate resets the stream, got Ok({:?})", + inner.ok() + ); + + client.close().await; + handler.abort(); + server_task.abort(); + Ok(()) +} diff --git a/substrate/Cargo.toml b/substrate/Cargo.toml index f68e00a..4285065 100644 --- a/substrate/Cargo.toml +++ b/substrate/Cargo.toml @@ -11,7 +11,12 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } quinn = { version = "0.11" } rustls = { version = "0.23" } +rustls-pemfile = "2" +rustls-pki-types = "1" tokio = { version = "1", features = ["full"] } uuid = { version = "1.23", features = ["v4"] } figment = { version = "0.10", features = ["toml", "env"] } -serde = { version = "1", features = ["derive"] } \ No newline at end of file +serde = { version = "1", features = ["derive"] } +metrics = "0.24" +metrics-exporter-prometheus = "0.17" +memory-stats = "1" \ No newline at end of file diff --git a/substrate/src/config.rs b/substrate/src/config.rs index 2976fd2..9972982 100644 --- a/substrate/src/config.rs +++ b/substrate/src/config.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; pub struct AppConfig { pub network: QuicConfig, pub simulation: SimulationConfig, + pub observability: ObservabilityConfig, } #[derive(Debug, Serialize, Deserialize)] @@ -23,6 +24,15 @@ pub struct QuicConfig { pub server_key: String, } +#[derive(Debug, Serialize, Deserialize)] +pub struct 
ObservabilityConfig { + /// When true, install the Prometheus exporter at startup. Disable for + /// environments where the metrics port collides or scraping is undesired. + pub metrics_enabled: bool, + /// Bind address for the `/metrics` HTTP listener. + pub metrics_listen: String, +} + impl Default for AppConfig { fn default() -> Self { Self { @@ -36,6 +46,10 @@ impl Default for AppConfig { tick_rate_hz: 60, max_entities: 10000, }, + observability: ObservabilityConfig { + metrics_enabled: true, + metrics_listen: "0.0.0.0:9100".to_string(), + }, } } } diff --git a/substrate/src/lib.rs b/substrate/src/lib.rs new file mode 100644 index 0000000..424c353 --- /dev/null +++ b/substrate/src/lib.rs @@ -0,0 +1,4 @@ +pub mod config; +pub mod observability; +pub mod transport; +pub mod world; diff --git a/substrate/src/main.rs b/substrate/src/main.rs index 362db06..b6f001e 100644 --- a/substrate/src/main.rs +++ b/substrate/src/main.rs @@ -1,9 +1,10 @@ -mod transport; -mod config; - use bevy::prelude::*; use tracing_subscriber::EnvFilter; -use crate::config::AppConfig; + +use substrate::config::AppConfig; +use substrate::observability::ObservabilityPlugin; +use substrate::transport; +use substrate::world::WorldPlugin; fn main() { tracing_subscriber::fmt() @@ -12,12 +13,22 @@ fn main() { ) .init(); + // rustls 0.23 requires an explicit default crypto provider. Quinn's + // ServerConfig::with_single_cert otherwise panics at first use. + rustls::crypto::aws_lc_rs::default_provider() + .install_default() + .expect("install rustls default crypto provider"); + let config = AppConfig::load("config.toml").expect("Failed to load config"); tracing::info!(?config, "substrate starting"); + // Plugin order matters: EcsQuicTransportPlugin inserts the TokioHandle + // resource ObservabilityPlugin reads in its `build()`. 
App::new() .insert_resource(config) .add_plugins(MinimalPlugins) - .add_plugins(transport::ecs::EcsQuicTransportPlugin {}) + .add_plugins(transport::ecs::EcsQuicTransportPlugin) + .add_plugins(WorldPlugin) + .add_plugins(ObservabilityPlugin) .run(); } diff --git a/substrate/src/observability.rs b/substrate/src/observability.rs new file mode 100644 index 0000000..e87e080 --- /dev/null +++ b/substrate/src/observability.rs @@ -0,0 +1,116 @@ +//! M5 — Prometheus-format `/metrics` exporter installation and counter +//! pre-registration. +//! +//! Counters and histograms are emitted from the demux path +//! ([`crate::transport::server`]) and the world systems +//! ([`crate::world::ingest_system`], [`crate::world::simulation_system`], +//! [`crate::world::export_system`]). This module's only job is: +//! +//! 1. Install the global metrics recorder + HTTP listener on the existing +//! tokio runtime, once at startup. +//! 2. Pre-register every counter at value 0 so panels render "0" rather than +//! "No data" before the first event of a given kind fires. +//! +//! ## Runtime telemetry +//! +//! - `substrate_received_total{tier=t1|t2|t3}` — counter +//! - `substrate_dropped_total{tier=t1}` — counter (T1 lossy) +//! - `substrate_decode_errors_total{tier=t1|t2|t3}` — counter +//! - `substrate_t3_no_handler_total` — counter +//! - `substrate_latency_us{tier=t1|t2|t3}` — histogram +//! - `substrate_tick_hz` — gauge +//! - `substrate_entities` — gauge +//! - `substrate_channel_depth{tier=t1|t2|t3}` — gauge +//! - `substrate_channel_capacity{tier=t1|t2|t3}` — gauge +//! - `substrate_rss_bytes` — gauge +//! +//! ## Digital-twin surface (operator dashboard) +//! +//! - `sensor_aggregate{type=…, stat=count|mean|min|max}` — gauge +//! 
- `substrate_threshold_crossings_total{type, direction}` — counter + +use std::net::SocketAddr; + +use bevy::prelude::*; +use metrics::counter; +use metrics_exporter_prometheus::PrometheusBuilder; + +use crate::config::AppConfig; +use crate::transport::SensorType; +use crate::transport::ecs::TokioHandle; + +pub struct ObservabilityPlugin; + +impl Plugin for ObservabilityPlugin { + fn build(&self, app: &mut App) { + let config = app + .world() + .get_resource::() + .expect("AppConfig must be inserted before ObservabilityPlugin"); + + if !config.observability.metrics_enabled { + tracing::info!("metrics exporter disabled by config"); + return; + } + + let listen: SocketAddr = config + .observability + .metrics_listen + .parse() + .expect("invalid metrics_listen address in config"); + + let runtime_handle = app + .world() + .get_resource::() + .expect("TokioHandle must be inserted before ObservabilityPlugin (load order: transport plugin first)") + .0 + .clone(); + + // PrometheusBuilder::install spawns the HTTP listener via tokio::spawn, + // which requires being inside a runtime context. + let _guard = runtime_handle.enter(); + PrometheusBuilder::new() + .with_http_listener(listen) + .install() + .expect("install prometheus exporter"); + drop(_guard); + + tracing::info!(?listen, "metrics exporter installed"); + + pre_register_counters(); + } +} + +/// Pre-register every counter at value 0 so Grafana sees a series to plot +/// even before the first event of that kind. Without this, the Prometheus +/// exporter omits any counter that has never been incremented, and panels +/// render "No data" — confusing when the metric exists, the counter is just +/// genuinely zero (e.g., `substrate_t3_no_handler_total` in normal operation). 
+fn pre_register_counters() { + for tier in ["t1", "t2", "t3"] { + counter!("substrate_received_total", "tier" => tier).increment(0); + counter!("substrate_decode_errors_total", "tier" => tier).increment(0); + } + counter!("substrate_dropped_total", "tier" => "t1").increment(0); + counter!("substrate_t3_no_handler_total").increment(0); + + // Threshold crossings — bounded `|SensorType| × 2` cardinality, all + // pre-registered so dashboard panels show "0" instead of "No data". + for t in [ + SensorType::Generic, + SensorType::Temperature, + SensorType::Humidity, + SensorType::Pressure, + SensorType::Voltage, + SensorType::Current, + ] { + for direction in ["up", "down"] { + counter!( + "substrate_threshold_crossings_total", + "type" => t.label_str(), + "direction" => direction + ) + .increment(0); + } + } +} diff --git a/substrate/src/transport/ecs.rs b/substrate/src/transport/ecs.rs index ba38fdc..72e295e 100644 --- a/substrate/src/transport/ecs.rs +++ b/substrate/src/transport/ecs.rs @@ -1,68 +1,111 @@ use std::sync::Mutex; + use bevy::prelude::*; +use bevy::state::app::StatesPlugin; +use tokio::runtime::Handle; use tokio::sync::mpsc; -use crate::transport::QuicMessage; -use crate::transport::server::run_substrate_server; + +use crate::config::AppConfig; +use crate::transport::{QuicMessage, T1Sender, T2Sender, T3Inbound, T3Sender}; +use crate::transport::server::{accept_loop, bind_endpoint}; +use crate::transport::state::ServerState; const T1_CAPACITY: usize = 1024; const T2_CAPACITY: usize = 512; const T3_CAPACITY: usize = 256; -pub struct EcsQuicTransportPlugin{} +pub struct EcsQuicTransportPlugin; +/// Receive halves of the three tier channels, wrapped so they can sit in a +/// Bevy `Resource`. The `world` module's ingest system is the sole reader. 
#[derive(Resource)] -struct BridgeReceivers { - t1: Mutex>, - t2: Mutex>, - t3: Mutex>, +pub(crate) struct BridgeReceivers { + pub(crate) t1: Mutex>, + pub(crate) t2: Mutex>, + pub(crate) t3: Mutex>, } -fn ingest_system(bridge: Res){ - let mut t1 = bridge.t1.lock().unwrap(); - // Tier 1: drain up to N messages, drop the rest - for _ in 0..T1_CAPACITY { - match t1.try_recv() { - Ok(msg) => { /* write RawSensorData */ } - Err(_) => break, - } - } - - // T2/T3: drain completely, these are low volume - let mut t2 = bridge.t2.lock().unwrap(); - while let Ok(msg) = t2.try_recv() { /* ... */ } - - let mut t3 = bridge.t3.lock().unwrap(); - while let Ok(msg) = t3.try_recv() { /* ... */ } +#[derive(Resource, Clone)] +pub(crate) struct BridgeSenders { + pub(crate) t1: T1Sender, + pub(crate) t2: T2Sender, + pub(crate) t3: T3Sender, } -impl Plugin for EcsQuicTransportPlugin{ +#[derive(Resource, Clone)] +pub(crate) struct TokioHandle(pub(crate) Handle); + +/// Bring up the QUIC listener using the loaded `AppConfig` and transition to +/// `ServerState::Started`. Runs once via `OnEnter(ServerState::Starting)`. +fn start_quic_server( + config: Res, + senders: Res, + runtime: Res, + mut next: ResMut>, +) { + tracing::info!("entering ServerState::Starting — bringing up QUIC listener"); + + // `Endpoint::server` is sync but needs a tokio runtime context for + // `Handle::current()`; entering the runtime is enough — no async block + // required. 
+ let _guard = runtime.0.enter(); + let endpoint = bind_endpoint(&config.network).expect("failed to bind QUIC endpoint"); + drop(_guard); + + tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC listener bound"); + + let s = senders.clone(); + runtime.0.spawn(accept_loop(endpoint, s.t1, s.t2, s.t3)); + + next.set(ServerState::Started); + tracing::info!("ServerState::Started"); +} + +impl Plugin for EcsQuicTransportPlugin { fn build(&self, app: &mut App) { - // Create the channels for multi-thread communication - let (t1_tx, t1_rx) = - mpsc::channel::(T1_CAPACITY); - let (t2_tx, t2_rx) = - mpsc::channel::(T2_CAPACITY); - let (t3_tx, t3_rx) = - mpsc::channel::(T3_CAPACITY); + // Three-tier bridge between the tokio-side QUIC accept loop and the + // ECS PreUpdate ingest system (in the `world` module). + let (t1_tx, t1_rx) = mpsc::channel::(T1_CAPACITY); + let (t2_tx, t2_rx) = mpsc::channel::(T2_CAPACITY); + let (t3_tx, t3_rx) = mpsc::channel::(T3_CAPACITY); - let quic_handle = std::thread::spawn(move || { - let rt = tokio::runtime::Builder::new_multi_thread() - .worker_threads(2) - .enable_all() - .build() - .unwrap(); + // Spawn a tokio runtime on a dedicated OS thread, ship its Handle back + // to the ECS, and keep the runtime alive for the lifetime of the app + // by parking on `pending()`. 
+ let (handle_tx, handle_rx) = std::sync::mpsc::sync_channel::(1); + std::thread::Builder::new() + .name("quic-runtime".to_string()) + .spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .thread_name("quic-worker") + .build() + .expect("build tokio runtime"); + handle_tx + .send(rt.handle().clone()) + .expect("send tokio Handle to ECS"); + rt.block_on(std::future::pending::<()>()); + }) + .expect("spawn quic-runtime thread"); - rt.block_on(async move { - run_substrate_server(t1_tx, t2_tx, t3_tx).await; - }); - }); + let handle = handle_rx.recv().expect("receive tokio Handle"); - app.insert_resource(BridgeReceivers { - t1: Mutex::new(t1_rx), - t2: Mutex::new(t2_rx), - t3: Mutex::new(t3_rx), - }); - - app.add_systems(PreUpdate, ingest_system); + // Bevy 0.18 split state machinery into its own plugin; under + // MinimalPlugins it isn't installed by default. + app.add_plugins(StatesPlugin) + .init_state::() + .insert_resource(TokioHandle(handle)) + .insert_resource(BridgeSenders { + t1: T1Sender::new(t1_tx), + t2: T2Sender::new(t2_tx), + t3: T3Sender::new(t3_tx), + }) + .insert_resource(BridgeReceivers { + t1: Mutex::new(t1_rx), + t2: Mutex::new(t2_rx), + t3: Mutex::new(t3_rx), + }) + .add_systems(OnEnter(ServerState::Starting), start_quic_server); } -} \ No newline at end of file +} diff --git a/substrate/src/transport/mod.rs b/substrate/src/transport/mod.rs index e008371..5536015 100644 --- a/substrate/src/transport/mod.rs +++ b/substrate/src/transport/mod.rs @@ -1,27 +1,100 @@ pub mod ecs; -mod server; +pub mod server; +pub mod state; -/// One sensor sample on the wire. +use tokio::sync::{mpsc, oneshot}; + +/// Logical type of a sensor reading. Travels in `QuicMessage::sensor_type` +/// so the substrate (and any downstream dashboard) knows which units / range +/// / visualisation applies to the `raw_value`. 
/// -/// Fixed 38-byte little-endian layout — same on x86_64 and aarch64 (the two +/// Forward compat: unknown discriminants decode as `Generic`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +#[repr(u8)] +pub enum SensorType { + #[default] + Generic = 0, + Temperature = 1, + Humidity = 2, + Pressure = 3, + Voltage = 4, + Current = 5, +} + +impl SensorType { + pub fn from_u8(b: u8) -> Self { + match b { + 1 => Self::Temperature, + 2 => Self::Humidity, + 3 => Self::Pressure, + 4 => Self::Voltage, + 5 => Self::Current, + _ => Self::Generic, + } + } + + pub fn as_u8(self) -> u8 { + self as u8 + } + + /// Lowercase label used as a Prometheus label value. + pub fn label_str(self) -> &'static str { + match self { + Self::Generic => "generic", + Self::Temperature => "temperature", + Self::Humidity => "humidity", + Self::Pressure => "pressure", + Self::Voltage => "voltage", + Self::Current => "current", + } + } + + /// SI / engineering unit string for Grafana axis labels. + pub fn unit_str(self) -> &'static str { + match self { + Self::Generic => "", + Self::Temperature => "°C", + Self::Humidity => "%", + Self::Pressure => "hPa", + Self::Voltage => "V", + Self::Current => "A", + } + } +} + +/// One sample (T1/T2 sensor reading or T3 actuator command/ack) on the wire. +/// +/// Fixed 39-byte little-endian layout — same on x86_64 and aarch64 (the two /// evaluation hosts), so encode/decode is effectively a memcpy. /// /// ```text /// offset size field /// ------ ---- -------------------------- /// 0 16 device_id (UUID) -/// 16 2 data_stream_id (u16) +/// 16 2 sensor_id (u16) /// 18 8 raw_value (f64) /// 26 8 timestamp_us (u64) /// 34 4 sequence_number (u32) +/// 38 1 sensor_type (u8 — `SensorType` discriminant) /// ``` +/// +/// Field semantics: +/// - `device_id` — UUID of the originating device (or target, for T3 commands). +/// - `sensor_id` — logical sensor/actuator on that device (per-device index). 
+/// - `raw_value` — sensor reading (T1/T2) or actuator setpoint/feedback (T3). +/// - `timestamp_us` — capture time on the device clock for T1/T2; server-side +/// ack time on T3 replies. +/// - `sequence_number` — monotonic counter per `(device_id, sensor_id)` for +/// T1/T2; correlation id linking T3 command and ack. +/// - `sensor_type` — `SensorType` discriminant, decoded via `SensorType::from_u8`. #[derive(Debug, Clone, Default, Copy, PartialEq)] pub struct QuicMessage { pub device_id: uuid::Uuid, - pub data_stream_id: u16, + pub sensor_id: u16, pub raw_value: f64, pub timestamp_us: u64, pub sequence_number: u32, + pub sensor_type: u8, } #[derive(Debug, thiserror::Error)] @@ -32,7 +105,7 @@ pub enum WireError { impl QuicMessage { /// Bytes on the wire — fixed-size, no length prefix. - pub const WIRE_SIZE: usize = 38; + pub const WIRE_SIZE: usize = 39; pub fn encode_to(&self, buf: &mut [u8]) -> Result<(), WireError> { if buf.len() != Self::WIRE_SIZE { @@ -42,10 +115,11 @@ impl QuicMessage { }); } buf[0..16].copy_from_slice(self.device_id.as_bytes()); - buf[16..18].copy_from_slice(&self.data_stream_id.to_le_bytes()); + buf[16..18].copy_from_slice(&self.sensor_id.to_le_bytes()); buf[18..26].copy_from_slice(&self.raw_value.to_le_bytes()); buf[26..34].copy_from_slice(&self.timestamp_us.to_le_bytes()); buf[34..38].copy_from_slice(&self.sequence_number.to_le_bytes()); + buf[38] = self.sensor_type; Ok(()) } @@ -66,12 +140,113 @@ impl QuicMessage { id_bytes.copy_from_slice(&buf[0..16]); Ok(Self { device_id: uuid::Uuid::from_bytes(id_bytes), - data_stream_id: u16::from_le_bytes(buf[16..18].try_into().unwrap()), + sensor_id: u16::from_le_bytes(buf[16..18].try_into().unwrap()), raw_value: f64::from_le_bytes(buf[18..26].try_into().unwrap()), timestamp_us: u64::from_le_bytes(buf[26..34].try_into().unwrap()), sequence_number: u32::from_le_bytes(buf[34..38].try_into().unwrap()), + sensor_type: buf[38], }) } + + /// Convenience accessor — decodes `sensor_type` to the typed 
enum. + pub fn typ(&self) -> SensorType { + SensorType::from_u8(self.sensor_type) + } +} + +// --- Per-tier bridge senders ----------------------------------------------- +// +// Three newtypes encode the paper's tier semantics into the type system so +// the demux can't mix them up: +// +// * T1 (datagrams) — lossy; `try_send` drops on full +// * T2 (uni streams) — reliable, ordered; `send().await` backpressures +// * T3 (bi streams) — reliable command + per-command oneshot reply + +/// Tier 1 — high-frequency telemetry over QUIC datagrams. Full channel drops. +#[derive(Clone)] +pub struct T1Sender { + inner: mpsc::Sender, +} + +impl T1Sender { + pub fn new(inner: mpsc::Sender) -> Self { + Self { inner } + } + + /// Returns `true` if queued, `false` if dropped (channel full or closed). + pub fn send_lossy(&self, msg: QuicMessage) -> bool { + self.inner.try_send(msg).is_ok() + } + + /// Currently queued messages — used for channel-depth gauges. + pub fn depth(&self) -> usize { + self.inner.max_capacity().saturating_sub(self.inner.capacity()) + } + + pub fn capacity(&self) -> usize { + self.inner.max_capacity() + } +} + +/// Tier 2 — ordered events over a QUIC unidirectional stream. Awaits on full. +#[derive(Clone)] +pub struct T2Sender { + inner: mpsc::Sender, +} + +impl T2Sender { + pub fn new(inner: mpsc::Sender) -> Self { + Self { inner } + } + + pub async fn send( + &self, + msg: QuicMessage, + ) -> Result<(), mpsc::error::SendError> { + self.inner.send(msg).await + } + + pub fn depth(&self) -> usize { + self.inner.max_capacity().saturating_sub(self.inner.capacity()) + } + + pub fn capacity(&self) -> usize { + self.inner.max_capacity() + } +} + +/// Tier 3 — actuator command on a QUIC bidirectional stream, paired with a +/// `oneshot` channel the ECS uses to write the ack back over the same stream. 
+pub struct T3Inbound { + pub command: QuicMessage, + pub reply: oneshot::Sender, +} + +#[derive(Clone)] +pub struct T3Sender { + inner: mpsc::Sender, +} + +impl T3Sender { + pub fn new(inner: mpsc::Sender) -> Self { + Self { inner } + } + + pub async fn send( + &self, + inbound: T3Inbound, + ) -> Result<(), mpsc::error::SendError> { + self.inner.send(inbound).await + } + + pub fn depth(&self) -> usize { + self.inner.max_capacity().saturating_sub(self.inner.capacity()) + } + + pub fn capacity(&self) -> usize { + self.inner.max_capacity() + } } #[cfg(test)] @@ -80,33 +255,35 @@ mod tests { #[test] fn wire_size_matches_fields() { - assert_eq!(QuicMessage::WIRE_SIZE, 16 + 2 + 8 + 8 + 4); + assert_eq!(QuicMessage::WIRE_SIZE, 16 + 2 + 8 + 8 + 4 + 1); } #[test] fn roundtrip_preserves_all_fields() { let msg = QuicMessage { device_id: uuid::Uuid::from_u128(0x0123456789abcdef_fedcba9876543210), - data_stream_id: 0xBEEF, + sensor_id: 0xBEEF, raw_value: -273.15, timestamp_us: 1_700_000_000_000_001, sequence_number: 42, + sensor_type: SensorType::Temperature.as_u8(), }; let bytes = msg.to_bytes(); assert_eq!(bytes.len(), QuicMessage::WIRE_SIZE); let decoded = QuicMessage::decode(&bytes).unwrap(); assert_eq!(msg, decoded); + assert_eq!(decoded.typ(), SensorType::Temperature); } #[test] fn decode_rejects_wrong_length() { assert!(matches!( - QuicMessage::decode(&[0u8; 37]), - Err(WireError::BadLength { expected: 38, got: 37 }) + QuicMessage::decode(&[0u8; 38]), + Err(WireError::BadLength { expected: 39, got: 38 }) )); assert!(matches!( - QuicMessage::decode(&[0u8; 39]), - Err(WireError::BadLength { expected: 38, got: 39 }) + QuicMessage::decode(&[0u8; 40]), + Err(WireError::BadLength { expected: 39, got: 40 }) )); } @@ -114,13 +291,22 @@ mod tests { fn encode_layout_is_little_endian() { let msg = QuicMessage { device_id: uuid::Uuid::nil(), - data_stream_id: 0x0102, + sensor_id: 0x0102, raw_value: 0.0, timestamp_us: 0, sequence_number: 0x04030201, + sensor_type: 
SensorType::Humidity.as_u8(), }; let bytes = msg.to_bytes(); assert_eq!(&bytes[16..18], &[0x02, 0x01]); assert_eq!(&bytes[34..38], &[0x01, 0x02, 0x03, 0x04]); + assert_eq!(bytes[38], SensorType::Humidity.as_u8()); + } + + #[test] + fn unknown_sensor_type_decodes_as_generic() { + assert_eq!(SensorType::from_u8(0), SensorType::Generic); + assert_eq!(SensorType::from_u8(99), SensorType::Generic); + assert_eq!(SensorType::from_u8(255), SensorType::Generic); } } diff --git a/substrate/src/transport/server.rs b/substrate/src/transport/server.rs index 99d000f..770fdb7 100644 --- a/substrate/src/transport/server.rs +++ b/substrate/src/transport/server.rs @@ -1,8 +1,350 @@ -use tokio::sync::mpsc::Sender; -use crate::transport::QuicMessage; +use std::net::SocketAddr; +use std::sync::Arc; -pub async fn run_substrate_server(t1_tx: Sender, - t2_tx: Sender, - t3_tx: Sender) { - -} \ No newline at end of file +use anyhow::{Context, anyhow}; +use metrics::counter; +use quinn::{ + Connection, Endpoint, Incoming, RecvStream, SendStream, ServerConfig, StreamId, TransportConfig, +}; +use rustls_pki_types::{CertificateDer, PrivateKeyDer}; +use tokio::sync::oneshot; + +use crate::config::QuicConfig; +use crate::transport::{QuicMessage, T1Sender, T2Sender, T3Inbound, T3Sender}; + +/// Datagram receive buffer in bytes. Sized to absorb microbursts at the +/// telemetry rates. +const DATAGRAM_RECV_BUFFER_BYTES: usize = 256 * 1024; + +/// Load the cert chain + private key from disk and build a Quinn `ServerConfig`. 
+pub fn build_server_config(cfg: &QuicConfig) -> anyhow::Result { + let cert_pem = std::fs::read(&cfg.server_cert) + .with_context(|| format!("read server_cert at {}", cfg.server_cert))?; + let key_pem = std::fs::read(&cfg.server_key) + .with_context(|| format!("read server_key at {}", cfg.server_key))?; + + let certs: Vec> = rustls_pemfile::certs(&mut cert_pem.as_slice()) + .collect::>() + .with_context(|| format!("parse PEM certs at {}", cfg.server_cert))?; + if certs.is_empty() { + return Err(anyhow!("no certificates found in {}", cfg.server_cert)); + } + + let key: PrivateKeyDer<'static> = rustls_pemfile::private_key(&mut key_pem.as_slice()) + .with_context(|| format!("parse PEM key at {}", cfg.server_key))? + .ok_or_else(|| anyhow!("no private key found in {}", cfg.server_key))?; + + let mut server_config = + ServerConfig::with_single_cert(certs, key).context("build Quinn ServerConfig")?; + + // Explicit transport config so the values driving evaluation are visible + // in source and at startup, not buried in Quinn's defaults. + let mut transport = TransportConfig::default(); + transport.datagram_receive_buffer_size(Some(DATAGRAM_RECV_BUFFER_BYTES)); + server_config.transport = Arc::new(transport); + + tracing::info!( + datagram_recv_buffer_bytes = DATAGRAM_RECV_BUFFER_BYTES, + "Quinn TransportConfig tuned" + ); + + Ok(server_config) +} + +/// Bind the listener. Must be called from inside a tokio runtime context +/// (Quinn relies on `Handle::current()` internally). 
+pub fn bind_endpoint(cfg: &QuicConfig) -> anyhow::Result { + let server_config = build_server_config(cfg)?; + let addr: SocketAddr = format!("{}:{}", cfg.server_interface, cfg.server_port) + .parse() + .with_context(|| { + format!( + "invalid bind address {}:{}", + cfg.server_interface, cfg.server_port + ) + })?; + Endpoint::server(server_config, addr).context("Endpoint::server bind") +} + +/// Accept loop: per-connection senders are cloned from the tier handles and +/// shipped into `handle_incoming` for orchestration. +pub async fn accept_loop(endpoint: Endpoint, t1: T1Sender, t2: T2Sender, t3: T3Sender) { + tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC accept loop running"); + while let Some(incoming) = endpoint.accept().await { + let t1 = t1.clone(); + let t2 = t2.clone(); + let t3 = t3.clone(); + tokio::spawn(handle_incoming(incoming, t1, t2, t3)); + } + tracing::info!("QUIC accept loop exited"); +} + +/// Per-connection orchestrator. Performs the handshake and spawns one reader +/// per tier, then waits for the connection to close and joins the readers. +async fn handle_incoming(incoming: Incoming, t1: T1Sender, t2: T2Sender, t3: T3Sender) { + let conn = match incoming.await { + Ok(c) => c, + Err(e) => { + tracing::warn!(error = %e, "handshake failed"); + return; + } + }; + let remote = conn.remote_address(); + tracing::info!(?remote, "connection established"); + + // One task per tier — fully wired across T1/T2/T3. 
+ let dgram_task = tokio::spawn(read_datagrams(conn.clone(), t1)); + let uni_task = tokio::spawn(read_uni_streams(conn.clone(), t2)); + let bi_task = tokio::spawn(accept_bi_streams(conn.clone(), t3)); + + let _ = conn.closed().await; + + if let Err(e) = dgram_task.await { + tracing::warn!(?remote, error = %e, "T1 datagram task ended unexpectedly"); + } + if let Err(e) = uni_task.await { + tracing::warn!(?remote, error = %e, "T2 uni stream task ended unexpectedly"); + } + if let Err(e) = bi_task.await { + tracing::warn!(?remote, error = %e, "T3 bi stream task ended unexpectedly"); + } + tracing::info!(?remote, "connection closed"); +} + +/// T1 — read QUIC datagrams, decode each as a fixed-size `QuicMessage`, push +/// into the lossy T1 channel. +async fn read_datagrams(conn: Connection, t1: T1Sender) { + let remote = conn.remote_address(); + let mut received: u64 = 0; + let mut dropped: u64 = 0; + let mut decode_errors: u64 = 0; + + loop { + match conn.read_datagram().await { + Ok(bytes) => match QuicMessage::decode(&bytes[..]) { + Ok(msg) => { + received += 1; + counter!("substrate_received_total", "tier" => "t1").increment(1); + if !t1.send_lossy(msg) { + dropped += 1; + counter!("substrate_dropped_total", "tier" => "t1").increment(1); + tracing::trace!(?remote, "T1 channel full, datagram dropped"); + } + } + Err(e) => { + decode_errors += 1; + counter!("substrate_decode_errors_total", "tier" => "t1").increment(1); + tracing::warn!( + ?remote, + len = bytes.len(), + error = %e, + "T1 datagram decode failed" + ); + } + }, + Err(e) => { + tracing::debug!( + ?remote, + received, + dropped, + decode_errors, + error = %e, + "T1 datagram reader ended" + ); + return; + } + } + } +} + +/// T2 — accept unidirectional streams. Each accepted stream gets its own task +/// reading `QuicMessage::WIRE_SIZE`-byte (39-byte) chunks until EOF (one stream may carry one event or many). +/// Cross-stream interleaving is allowed; ordering is only guaranteed *within* +/// a stream, matching QUIC's stream semantics.
+async fn read_uni_streams(conn: Connection, t2: T2Sender) { + let remote = conn.remote_address(); + let mut streams_accepted: u64 = 0; + + loop { + let recv = match conn.accept_uni().await { + Ok(s) => s, + Err(e) => { + tracing::debug!( + ?remote, + streams_accepted, + error = %e, + "T2 uni accept loop ended" + ); + return; + } + }; + streams_accepted += 1; + let t2 = t2.clone(); + tokio::spawn(read_one_uni_stream(remote, recv, t2)); + } +} + +/// Per-stream worker for T2. Reads fixed-size `QuicMessage`s back-to-back, +/// awaits backpressure on the T2 channel, and resets the stream on a decode +/// failure (one corrupt stream shouldn't take down the whole connection). +async fn read_one_uni_stream(remote: SocketAddr, mut recv: RecvStream, t2: T2Sender) { + let stream_id: StreamId = recv.id(); + let mut buf = [0u8; QuicMessage::WIRE_SIZE]; + let mut count: u64 = 0; + + loop { + match recv.read_exact(&mut buf).await { + Ok(()) => match QuicMessage::decode(&buf) { + Ok(msg) => { + count += 1; + counter!("substrate_received_total", "tier" => "t2").increment(1); + if t2.send(msg).await.is_err() { + // T2 receiver dropped (substrate shutting down). + tracing::warn!( + ?remote, + ?stream_id, + count, + "T2 channel closed; abandoning stream" + ); + return; + } + } + Err(e) => { + counter!("substrate_decode_errors_total", "tier" => "t2").increment(1); + tracing::warn!( + ?remote, + ?stream_id, + count, + error = %e, + "T2 decode failed; resetting stream" + ); + let _ = recv.stop(0u32.into()); + return; + } + }, + Err(e) => { + tracing::trace!( + ?remote, + ?stream_id, + count, + error = %e, + "T2 uni stream ended" + ); + return; + } + } + } +} + +/// T3 — accept bidirectional streams. Each stream is one command/ack +/// exchange, modeled per the paper's "per-command oneshot channels": the +/// reader pushes a `T3Inbound { command, reply }` to the ECS, awaits the +/// response on `reply_rx`, and writes it back on the same stream. 
+async fn accept_bi_streams(conn: Connection, t3: T3Sender) { + let remote = conn.remote_address(); + let mut streams_accepted: u64 = 0; + + loop { + let (send, recv) = match conn.accept_bi().await { + Ok(s) => s, + Err(e) => { + tracing::debug!( + ?remote, + streams_accepted, + error = %e, + "T3 bi accept loop ended" + ); + return; + } + }; + streams_accepted += 1; + let t3 = t3.clone(); + tokio::spawn(read_one_bi_stream(remote, send, recv, t3)); + } +} + +/// Per-stream worker for T3. Reads exactly one command, ships it with a +/// `oneshot::Sender` to the ECS, awaits the reply, writes it back. If the +/// ECS drops the oneshot (no handler installed), the stream is reset so the +/// client sees an explicit reset instead of a half-open stream. +async fn read_one_bi_stream( + remote: SocketAddr, + mut send: SendStream, + mut recv: RecvStream, + t3: T3Sender, +) { + let stream_id: StreamId = recv.id(); + + let mut buf = [0u8; QuicMessage::WIRE_SIZE]; + if let Err(e) = recv.read_exact(&mut buf).await { + tracing::trace!( + ?remote, + ?stream_id, + error = %e, + "T3: incomplete command read; closing" + ); + return; + } + let command = match QuicMessage::decode(&buf) { + Ok(m) => m, + Err(e) => { + counter!("substrate_decode_errors_total", "tier" => "t3").increment(1); + tracing::warn!( + ?remote, + ?stream_id, + error = %e, + "T3 command decode failed; resetting stream" + ); + let _ = recv.stop(0u32.into()); + let _ = send.reset(0u32.into()); + return; + } + }; + counter!("substrate_received_total", "tier" => "t3").increment(1); + + let (reply_tx, reply_rx) = oneshot::channel::(); + let inbound = T3Inbound { + command, + reply: reply_tx, + }; + if t3.send(inbound).await.is_err() { + tracing::warn!(?remote, ?stream_id, "T3 channel closed; abandoning command"); + let _ = send.reset(0u32.into()); + return; + } + + let response = match reply_rx.await { + Ok(msg) => msg, + Err(_) => { + // ECS dropped the oneshot. 
With M4's handler installed this + // shouldn't happen normally; if it does, the stream is reset so + // the client sees a clean signal. + counter!("substrate_t3_no_handler_total").increment(1); + tracing::debug!( + ?remote, + ?stream_id, + "T3: no handler for command, resetting stream" + ); + let _ = send.reset(0u32.into()); + return; + } + }; + + if let Err(e) = send.write_all(&response.to_bytes()).await { + tracing::warn!( + ?remote, + ?stream_id, + error = %e, + "T3 ack write failed" + ); + return; + } + if let Err(e) = send.finish() { + tracing::warn!( + ?remote, + ?stream_id, + error = %e, + "T3 ack finish failed" + ); + } +} diff --git a/substrate/src/transport/state.rs b/substrate/src/transport/state.rs new file mode 100644 index 0000000..8325090 --- /dev/null +++ b/substrate/src/transport/state.rs @@ -0,0 +1,13 @@ +use bevy::prelude::States; + +/// Lifecycle of the QUIC listener inside the ECS schedule. +/// +/// `Starting` is the default; `OnEnter(Starting)` performs the bind and, on +/// success, transitions to `Started`. A `Failed` variant will join when we +/// add proper error surfacing — for now a bind failure panics the app. +#[derive(States, Debug, Clone, Copy, Default, Eq, PartialEq, Hash)] +pub enum ServerState { + #[default] + Starting, + Started, +} diff --git a/substrate/src/world/components.rs b/substrate/src/world/components.rs new file mode 100644 index 0000000..8bd7416 --- /dev/null +++ b/substrate/src/world/components.rs @@ -0,0 +1,97 @@ +//! Components attached to per-sensor entities, plus the per-type threshold +//! table used by `simulation_system`'s crossing detection. +//! +//! Each (device, sensor) pair becomes one entity tagged with `Asset` and +//! carrying `DeviceId` + `SensorId` + `SensorTypeTag` + `RawSensorData` + +//! `SmoothedValue`. + +use bevy::prelude::*; + +use crate::transport::SensorType; + +/// Marker — every (device, sensor) pair becomes one entity tagged `Asset`. 
+#[derive(Component, Debug, Default, Clone, Copy)] +pub struct Asset; + +#[derive(Component, Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct DeviceId(pub uuid::Uuid); + +#[derive(Component, Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SensorId(pub u16); + +/// Sensor type — set on entity creation from the first message that names +/// the (device, sensor) pair, then immutable. We don't track type changes: +/// a given (device_id, sensor_id) is one logical sensor with one type for +/// the lifetime of the run. +#[derive(Component, Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SensorTypeTag(pub SensorType); + +/// Latest reading from this (device, sensor). Updated in place by +/// `ingest_system`; read by simulation/export/diagnostics. +#[derive(Component, Debug, Default, Clone, Copy, PartialEq)] +pub struct RawSensorData { + pub raw_value: f64, + pub timestamp_us: u64, + pub sequence_number: u32, +} + +pub const SMOOTHED_WINDOW: usize = 16; + +/// Rolling-window mean of the last `SMOOTHED_WINDOW` raw readings, plus a +/// hysteresis flag for threshold-crossing detection. Maintained by +/// `simulation_system` — this is the bit of the ECS that does honest +/// digital-twin transform work, not just write-through of incoming samples. +#[derive(Component, Debug, Clone, Copy)] +pub struct SmoothedValue { + ring: [f64; SMOOTHED_WINDOW], + head: usize, + filled: u16, + pub mean: f64, + pub above_threshold: bool, +} + +impl Default for SmoothedValue { + fn default() -> Self { + Self { + ring: [0.0; SMOOTHED_WINDOW], + head: 0, + filled: 0, + mean: 0.0, + above_threshold: false, + } + } +} + +impl SmoothedValue { + /// Push a new sample. Non-finite values (NaN / ±∞) are ignored — the + /// smoothed state stays whatever it was. This matters because T3 acks + /// can carry NaN when the substrate has never seen the target sensor. 
+ pub fn push(&mut self, v: f64) { + if !v.is_finite() { + return; + } + self.ring[self.head] = v; + self.head = (self.head + 1) % SMOOTHED_WINDOW; + if (self.filled as usize) < SMOOTHED_WINDOW { + self.filled += 1; + } + let n = self.filled as usize; + let sum: f64 = self.ring.iter().take(n).sum(); + self.mean = sum / n as f64; + } +} + +/// Per-type threshold for `simulation_system`'s crossing detection. Chosen +/// mid-band against the simulator's waveforms so crossings actually fire +/// during a demo; in a real deployment these would be alarm thresholds +/// supplied by config. +pub(super) fn threshold_for(t: SensorType) -> f64 { + match t { + SensorType::Generic => 0.0, + SensorType::Temperature => 22.0, // °C — simulator oscillates 15..25 + SensorType::Humidity => 55.0, // % — 30..70 + SensorType::Pressure => 1014.0, // hPa — 1008..1018 + SensorType::Voltage => 230.2, // V — 229.5..230.5 + SensorType::Current => 10.5, // A — 8..12 + } +} diff --git a/substrate/src/world/mod.rs b/substrate/src/world/mod.rs new file mode 100644 index 0000000..db6c59f --- /dev/null +++ b/substrate/src/world/mod.rs @@ -0,0 +1,52 @@ +//! ECS world: the five paper-named systems plus the components and resources +//! they operate on. +//! +//! ```text +//! components.rs ── per-sensor components + per-type threshold table +//! resources.rs ── SensorRegistry, DiagnosticsState, ExportSampleState +//! systems.rs ── ingest / fault_injection / simulation / export / diagnostics +//! tests.rs ── unit tests (#[cfg(test)] only) +//! ``` +//! +//! Each (device, sensor) pair becomes one entity with `Asset` + `DeviceId` + +//! `SensorId` + `SensorTypeTag` + `RawSensorData` + `SmoothedValue`. +//! `ingest_system` upserts on every incoming `QuicMessage`; the registry maps +//! `(Uuid, u16) → Entity` for O(1) lookup. 
+ +mod components; +mod resources; +mod systems; + +#[cfg(test)] +mod tests; + +use bevy::prelude::*; +use bevy::state::condition::in_state; + +use crate::transport::state::ServerState; + +pub use components::{ + Asset, DeviceId, RawSensorData, SMOOTHED_WINDOW, SensorId, SensorTypeTag, SmoothedValue, +}; +pub use resources::SensorRegistry; + +pub struct WorldPlugin; + +impl Plugin for WorldPlugin { + fn build(&self, app: &mut App) { + app.init_resource::() + .init_resource::() + .init_resource::() + .add_systems( + PreUpdate, + (systems::fault_injection_system, systems::ingest_system) + .chain() + .run_if(in_state(ServerState::Started)), + ) + .add_systems(Update, systems::simulation_system) + .add_systems( + PostUpdate, + (systems::export_system, systems::diagnostics_system).chain(), + ); + } +} diff --git a/substrate/src/world/resources.rs b/substrate/src/world/resources.rs new file mode 100644 index 0000000..f79d6d4 --- /dev/null +++ b/substrate/src/world/resources.rs @@ -0,0 +1,48 @@ +//! Bevy `Resource`s consumed by the world's systems. + +use std::collections::HashMap; +use std::time::Instant; + +use bevy::prelude::{Entity, Resource}; + +/// O(1) lookup `(device_id, sensor_id) → Entity`. Populated lazily by the +/// ingest system; queried by export/diagnostics. +#[derive(Resource, Default)] +pub struct SensorRegistry { + pub(crate) map: HashMap<(uuid::Uuid, u16), Entity>, +} + +impl SensorRegistry { + pub fn entity_count(&self) -> usize { + self.map.len() + } +} + +/// Rolling counter of ticks since the last `diagnostics` log line was emitted. +#[derive(Resource)] +pub(super) struct DiagnosticsState { + pub(super) last_log: Instant, + pub(super) ticks_since_log: u64, +} + +impl Default for DiagnosticsState { + fn default() -> Self { + Self { + last_log: Instant::now(), + ticks_since_log: 0, + } + } +} + +/// Rate-limiter for `export_system` — runs at the ECS tick rate but only +/// emits gauges once per second. 
+#[derive(Resource)] +pub(super) struct ExportSampleState { + pub(super) last_sample: Instant, +} + +impl Default for ExportSampleState { + fn default() -> Self { + Self { last_sample: Instant::now() } + } +} diff --git a/substrate/src/world/systems.rs b/substrate/src/world/systems.rs new file mode 100644 index 0000000..14b800a --- /dev/null +++ b/substrate/src/world/systems.rs @@ -0,0 +1,278 @@ +//! The five paper-named ECS systems and their private helpers. +//! +//! Scheduler placement (configured in [`super::WorldPlugin`]): +//! +//! | Schedule | Systems | +//! |-----------|--------------------------------------| +//! | PreUpdate | fault_injection → ingest | +//! | Update | simulation | +//! | PostUpdate| export → diagnostics | + +use std::collections::HashMap; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use bevy::prelude::*; +use metrics::{counter, gauge, histogram}; + +use crate::transport::ecs::{BridgeReceivers, BridgeSenders}; +use crate::transport::{QuicMessage, SensorType}; + +use super::components::{ + Asset, DeviceId, RawSensorData, SensorId, SensorTypeTag, SmoothedValue, threshold_for, +}; +use super::resources::{DiagnosticsState, ExportSampleState, SensorRegistry}; + +/// T1 batch limit per tick. Anything beyond this stays in the channel and +/// either drains next tick or gets dropped on full (T1's contract is lossy). +const T1_INGEST_BATCH: usize = 1024; + +/// Drain the three tier channels into ECS state. +/// +/// T1: bounded batch (lossy); T2: full drain (reliable); T3: full drain, with +/// each command answered by an ack carrying the device's current sensor value. +pub(super) fn ingest_system( + bridge: Res, + mut registry: ResMut, + mut commands: Commands, + mut q: Query<&mut RawSensorData>, +) { + let now = now_us(); + + // T1 — datagrams. 
+ { + let mut t1 = bridge.t1.lock().unwrap(); + for _ in 0..T1_INGEST_BATCH { + match t1.try_recv() { + Ok(msg) => { + histogram!("substrate_latency_us", "tier" => "t1") + .record(now.saturating_sub(msg.timestamp_us) as f64); + upsert_reading(&mut registry, &mut commands, &mut q, msg); + } + Err(_) => break, + } + } + } + + // T2 — uni streams. + { + let mut t2 = bridge.t2.lock().unwrap(); + while let Ok(msg) = t2.try_recv() { + histogram!("substrate_latency_us", "tier" => "t2") + .record(now.saturating_sub(msg.timestamp_us) as f64); + upsert_reading(&mut registry, &mut commands, &mut q, msg); + } + } + + // T3 — bidirectional commands. Reply with the device's most recent + // sensor value (NaN if we've never seen this (device, sensor) before). + { + let mut t3 = bridge.t3.lock().unwrap(); + while let Ok(inbound) = t3.try_recv() { + histogram!("substrate_latency_us", "tier" => "t3") + .record(now.saturating_sub(inbound.command.timestamp_us) as f64); + let key = (inbound.command.device_id, inbound.command.sensor_id); + let current_value = registry + .map + .get(&key) + .and_then(|&e| q.get(e).ok()) + .map(|d| d.raw_value) + .unwrap_or(f64::NAN); + let ack = QuicMessage { + device_id: inbound.command.device_id, + sensor_id: inbound.command.sensor_id, + raw_value: current_value, + timestamp_us: now_us(), + sequence_number: inbound.command.sequence_number, + sensor_type: inbound.command.sensor_type, + }; + // Ignore send errors: the demux task may have given up if the + // connection died while we were processing. 
+ let _ = inbound.reply.send(ack); + } + } +} + +fn upsert_reading( + registry: &mut SensorRegistry, + commands: &mut Commands, + q: &mut Query<&mut RawSensorData>, + msg: QuicMessage, +) { + let key = (msg.device_id, msg.sensor_id); + let data = RawSensorData { + raw_value: msg.raw_value, + timestamp_us: msg.timestamp_us, + sequence_number: msg.sequence_number, + }; + + if let Some(&entity) = registry.map.get(&key) { + // Common case: existing entity, mutate in place. + if let Ok(mut existing) = q.get_mut(entity) { + *existing = data; + } else { + // Edge case: entity was registered earlier in *this* tick via + // `commands.spawn`, so the components aren't in the archetype + // yet (`Commands` is deferred). Queue another insert; last write + // wins when Commands flushes. + commands.entity(entity).insert(data); + } + return; + } + + let entity = commands + .spawn(( + Asset, + DeviceId(msg.device_id), + SensorId(msg.sensor_id), + SensorTypeTag(SensorType::from_u8(msg.sensor_type)), + SmoothedValue::default(), + data, + )) + .id(); + registry.map.insert(key, entity); +} + +/// Stub — M6 inserts loss/delay here for benchmark scenarios. +pub(super) fn fault_injection_system() {} + +/// Per-sensor digital-twin transform. Pulls each entity's latest +/// `RawSensorData` into a sliding-window mean (`SmoothedValue`), and emits +/// `substrate_threshold_crossings_total{type, direction}` when that mean +/// transitions across the per-type threshold. The `Changed` +/// filter restricts the scan to entities updated *this tick*, so the cost +/// scales with ingress rate, not fleet size. 
+pub(super) fn simulation_system( + mut q: Query<(&SensorTypeTag, &RawSensorData, &mut SmoothedValue), Changed>, +) { + for (st, raw, mut smoothed) in q.iter_mut() { + smoothed.push(raw.raw_value); + let now_above = smoothed.mean > threshold_for(st.0); + if now_above != smoothed.above_threshold { + smoothed.above_threshold = now_above; + let dir = if now_above { "up" } else { "down" }; + counter!( + "substrate_threshold_crossings_total", + "type" => st.0.label_str(), + "direction" => dir + ) + .increment(1); + } + } +} + +/// Sample ECS-side gauges into the Prometheus exporter. Runs every tick but +/// only emits once per second to keep cost negligible. This is the system +/// the paper's §Architecture diagram calls `ExportSystem`. +pub(super) fn export_system( + senders: Res, + registry: Res, + sensors_q: Query<(&SensorTypeTag, &RawSensorData)>, + mut state: ResMut, +) { + let now = Instant::now(); + if now.duration_since(state.last_sample) < Duration::from_secs(1) { + return; + } + state.last_sample = now; + + // ---- runtime telemetry ---- + gauge!("substrate_entities").set(registry.entity_count() as f64); + + gauge!("substrate_channel_depth", "tier" => "t1").set(senders.t1.depth() as f64); + gauge!("substrate_channel_depth", "tier" => "t2").set(senders.t2.depth() as f64); + gauge!("substrate_channel_depth", "tier" => "t3").set(senders.t3.depth() as f64); + + gauge!("substrate_channel_capacity", "tier" => "t1").set(senders.t1.capacity() as f64); + gauge!("substrate_channel_capacity", "tier" => "t2").set(senders.t2.capacity() as f64); + gauge!("substrate_channel_capacity", "tier" => "t3").set(senders.t3.capacity() as f64); + + if let Some(stats) = memory_stats::memory_stats() { + gauge!("substrate_rss_bytes").set(stats.physical_mem as f64); + } + + // ---- sensor data aggregates (per type) ---- + let mut by_type: HashMap<&'static str, Aggregate> = HashMap::new(); + for (st, data) in &sensors_q { + by_type + .entry(st.0.label_str()) + 
/// Running count / sum / min / max over the finite values pushed so far.
/// Used by `export_system` to build per-type sensor aggregates; non-finite
/// inputs are skipped so they cannot contaminate the statistics.
#[derive(Debug, Clone, Copy)]
struct Aggregate {
    count: u64,
    sum: f64,
    min: f64,
    max: f64,
}

impl Aggregate {
    /// An empty accumulator. `min`/`max` start at the opposite infinities
    /// so the first accepted value replaces both.
    fn new() -> Self {
        Aggregate {
            max: f64::NEG_INFINITY,
            min: f64::INFINITY,
            sum: 0.0,
            count: 0,
        }
    }

    /// Fold one value into the accumulator; NaN and ±∞ are ignored.
    fn push(&mut self, v: f64) {
        if v.is_finite() {
            self.count += 1;
            self.sum += v;
            self.min = if v < self.min { v } else { self.min };
            self.max = if v > self.max { v } else { self.max };
        }
    }

    /// Arithmetic mean of the accepted values, or NaN when none were
    /// accepted.
    fn mean(&self) -> f64 {
        match self.count {
            0 => f64::NAN,
            n => self.sum / n as f64,
        }
    }
}
Lives as a child module so it can poke at `pub(super)` items (the +//! internal resources, `threshold_for`, etc.) without enlarging the +//! public API. + +use std::sync::Mutex; + +use bevy::prelude::*; +use bevy::state::app::StatesPlugin; +use tokio::sync::{mpsc, oneshot}; +use uuid::Uuid; + +use crate::transport::ecs::{BridgeReceivers, BridgeSenders}; +use crate::transport::state::ServerState; +use crate::transport::{QuicMessage, SensorType, T1Sender, T2Sender, T3Inbound, T3Sender}; + +use super::WorldPlugin; +use super::components::{RawSensorData, SMOOTHED_WINDOW, SmoothedValue, threshold_for}; +use super::resources::SensorRegistry; + +/// Build a Bevy app with just enough plugins/resources to run the world +/// systems against test-owned channels. No QUIC, no tokio runtime. +fn make_test_app() -> ( + App, + mpsc::Sender, + mpsc::Sender, + mpsc::Sender, +) { + let (t1_tx, t1_rx) = mpsc::channel::(64); + let (t2_tx, t2_rx) = mpsc::channel::(64); + let (t3_tx, t3_rx) = mpsc::channel::(64); + + let bridge = BridgeReceivers { + t1: Mutex::new(t1_rx), + t2: Mutex::new(t2_rx), + t3: Mutex::new(t3_rx), + }; + // export_system samples channel depth/capacity from the senders; it + // requires the resource even when the test pushes via the raw senders + // directly (which is what the rest of the test does). + let senders = BridgeSenders { + t1: T1Sender::new(t1_tx.clone()), + t2: T2Sender::new(t2_tx.clone()), + t3: T3Sender::new(t3_tx.clone()), + }; + + let mut app = App::new(); + app.add_plugins(MinimalPlugins) + .add_plugins(StatesPlugin) + .init_state::() + .insert_resource(bridge) + .insert_resource(senders) + .add_plugins(WorldPlugin); + + // Force the state machine into Started so the run_if guard passes. + app.world_mut() + .resource_mut::>() + .set(ServerState::Started); + // Process the state transition before tests push messages. 
+ app.update(); + + (app, t1_tx, t2_tx, t3_tx) +} + +// ---- ingest_system: entity lifecycle and T3 ack semantics ---- + +#[test] +fn ingest_t1_creates_entity_and_writes_raw_data() { + let (mut app, t1_tx, _t2_tx, _t3_tx) = make_test_app(); + + let device = Uuid::from_u128(0xa1a2_a3a4_a5a6_a7a8_a9aa_abac_adae_afb0); + let msg = QuicMessage { + device_id: device, + sensor_id: 5, + raw_value: 3.14, + timestamp_us: 1_700_000_000_000_001, + sequence_number: 1, + sensor_type: SensorType::Temperature.as_u8(), + }; + t1_tx.try_send(msg).expect("channel cap"); + + // Tick 1: ingest drains the channel and spawns via Commands. + app.update(); + // Tick 2: Commands have flushed into the archetype. + app.update(); + + let registry = app.world().resource::(); + assert_eq!(registry.map.len(), 1); + + let entity = *registry + .map + .get(&(device, 5)) + .expect("entity not registered"); + let data = app + .world() + .get::(entity) + .expect("RawSensorData missing"); + assert_eq!(data.raw_value, 3.14); + assert_eq!(data.sequence_number, 1); + assert_eq!(data.timestamp_us, 1_700_000_000_000_001); +} + +#[test] +fn ingest_t1_repeated_messages_update_in_place() { + let (mut app, t1_tx, _t2_tx, _t3_tx) = make_test_app(); + let device = Uuid::new_v4(); + + // First reading. + t1_tx + .try_send(QuicMessage { + device_id: device, + sensor_id: 0, + raw_value: 1.0, + timestamp_us: 1, + sequence_number: 1, + sensor_type: SensorType::Generic.as_u8(), + }) + .unwrap(); + app.update(); + app.update(); + + // Second reading on the same (device, sensor). 
+ t1_tx + .try_send(QuicMessage { + device_id: device, + sensor_id: 0, + raw_value: 2.0, + timestamp_us: 2, + sequence_number: 2, + sensor_type: SensorType::Generic.as_u8(), + }) + .unwrap(); + app.update(); + + let registry = app.world().resource::(); + assert_eq!(registry.map.len(), 1, "should reuse the same entity"); + + let entity = *registry.map.get(&(device, 0)).unwrap(); + let data = app.world().get::(entity).unwrap(); + assert_eq!(data.raw_value, 2.0); + assert_eq!(data.sequence_number, 2); +} + +#[test] +fn ingest_t3_replies_with_current_sensor_value() { + let (mut app, t1_tx, _t2_tx, t3_tx) = make_test_app(); + let device = Uuid::new_v4(); + + // Seed a T1 reading so the (device, sensor) entity exists. + t1_tx + .try_send(QuicMessage { + device_id: device, + sensor_id: 9, + raw_value: 42.0, + timestamp_us: 1, + sequence_number: 1, + sensor_type: SensorType::Temperature.as_u8(), + }) + .unwrap(); + app.update(); + app.update(); + + // Send a T3 command and capture the ack via the oneshot. 
/// End-to-end check that `simulation_system` smooths T1 readings and flips
/// `above_threshold` once the windowed mean rises past the Temperature
/// threshold.
// NOTE(review): the `resource::<…>` / `get::<…>` type arguments were
// reconstructed from the values being read (`registry.map`, `.mean`,
// `.above_threshold`) — confirm.
#[test]
fn simulation_smoothes_and_detects_threshold_crossing() {
    let (mut app, t1_tx, _t2_tx, _t3_tx) = make_test_app();
    let device = Uuid::new_v4();
    let threshold = threshold_for(SensorType::Temperature); // 22.0 °C
    let window = SMOOTHED_WINDOW as u32;

    // One Temperature reading for sensor 0 of `device`.
    let reading = |raw_value: f64, seq: u32| QuicMessage {
        device_id: device,
        sensor_id: 0,
        raw_value,
        timestamp_us: u64::from(seq),
        sequence_number: seq,
        sensor_type: SensorType::Temperature.as_u8(),
    };

    // Phase 1: fill the window with below-threshold readings. Two ticks per
    // message, matching the other tests that wait for a freshly spawned
    // entity to become queryable.
    for seq in 0..window {
        t1_tx.try_send(reading(18.0, seq)).unwrap();
        app.update();
        app.update();
    }

    let registry = app.world().resource::<SensorRegistry>();
    let entity = *registry.map.get(&(device, 0)).unwrap();
    let smoothed = app
        .world()
        .get::<SmoothedValue>(entity)
        .expect("SmoothedValue should be on every sensor entity");
    assert!(smoothed.mean < threshold);
    assert!(!smoothed.above_threshold, "should not have crossed up yet");

    // Phase 2: a full window of above-threshold readings replaces every
    // earlier sample, so the mean ends at 30 and the flag must flip.
    for seq in window..window * 2 {
        t1_tx.try_send(reading(30.0, seq)).unwrap();
        app.update();
    }

    let smoothed = app.world().get::<SmoothedValue>(entity).unwrap();
    assert!(smoothed.mean > threshold);
    assert!(
        smoothed.above_threshold,
        "smoothed mean should have crossed up through {threshold}"
    );
}