Compare commits

..

19 Commits

Author SHA1 Message Date
Valère Plantevin
a7b8065739 Update to the text and small demo 2026-05-13 17:22:10 -04:00
Valère Plantevin
872bbb8c2c Update to data 2026-05-13 16:39:27 -04:00
09c51f95b4 Updated benchmark 2026-05-13 16:25:12 -04:00
8a699719a2 Benchmark with bidi netem 2026-05-13 16:06:11 -04:00
Valère Plantevin
ac8a319b40 Update to scripts to better handle netem 2026-05-13 15:40:09 -04:00
Valère Plantevin
f226e53118 Results and script to verify netem 2026-05-13 15:32:23 -04:00
89630238a9 End of benchmark 2026-05-13 15:22:15 -04:00
Valère Plantevin
baa075fe0f Flip T3 to substrate-initiated actuator commands 2026-05-13 15:03:23 -04:00
272d3b3c59 First trial run on CM5 2026-05-13 10:56:35 -04:00
Valère Plantevin
1722fea41f Update to script 2026-05-13 10:31:45 -04:00
Valère Plantevin
0174019b3f Update dependencies for Bevy 2026-05-13 10:19:15 -04:00
Valère Plantevin
8465a7c952 Remove unnecessary dependencies 2026-05-13 10:11:42 -04:00
Valère Plantevin
6e60c760b0 Update to scripts 2026-05-13 09:58:30 -04:00
Valère Plantevin
7f54aea439 Prototype of the first automation, still ugly 2026-05-12 14:00:12 -04:00
Valère Plantevin
20d59ed0ba Getting ready for the final test 2026-05-12 13:24:03 -04:00
Valère Plantevin
5d2552efb5 Enhance substrate ingest limits and optimize simulator stream reuse 2026-05-12 11:44:01 -04:00
Valère Plantevin
d3f09ee062 First test kinda working 2026-05-12 11:21:40 -04:00
Valère Plantevin
cac6c9ac02 Cleanup before network implementation 2026-05-04 16:53:14 -04:00
Valère Plantevin
4ec5b98df4 Add config and basic architecture for QUIC 2026-05-04 16:13:40 -04:00
53 changed files with 5458 additions and 175 deletions

3
.gitignore vendored
View File

@@ -21,6 +21,9 @@ analysis/.venv/
# Data — raw CSVs committed, processed outputs not # Data — raw CSVs committed, processed outputs not
data/**/*_processed.csv data/**/*_processed.csv
# Self-signed dev TLS material — regenerate with `make certs`
certs/
# OS # OS
.DS_Store .DS_Store
*.swp *.swp

10
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

4
.idea/encodings.xml generated Normal file
View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with BOM under Windows, with no BOM otherwise" />
</project>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
</profile>
</component>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/quic_ecs_dt.iml" filepath="$PROJECT_DIR$/.idea/quic_ecs_dt.iml" />
</modules>
</component>
</project>

6
.idea/prettier.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PrettierConfiguration">
<option name="myConfigurationMode" value="AUTOMATIC" />
</component>
</project>

12
.idea/quic_ecs_dt.iml generated Normal file
View File

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="EMPTY_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/simulator/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/substrate/src" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

216
CLAUDE.md Normal file
View File

@@ -0,0 +1,216 @@
# quic_ecs_dt — Project Guide for Claude
## What & why
Source repo for **"QUIC and ECS as Complementary Transport and Runtime Substrates for Industrial Digital Twins: An Integrated Empirical Study"** — submitted to **UCAmI 2026** (Track 2: *Internet of EveryThing (IoT, People & Processes) and Sensors*; primary topic *IoE interoperability, integration and performance*, secondary topic *IoE experimental results and deployment scenarios*). Single-author (Plantevin, UQAC). Third paper in a sequence; the first two are at IEEE SWC 2026:
- `plantevin2026ecs` — ECS as runtime substrate for industrial DT (200k assets @ 114 Hz on Pi 5).
- `plantevin2026quic` — QUIC partial reliability for DT sensor streams (94% P99 reduction vs TCP at 5% loss).
**UCAmI hypothesis (the composition question):** prior work shows ECS and QUIC each work as substrates *independently*. Does integrating real QUIC traffic into a Bevy ECS ingest path introduce coupling that degrades either one's claimed properties? The paper argues no, and measures it on a real CM5 ↔ M4 Max two-machine deployment.
## Architecture
Three-tier QUIC ↔ ECS bridge, headless Bevy runtime. **T1/T2 are inbound (device → substrate); T3 is outbound (substrate → device, actuator commands):**
| Tier | QUIC primitive | Direction | Use case | Channel cap | Sender |
|------|----------------|-----------|----------|-------------|--------|
| T1 | Unreliable datagrams (RFC 9221) | device → substrate | High-freq ephemeral telemetry; drops OK | 1024 | `T1Sender::send_lossy` (try_send, drop on full) |
| T2 | Unidirectional streams | device → substrate | Ordered threshold events; reliable | 512 | `T2Sender::send` (await, backpressure) |
| T3 | Bidirectional streams | **substrate → device** | Actuator commands w/ ACK | 256 | `T3OutboundSender::try_send` of `OutboundT3 { target_device, sensor_id, raw_value, sensor_type }` |
QUIC server runs on a dedicated OS thread with a Tokio multi-thread runtime. T1/T2 decoded `QuicMessage`s (39 B fixed LE: 16 UUID + 2 sensor_id + 8 f64 + 8 ts + 4 seq + 1 sensor_type) flow into per-tier `tokio::sync::mpsc` channels and are drained by Bevy's `ingest_system` in `PreUpdate`, gated by `run_if(in_state(ServerState::Started))`. T3 flows the other way: `automation_system` constructs `OutboundT3` items and the tokio-side `drain_outbound_t3` task opens bi-streams to the target device. The per-tier sender newtypes (in [substrate/src/transport/mod.rs](substrate/src/transport/mod.rs)) make tier mixups a type error. Pattern in [substrate/src/transport/ecs.rs](substrate/src/transport/ecs.rs).
**T3 actuator-command protocol.** The substrate's `automation_system` decides to actuate (e.g. Presence < 1.0 ⇒ Relay = stop) and pushes an `OutboundT3` onto the outbound channel. The tokio `drain_outbound_t3` pops it, looks up the target device's `quinn::Connection` in a `ConnectionRegistry` (populated by `read_datagrams` / `read_one_uni_stream` on first sight of each device UUID), then **spawns one task per command** to do `conn.open_bi() → write 39 B → finish → read 39 B ack`. Per-task spawning means a single stuck `read_exact` can't stall the pipeline. Latency from `open_bi()` to ack-receipt is recorded as `substrate_latency_us{tier="t3"}` and a successful ack increments `substrate_received_total{tier="t3"}`. Misses (`substrate_t3_outbound_no_route_total`), drops (`substrate_t3_outbound_dropped_total`), and bi-stream errors (`substrate_t3_outbound_errors_total`) each have their own counter.
**Connection registry.** `Arc<std::sync::RwLock<HashMap<Uuid, quinn::Connection>>>`. `quinn::Connection` is internally `Arc`; one simulator process commonly hosts 7 device UUIDs sharing one connection. Registry insert is idempotent (`ensure_registered`). On `conn.closed().await` returning, `handle_incoming` purges every key whose `Connection::stable_id()` matches the closed connection.
**Target hardware:** CM5 (BCM2712, Cortex-A76, 4 GB) as DT runtime; M4 Max as traffic generator; 1 Gbps direct Ethernet. Both rigs are in hand; benchmark sweeps live on the CM5.
## Repo map
```
quic_ecs_dt/
├── paper/ Quarto + LNCS source — single index.qmd, refs in references.bib
├── substrate/ Rust crate: Bevy 0.18 + Quinn 0.11 + rustls 0.23 + Tokio
│ └── src/
│ ├── main.rs App::new, MinimalPlugins, EcsQuicTransportPlugin, ObservabilityPlugin
│ ├── lib.rs re-exports
│ ├── config.rs figment chain: defaults → config.toml → APP_* env (split on "__")
│ ├── observability.rs metrics-exporter-prometheus on :9100
│ ├── transport/
│ │ ├── mod.rs QuicMessage codec + tier sender newtypes + OutboundT3
│ │ ├── ecs.rs EcsQuicTransportPlugin: tokio thread + bridge + registry + drain spawn
│ │ ├── server.rs bind_endpoint + accept_loop + read_datagrams + read_uni_streams
│ │ │ + drain_outbound_t3 + synthetic_t3_driver + ConnectionRegistry
│ │ └── state.rs ServerState{Starting, Started}
│ └── world/
│ ├── mod.rs WorldPlugin (5 systems wired into Pre/Update/Post)
│ ├── components.rs Asset, DeviceId, SensorId, SensorTypeTag, RawSensorData, SmoothedValue, threshold_for
│ ├── resources.rs SensorRegistry, DiagnosticsState, ExportSampleState
│ ├── systems.rs ingest, simulation, automation, export, diagnostics
│ └── tests.rs 8 unit tests incl. automation_dispatches_relay_stop_when_presence_drops
├── simulator/ Rust crate: Quinn client + sensor generators + T3 receiver
│ ├── src/
│ │ ├── main.rs CLI driver + HTTP-trigger task + T1 inline loop
│ │ ├── lib.rs module exports
│ │ ├── client.rs SimulatorClient (connect, send_datagram, send_uni_stream, request, close)
│ │ ├── commands.rs run_command_receiver (substrate → device T3 accept-bi loop)
│ │ ├── emitters.rs run_t2_emitter (T1 lives inline in main.rs)
│ │ └── profile.rs SensorProfile (single | industrial), generate_value
│ └── tests/ T1, T2, end-to-end full-loop integration tests
├── data/
│ ├── two_machine/ CM5 ↔ M4 Max sweep — final_table.csv (load-bearing for the paper)
│ └── local/ loopback sweeps (scaling.csv, cross_tier.csv)
├── scripts/
│ ├── bench-loss.sh M6 sweep entities×loss → data/two_machine/final_table.csv
│ ├── bench-scaling.sh T1 rate sweep + optional synthetic-T3 cross-tier mode
│ ├── bench-client.sh M8 client driver (run from Mac when substrate is on CM5)
│ ├── demo.sh full-stack demo: certs + build + VM/Grafana + sub + sim
│ ├── setup-cm5.sh CM5 provisioning (apt + cargo install)
│ └── verify-netem.sh confirm tc-netem is shaping in the right direction (BIDI=1 for ifb mode)
├── monitoring/ docker-compose: VictoriaMetrics + Grafana auto-provisioned
├── dashboards/ runtime.json + sensors.json
├── certs/ gitignored, regenerated by `make certs`
├── Cargo.toml workspace
└── Makefile render, preview, build, build-cm5, deploy-cm5, monitoring-up
```
## Status
**Code (substrate + simulator):**
| Area | State |
|------|-------|
| `AppConfig` figment loader (defaults → TOML → env with `__` split) | Done — [substrate/src/config.rs](substrate/src/config.rs). Env override actually works (`Env::prefixed("APP_").split("__")`); discovered late that the previous chain silently ignored env vars |
| 39 B wire codec | Done — [substrate/src/transport/mod.rs](substrate/src/transport/mod.rs), 5 unit tests |
| Quinn server lifecycle + TLS | Done — `bind_endpoint` + `accept_loop` in [substrate/src/transport/server.rs](substrate/src/transport/server.rs); `ServerState{Starting, Started}` in [state.rs](substrate/src/transport/state.rs); explicit `TransportConfig` w/ 256 KiB datagram recv buffer; dev cert via `make certs`, rustls `aws-lc-rs` provider installed in [main.rs](substrate/src/main.rs) |
| T1 demux (datagrams → ECS) | Done. `read_datagrams` reader; decode errors non-fatal; channel-full drops silent; per-stream counters in debug summary. Calls `ensure_registered` on first decode so outbound T3 can route to this device |
| T2 demux (uni streams → ECS) | Done. `read_uni_streams` accepts streams, spawns one task per stream that reads 39 B chunks until EOF; decode failure resets the stream via `recv.stop(0)`; `t2.send().await` honours backpressure; first decode also calls `ensure_registered` |
| T3 outbound (ECS → device) | Done. `drain_outbound_t3` task pops `OutboundT3` items, looks up the target device's `Connection` in `ConnectionRegistry`, **spawns one task per command** to do `open_bi → write 39 B → finish → read ack`. Per-task spawning prevents a single stuck `read_exact` from stalling the pipeline. Records `substrate_latency_us{tier="t3"}` on success; counts no-route / dropped / errors separately. The old simulator-initiated T3 inbound path (`T3Sender` / `T3Inbound` / `accept_bi_streams`) is **gone** |
| Connection registry (Uuid → Connection) | Done — `Arc<RwLock<HashMap<Uuid, quinn::Connection>>>`; idempotent insert via `ensure_registered`; purged in `handle_incoming` after `conn.closed().await` using `Connection::stable_id()` |
| Synthetic T3 driver (bench only) | Done. `synthetic_t3_driver` task in [server.rs](substrate/src/transport/server.rs) spawned by `accept_loop` when `APP_NETWORK__SYNTHETIC_T3_RATE_HZ > 0`. Round-robins over registered devices, toggles `raw_value` between 0/1, pushes through the same outbound channel `automation_system` uses |
| ECS components + 5 systems | Done — [world/](substrate/src/world/). Entities = `(Asset, DeviceId, SensorId, SensorTypeTag, RawSensorData, SmoothedValue)` per (device, sensor). 5 systems: `ingest` (PreUpdate, drains T1+T2), `simulation` (Update, rolling mean + threshold-crossings counter), `automation` (Update, Presence-cross → `t3_out.try_send(OutboundT3{Relay setpoint})` + local mirror), `export` (PostUpdate, per-second metric sample), `diagnostics` (PostUpdate, per-second `tick_hz` log) |
| Schedule rate-gating | Done — `MinimalPlugins.set(ScheduleRunnerPlugin::run_loop(1/tick_rate_hz))` in [main.rs](substrate/src/main.rs) |
| Prometheus exporter + Grafana | Done. `metrics-exporter-prometheus` on :9100 via `ObservabilityPlugin`. Runtime metrics: `substrate_received_total{tier}`, `substrate_dropped_total{tier=t1}`, `substrate_decode_errors_total{tier}`, `substrate_t3_outbound_*_total`, `substrate_latency_us{tier}` histograms, `substrate_tick_hz`, `substrate_entities`, `substrate_channel_depth{tier}`, `substrate_rss_bytes`. Sensor data: `sensor_aggregate{type, stat=count\|mean\|min\|max}`. Dashboards: [dashboards/runtime.json](dashboards/runtime.json) + [dashboards/sensors.json](dashboards/sensors.json) |
| Simulator binary | Done — [simulator/src/main.rs](simulator/src/main.rs). Clap flags: `--addr`, `--server-name`, `--cert`, `--profile {single, industrial}`, `--sensor-type`, `--sensor-id`, `--rate-hz`, `--t2-rate-hz`, `--count`, `--devices`. `industrial` profile fans out to **7 sensors per device** on ids 0..6 (Temperature/Humidity/Pressure/Voltage/Current/Presence/Relay). HTTP trigger on `:9002` (`POST /trigger`) pushes Presence=0 over T2 — operator-facing demo entry point. T1/T2 emitters check `engine_running` per tick; when `false`, Current waveform drops to ~0 while Voltage stays at ~230 V |
| Simulator command receiver | Done — [simulator/src/commands.rs](simulator/src/commands.rs). `run_command_receiver` loops on `conn.accept_bi()`, decodes 39 B, flips `engine_running` on `sensor_type == Relay` setpoints, writes 39 B ack. Spawned by `main.rs` post-connect. `new_engine_state()` constructor exported for integration tests |
| End-to-end test harness | **18 tests, all green.** 5 codec unit tests; 8 world unit tests (incl. `automation_dispatches_relay_stop_when_presence_drops`); 2 T1 + 2 T2 integration tests; 1 **full closed-loop** test (`simulator/tests/end_to_end_full_loop.rs`: Presence < 1.0 → substrate T3 → `engine_running` flips to false; then Presence > 1.0 → flips back) |
| Benchmark scripts | Done. [bench-loss.sh](scripts/bench-loss.sh) — entity × loss sweep, **bidirectional `tc-netem` via `ifb` on the CM5** (BIDI=1 default). [bench-scaling.sh](scripts/bench-scaling.sh) — T1 rate sweep + optional substrate-side `APP_NETWORK__SYNTHETIC_T3_RATE_HZ`. [verify-netem.sh](scripts/verify-netem.sh) — sanity-check netem on the right interface in the right direction (BIDI=1 mode covers ingress via ifb) |
| CM5 deploy | Done — `make build-cm5 && make deploy-cm5`; [setup-cm5.sh](scripts/setup-cm5.sh) provisions deps. Bench has been run end-to-end on CM5; data lives in [data/two_machine/final_table.csv](data/two_machine/final_table.csv) |
**Paper:**
| Area | State |
|------|-------|
| Track + topics chosen | Done — UCAmI Track 2 (IoE and Sensors); primary *IoE interoperability, integration and performance*; secondary *IoE experimental results and deployment scenarios* |
| Abstract | Done. Honest framing: "tick rate remains an order of magnitude above the cadence required" (not "stable"), mixed-reliability isolation as the T1-vs-T3 story, 0.12 MB/1k slope |
| Tables 2/3/4 from real CM5 data | Done. Native markdown tables driven by inline `{python}` values reading from `data/two_machine/final_table.csv`; cross-refs (`@tbl-latency`, `@tbl-throughput`, `@tbl-t3-rtt`) resolve in the LNCS LaTeX output. Earlier `display(Markdown(...))` approach didn't register with Quarto's cross-ref filter; switched to native md tables with inline-python cells |
| `fig-isolation` | **Dropped.** Cross-tier story now told by `tbl-latency` + `tbl-t3-rtt` (T1 flat under loss, T3 absorbs ~38 ms retransmit). Cleaner than the loopback fig. `data/local/cross_tier.csv` is still on disk but the paper no longer reads it |
| Architecture §3 + Table 1 | Updated for substrate-initiated T3. Table 1 T3 row reads "OutboundT3 enqueue + ack \| Bidirectional stream (server-initiated)"; the connection-registry / per-device routing is described in the prose |
| Implementation §4 Automation paragraph | Updated for the new outbound T3 path; describes the per-device registry, the per-command bi-stream, and the simulator-side `run_command_receiver` engine-state flip |
| Discussion + Conclusion | Honest now: drops the unbacked "<5% IngestSystem drain" and "Grafana adds no overhead" claims; conclusion populates both 0%-loss and 5%-loss Hz from data |
| Render | Clean against LNCS LaTeX template (`make render` → 10-page PDF, no Quarto warnings) |
## Roadmap
Treat the milestone log as historical. The paper-side work below tracks what's *left* before camera-ready.
- **M1 — Wire codec & root config.** ✅ 2026-05-04.
- **M2 — Quinn server + TLS.** ✅ 2026-05-06.
- **M3 — Simulator client.** ✅ Done. `SimulatorClient` + CLI driver + waveform profiles + HTTP trigger + closed-loop command receiver.
- **M4 — ECS world.** ✅ Done. 5 systems wired; automation closes the T3 loop.
- **M5 — Observability.** ✅ Done. Both dashboards live; metrics exposed via prometheus scrape.
- **M6 — Benchmark harness.** ✅ Done. `bench-loss.sh` + `bench-scaling.sh` + `verify-netem.sh` (last one added when egress-only netem was masking the inbound T1 loss path; now `ifb` ingress shaping is default).
- **M7 — CM5 cross-compile & deploy.** ✅ Done. Multiple sweeps shipped from CM5.
- **M8 — Two-machine run + paper render.** ✅ Done. Paper renders against [data/two_machine/final_table.csv](data/two_machine/final_table.csv); all inline scalars and tables populate from real numbers.
- **M9 — T3 inversion (substrate-initiated actuator commands).** ✅ 2026-05-13. The paper's Table 1 said T3 was "actuator commands" but the code had it inverted (device → substrate RPC). Refactored to match the paper: substrate opens bi-streams, simulator's `run_command_receiver` accepts. Full closed-loop integration test landed.
- **M10 — Abstract submission polish.** ⏳ In progress. Top-of-paper fixes shipped (abstract framing, contributions paragraph, Table 1 T3 row, Architecture §3 backpressure paragraph, author affiliation, `(author?)` cite markers). Remaining polish is full-paper-only (Implementation §4 module-list lies, code listing with fake types, Observability §4.2 push-vs-pull mismatch, Experimental Setup §5.1 stale tc-netem / tick counts / loopback-vs-two-machine sentence). None block abstract submission.
**Open polish items** (not blocking abstract submission):
- §4.1 *Integrated Prototype* still lists six systems including a non-existent `FaultInjection`; module list says `transport.rs` / `world.rs` / `metrics.rs` / `main.rs` but the actual layout is `transport/`, `world/`, `observability.rs`, `config.rs`, `main.rs`, `lib.rs` plus a separate `simulator` crate.
- §4.1 code listing uses fictional types (`AssetId`, `EntityMap`, `TickDiagnostics`). Easier to drop the listing than to rewrite faithfully.
- §4.2 *Observability Stack* describes a push model with InfluxDB line protocol; actual code uses `metrics-exporter-prometheus` exposing `/metrics` for VM scrape.
- §5.1 *Experimental Setup* needs three updates: tc-netem direction (now bidirectional via `ifb`), "2,000 warmup ticks and 5,000 measurement ticks" → "20 s warmup + 50 s window (wall-clock)", and drop the "loopback for latency / two-machine for throughput" sentence (all numbers are from the two-machine sweep now).
## Conventions
- **Rust:** edition 2024; workspace at root with `simulator` + `substrate`.
- **Pinned crates:** Bevy 0.18, Quinn 0.11, rustls 0.23, Tokio 1 (full), figment 0.10 (toml + env), uuid 1.23 (v4), serde 1.
- **Config:** `figment` chain — defaults → `config.toml` → env `APP_*` with `__` nesting (e.g. `APP_NETWORK__SERVER_PORT=9000`, `APP_NETWORK__SYNTHETIC_T3_RATE_HZ=100`).
- **Bevy:** headless — `MinimalPlugins` only; do not pull rendering plugins.
- **Tokio↔Bevy:** keep the dedicated-thread + mpsc pattern in [transport/ecs.rs](substrate/src/transport/ecs.rs); do not block the ECS schedule on async work.
- **Paper:** Quarto + LNCS template ([paper/_extensions/template.tex](paper/_extensions/template.tex), [paper/_quarto.yml](paper/_quarto.yml)). **Never commit `llncs.cls` or `splncs04.bst`** — CTAN licensing; download per [README.md](README.md). For tables in LaTeX target, use native markdown tables with `: Caption {#tbl-foo}` syntax and inline `{python}` cells, **not** `display(Markdown(...))` chunks — Quarto's cross-ref filter doesn't pick the latter up in LaTeX output.
- **Data:** raw CSVs under `data/` are committed; `*_processed.csv` is gitignored. Paper figures consume `data/two_machine/final_table.csv` exclusively (the previous `data/loopback/` was renamed to `data/two_machine/` once it became the real CM5 sweep).
- **Errors:** `anyhow` (with `.context()`) for internal startup paths; `thiserror` for boundary types we want to match against (e.g. `WireError` in the codec).
- **Warnings:** let real warnings show. No `#[allow(dead_code)]`, `_var` blanket suppression, or `PhantomData` shims to silence the compiler — warnings are honest TODO markers and disappear when the consuming code lands.
## Known deferrals
- **Channel ownership is per-host, not per-connection.** All connections share the same inbound mpsc channels and the outbound T3 channel. Fairness under N-device load relies on tokio scheduling. Acceptable for "one ECS world per host".
- **No graceful shutdown.** The `quic-runtime` thread parks on `pending()`; spawned tasks orphan at process exit. Fine for research runs.
- **Bind failure is fatal.** `OnEnter(Starting)` panics if `bind_endpoint` fails.
- **T3 outbound concurrency is unbounded.** `drain_outbound_t3` spawns one task per command. Under sustained T1 ingest beyond ~10k msg/s the per-command tasks queue behind the tokio scheduler and T3 P99 climbs into the hundreds of ms (throughput still holds). If we ever need strict T3 latency isolation under heavy T1 load, add a `tokio::sync::Semaphore` cap or a dedicated runtime/thread for T3.
- **NTP drift over a long bench shifts the across-row T1 P99 baseline.** Visible in `tbl-latency` (47 ms at 50k → 28 ms at 200k). The within-row Δ is what speaks to isolation; the across-row absolutes don't. Paper caption explains this.
- **Schedule rate-gating is approximate.** Observed `tick_hz` runs ~85% of target on macOS dev; tighter on the CM5.
## Run / verify
```bash
make certs # dev TLS (ECDSA P-256, SAN: localhost/cm5.local/127.0.0.1/::1)
make build # cargo build --release native
make build-cm5 # aarch64 cross-build
make deploy-cm5 # scp to $CM5_HOST
make render # paper PDF
make preview # live-reload paper at :4848
make monitoring-up # docker-compose VM + Grafana
```
**Tests.** `cargo test --workspace` runs codec unit tests + world unit tests + 5 integration tests (T1, T2, full closed-loop) in [simulator/tests/](simulator/tests/). Each integration test calls `bind_endpoint` + `accept_loop` in-process on `127.0.0.1:0`. The full-loop test stands up the real outbound machinery (`accept_loop` + `drain_outbound_t3`) and asserts the engine-state flag flips in both directions.
**Metrics scrape.** With `metrics_enabled = true` (default):
```bash
curl http://127.0.0.1:9100/metrics
```
`make monitoring-up` brings up VictoriaMetrics + Grafana auto-provisioned at <http://localhost:3000> (admin / admin); the dashboards mount live from [dashboards/](dashboards/) so JSON edits re-import within ~10 s.
**Full-stack demo.** [scripts/demo.sh](scripts/demo.sh) brings up certs + cargo build + monitoring stack + substrate + simulator and tails the simulator's progress log. Industrial profile by default; Presence dips below threshold every few seconds, triggering substrate-initiated T3 Relay setpoints, visible on the operator dashboard as Current collapsing to ~0 A while Voltage holds.
```bash
./scripts/demo.sh # defaults
PROFILE=single RATE_HZ=100 DEVICES=20 ./scripts/demo.sh
KEEP_MONITORING=1 ./scripts/demo.sh # leave VM + Grafana running on exit
```
**Manual two-process run.** From the repo root:
```bash
# shell 1 — server
cargo run -p substrate
# shell 2 — client
cargo run -p simulator -- --profile industrial --rate-hz 100 --count 0 --devices 4
```
Simulator flags (see `cargo run -p simulator -- --help`): `--addr`, `--server-name`, `--cert`, `--profile {single, industrial}`, `--sensor-type`, `--sensor-id`, `--rate-hz` (T1 datagram rate; `0` disables T1), `--t2-rate-hz` (T2 event rate; `0` disables T2), `--count` (T1 count; `0` = until Ctrl-C), `--devices`. **No simulator-side T3 flag** — T3 is substrate-initiated. Per-second `progress` lines show `t1_sent`/`t2_sent`/`engine={running,stopped}`.
**Bidirectional netem on the CM5.** [scripts/bench-loss.sh](scripts/bench-loss.sh) applies `tc netem loss N%` bidirectionally via an `ifb` ingress-redirect (`BIDI=1` default). [scripts/verify-netem.sh](scripts/verify-netem.sh) confirms it lands on the right interface:
```bash
./scripts/verify-netem.sh <peer-ip> end0 5 # egress only
BIDI=1 ./scripts/verify-netem.sh <peer-ip> end0 5 # both directions via ifb
```
## Key references
- Prior self-citations: `plantevin2026ecs`, `plantevin2026quic` (both IEEE SWC 2026, "to appear").
- QUIC: RFC 9000 (core), RFC 9221 (unreliable datagrams).
- DT foundations: Tao et al. 2019; Grieves & Vickers 2017; Minerva et al. 2020.
- ECS: Nystrom 2014, *Game Programming Patterns*.
- Mixed-reliability transport: Peeck et al. (W2RP for DDS).
- DT sync metrics: Çakır et al. 2023 (Twin Alignment Ratio); Bellavista et al. 2023 (ODTE).
- Industrial QUIC/IIoT: Fernández et al. 2021; Boeding et al. 2025.
- Full bibliography: [paper/references.bib](paper/references.bib).

View File

@@ -1,3 +1,30 @@
[workspace] [workspace]
resolver = "3" resolver = "3"
members = ["simulator", "substrate"] members = ["simulator", "substrate"]
# Enable a small amount of optimization in the dev profile.
[profile.dev]
opt-level = 1
# Enable a large amount of optimization in the dev profile for dependencies.
[profile.dev.package."*"]
opt-level = 3
# Enable more optimization in the release profile at the cost of compile time.
[profile.release]
# Compile the entire crate as one unit.
# Slows compile times, marginal improvements.
codegen-units = 1
# Do a second optimization pass over the entire program, including dependencies.
# Slows compile times, marginal improvements.
lto = "thin"
# Optimize for size in the wasm-release profile to reduce load times and bandwidth usage on web.
[profile.wasm-release]
# Default to release profile values.
inherits = "release"
# Optimize with size in mind (also try "z", sometimes it is better).
# Slightly slows compile times, great improvements to file size and runtime performance.
opt-level = "s"
# Strip all debugging information from the binary to slightly reduce file size.
strip = "debuginfo"

View File

@@ -1,14 +1,19 @@
# ============================================================ # ============================================================
# quic_ecs_dt — top-level Makefile # quic_ecs_dt — top-level Makefile
# Targets: # Targets:
# make demo — one-shot: certs → build → VM+Grafana →
# substrate → simulator (Ctrl-C cleans up)
# make render — build the paper PDF # make render — build the paper PDF
# make preview — live-reload preview in browser # make preview — live-reload preview in browser
# make build — cargo build --release (native) # make build — cargo build --release (native)
# make build-cm5 — cargo build --release (aarch64 cross) # make build-cm5 — cargo build --release (aarch64 cross)
# make monitoring-up — start VictoriaMetrics + Grafana (docker)
# make monitoring-down — stop them
# make monitoring-logs — tail the monitoring stack
# make clean — remove generated outputs # make clean — remove generated outputs
# ============================================================ # ============================================================
.PHONY: render preview build build-cm5 clean .PHONY: render preview build build-cm5 clean certs monitoring-up monitoring-down monitoring-logs demo
VENV := $(HOME)/.venv/quic_ecs VENV := $(HOME)/.venv/quic_ecs
PYTHON := $(VENV)/bin/python PYTHON := $(VENV)/bin/python
@@ -16,6 +21,22 @@ CM5_HOST ?= 192.168.1.x
CM5_USER ?= pi CM5_USER ?= pi
CM5_BIN_DIR ?= /home/pi/quic_ecs_dt CM5_BIN_DIR ?= /home/pi/quic_ecs_dt
# Self-signed dev TLS for the QUIC server (regenerate with `make certs`).
# SAN covers loopback, ::1, and cm5.local for the two-machine setup.
CERT_DIR := certs
CERT_FILE := $(CERT_DIR)/server.crt
KEY_FILE := $(CERT_DIR)/server.key
certs: $(CERT_FILE)
$(CERT_FILE):
mkdir -p $(CERT_DIR)
openssl req -x509 -newkey ec -pkeyopt ec_paramgen_curve:P-256 \
-keyout $(KEY_FILE) -out $(CERT_FILE) \
-days 3650 -nodes \
-subj "/CN=localhost/O=quic_ecs_dt-dev/OU=substrate" \
-addext "subjectAltName=DNS:localhost,DNS:cm5.local,IP:127.0.0.1,IP:::1"
# Paper # Paper
render: render:
cd paper && quarto render index.qmd cd paper && quarto render index.qmd
@@ -23,11 +44,11 @@ render:
preview: preview:
cd paper && quarto preview index.qmd --port 4848 --no-browser cd paper && quarto preview index.qmd --port 4848 --no-browser
# Rust build # Rust build (depends on dev cert so `cargo run` boots out of the box)
build: build: $(CERT_FILE)
cargo build --release cargo build --release
build-cm5: build-cm5: $(CERT_FILE)
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \ CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \
cargo build --release --target aarch64-unknown-linux-gnu cargo build --release --target aarch64-unknown-linux-gnu
@@ -37,6 +58,25 @@ deploy-cm5: build-cm5
scp target/aarch64-unknown-linux-gnu/release/quic_ecs_dt \ scp target/aarch64-unknown-linux-gnu/release/quic_ecs_dt \
$(CM5_USER)@$(CM5_HOST):$(CM5_BIN_DIR)/ $(CM5_USER)@$(CM5_HOST):$(CM5_BIN_DIR)/
# One-shot demo runner — see scripts/demo.sh
demo:
@./scripts/demo.sh
# Monitoring (VictoriaMetrics + Grafana, auto-provisioned)
monitoring-up:
docker compose -f monitoring/docker-compose.yml up -d
@echo ""
@echo "Grafana: http://localhost:3000 (admin / admin, or anonymous Admin)"
@echo " • runtime dashboard: quic_ecs_dt → quic_ecs_dt — substrate runtime"
@echo " • sensors dashboard: quic_ecs_dt → quic_ecs_dt — sensors"
@echo "VictoriaMetrics: http://localhost:8428"
monitoring-down:
docker compose -f monitoring/docker-compose.yml down
monitoring-logs:
docker compose -f monitoring/docker-compose.yml logs -f
# Clean # Clean
clean: clean:
cargo clean cargo clean

25
config.toml Normal file
View File

@@ -0,0 +1,25 @@
# quic_ecs_dt — substrate runtime config
#
# Resolution order (figment chain in substrate/src/config.rs):
# 1. compile-time defaults
# 2. this file
# 3. APP_* env vars (e.g. APP_NETWORK__SERVER_PORT=9001)
#
# All paths are resolved relative to the cwd at launch — run from the repo root.
[network]
server_port = 9000
server_interface = "0.0.0.0"
server_cert = "certs/server.crt"
server_key = "certs/server.key"
t1_capacity = 1024
t2_capacity = 512
t3_capacity = 256
[simulation]
tick_rate_hz = 60
max_entities = 10000
[observability]
metrics_enabled = true
metrics_listen = "0.0.0.0:9100"

150
dashboards/runtime.json Normal file
View File

@@ -0,0 +1,150 @@
{
"title": "quic_ecs_dt — substrate runtime",
"uid": "quic-ecs-dt-runtime",
"schemaVersion": 39,
"version": 1,
"timezone": "",
"refresh": "5s",
"time": { "from": "now-15m", "to": "now" },
"tags": ["quic_ecs_dt", "ucami2026", "substrate"],
"templating": {
"list": [
{
"name": "datasource",
"label": "Data source",
"type": "datasource",
"query": "prometheus",
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
"hide": 0
}
]
},
"panels": [
{
"id": 1,
"title": "Tick rate (Hz)",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "hertz", "decimals": 1 } },
"targets": [
{ "expr": "substrate_tick_hz", "refId": "A", "legendFormat": "tick_hz" }
]
},
{
"id": 2,
"title": "Entities",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"targets": [
{ "expr": "substrate_entities", "refId": "A", "legendFormat": "entities" }
]
},
{
"id": 3,
"title": "RSS",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "bytes", "decimals": 1 } },
"targets": [
{ "expr": "substrate_rss_bytes", "refId": "A", "legendFormat": "rss" }
]
},
{
"id": 4,
"title": "T3 outbound — dropped / no-route / errors (cumulative)",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"targets": [
{ "expr": "substrate_t3_outbound_dropped_total", "refId": "A", "legendFormat": "dropped" },
{ "expr": "substrate_t3_outbound_no_route_total", "refId": "B", "legendFormat": "no_route" },
{ "expr": "substrate_t3_outbound_errors_total", "refId": "C", "legendFormat": "errors" }
]
},
{
"id": 5,
"title": "Per-tier receive rate",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "cps" } },
"targets": [
{
"expr": "rate(substrate_received_total[1m])",
"refId": "A",
"legendFormat": "received {{tier}}"
},
{
"expr": "rate(substrate_dropped_total[1m])",
"refId": "B",
"legendFormat": "dropped {{tier}}"
}
]
},
{
"id": 6,
"title": "Per-tier latency (µs)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "µs", "decimals": 0 } },
"targets": [
{
"expr": "substrate_latency_us{quantile=\"0.5\"}",
"refId": "A",
"legendFormat": "p50 {{tier}}"
},
{
"expr": "substrate_latency_us{quantile=\"0.99\"}",
"refId": "B",
"legendFormat": "p99 {{tier}}"
},
{
"expr": "substrate_latency_us{quantile=\"0.999\"}",
"refId": "C",
"legendFormat": "p999 {{tier}}"
}
]
},
{
"id": 7,
"title": "Channel depth (vs. capacity)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"targets": [
{
"expr": "substrate_channel_depth",
"refId": "A",
"legendFormat": "depth {{tier}}"
},
{
"expr": "substrate_channel_capacity",
"refId": "B",
"legendFormat": "capacity {{tier}}"
}
]
},
{
"id": 8,
"title": "Decode errors (rate)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "cps" } },
"targets": [
{
"expr": "rate(substrate_decode_errors_total[1m])",
"refId": "A",
"legendFormat": "decode_errors {{tier}}"
}
]
}
]
}

314
dashboards/sensors.json Normal file
View File

@@ -0,0 +1,314 @@
{
"title": "quic_ecs_dt — sensors",
"uid": "quic-ecs-dt-sensors",
"schemaVersion": 39,
"version": 1,
"timezone": "",
"refresh": "1s",
"time": { "from": "now-5m", "to": "now" },
"tags": ["quic_ecs_dt", "ucami2026", "sensors"],
"templating": {
"list": [
{
"name": "datasource",
"label": "Data source",
"type": "datasource",
"query": "prometheus",
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
"hide": 0
}
]
},
"panels": [
{
"id": 1,
"title": "Temperature — mean (thermometer)",
"type": "gauge",
"gridPos": { "h": 8, "w": 6, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"options": {
"showThresholdLabels": false,
"showThresholdMarkers": true,
"orientation": "vertical"
},
"fieldConfig": {
"defaults": {
"unit": "celsius",
"decimals": 1,
"min": -20,
"max": 80,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "blue", "value": null },
{ "color": "green", "value": 10 },
{ "color": "yellow", "value": 30 },
{ "color": "orange", "value": 50 },
{ "color": "red", "value": 70 }
]
}
}
},
"targets": [
{
"expr": "sensor_aggregate{type=\"temperature\", stat=\"mean\"}",
"refId": "A",
"legendFormat": "T mean"
}
]
},
{
"id": 2,
"title": "Humidity — mean",
"type": "gauge",
"gridPos": { "h": 8, "w": 6, "x": 6, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"options": { "showThresholdMarkers": true, "orientation": "vertical" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"decimals": 1,
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "orange", "value": null },
{ "color": "green", "value": 30 },
{ "color": "blue", "value": 70 }
]
}
}
},
"targets": [
{
"expr": "sensor_aggregate{type=\"humidity\", stat=\"mean\"}",
"refId": "A",
"legendFormat": "RH mean"
}
]
},
{
"id": 3,
"title": "Pressure — mean",
"type": "stat",
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"options": { "graphMode": "area", "colorMode": "value" },
"fieldConfig": {
"defaults": {
"unit": "pressurehpa",
"decimals": 1,
"min": 980,
"max": 1040,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "blue", "value": null },
{ "color": "green", "value": 1000 },
{ "color": "yellow", "value": 1025 }
]
}
}
},
"targets": [
{
"expr": "sensor_aggregate{type=\"pressure\", stat=\"mean\"}",
"refId": "A",
"legendFormat": "P mean"
}
]
},
{
"id": 4,
"title": "Voltage — mean",
"type": "stat",
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"options": { "graphMode": "area", "colorMode": "value" },
"fieldConfig": {
"defaults": {
"unit": "volt",
"decimals": 2,
"min": 220,
"max": 240,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "yellow", "value": null },
{ "color": "green", "value": 225 },
{ "color": "yellow", "value": 235 }
]
}
}
},
"targets": [
{
"expr": "sensor_aggregate{type=\"voltage\", stat=\"mean\"}",
"refId": "A",
"legendFormat": "V mean"
}
]
},
{
"id": 5,
"title": "Current — mean",
"type": "stat",
"gridPos": { "h": 8, "w": 6, "x": 0, "y": 8 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"options": { "graphMode": "area", "colorMode": "value" },
"fieldConfig": {
"defaults": {
"unit": "amp",
"decimals": 2,
"min": 0,
"max": 30,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 20 },
{ "color": "red", "value": 25 }
]
}
}
},
"targets": [
{
"expr": "sensor_aggregate{type=\"current\", stat=\"mean\"}",
"refId": "A",
"legendFormat": "I mean"
}
]
},
{
"id": 6,
"title": "Sensor count by type",
"type": "stat",
"gridPos": { "h": 8, "w": 6, "x": 6, "y": 8 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"options": { "colorMode": "value", "graphMode": "none" },
"targets": [
{
"expr": "sensor_aggregate{stat=\"count\"}",
"refId": "A",
"legendFormat": "{{type}}"
}
]
},
{
"id": 7,
"title": "Temperature — min / mean / max over time",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "celsius", "decimals": 1 } },
"targets": [
{
"expr": "sensor_aggregate{type=\"temperature\", stat=\"min\"}",
"refId": "A",
"legendFormat": "min"
},
{
"expr": "sensor_aggregate{type=\"temperature\", stat=\"mean\"}",
"refId": "B",
"legendFormat": "mean"
},
{
"expr": "sensor_aggregate{type=\"temperature\", stat=\"max\"}",
"refId": "C",
"legendFormat": "max"
}
]
},
{
"id": 8,
"title": "All sensor types — mean over time",
"type": "timeseries",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short", "decimals": 2 } },
"targets": [
{
"expr": "sensor_aggregate{stat=\"mean\"}",
"refId": "A",
"legendFormat": "{{type}}"
}
]
},
{
"id": 9,
"title": "Threshold crossings (cumulative) — per type / direction",
"description": "Each time a sensor's smoothed mean crosses its per-type threshold, simulation_system increments the counter. up = rising through threshold; down = falling through. The counter being non-zero is the load-bearing evidence that the ECS runs the digital-twin transform — not just write-through ingest.",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"targets": [
{
"expr": "substrate_threshold_crossings_total",
"refId": "A",
"legendFormat": "{{type}} {{direction}}"
}
]
},
{
"id": 10,
"title": "Threshold crossings — rate (events/min)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "cpm" } },
"targets": [
{
"expr": "60 * rate(substrate_threshold_crossings_total[1m])",
"refId": "A",
"legendFormat": "{{type}} {{direction}}"
}
]
},
{
"id": 11,
"title": "Machine State (Relay)",
"type": "stat",
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": { "0": { "color": "green", "index": 0, "text": "RUNNING" } },
"type": "value"
},
{
"options": { "1": { "color": "red", "index": 1, "text": "STOPPED" } },
"type": "value"
}
]
}
},
"targets": [
{
"expr": "sensor_aggregate{type=\"relay\", stat=\"max\"}",
"refId": "A"
}
]
},
{
"id": 12,
"title": "Manual Control",
"type": "text",
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 16 },
"options": {
"mode": "html",
"content": "<div style=\"text-align: center; margin-top: 20px;\">\n <button onclick=\"fetch('http://localhost:9002/trigger', {method: 'POST'})\" style=\"padding: 15px 30px; font-size: 20px; background: #e24d42; color: white; border: none; border-radius: 8px; cursor: pointer; font-weight: bold; width: 100%;\">\n 🚨 TRIGGER PRESENCE\n </button>\n</div>"
}
}
]
}

10
data/local/cross_tier.csv Normal file
View File

@@ -0,0 +1,10 @@
rate_hz,t3_rate_hz,devices,tick_rate_hz,window_s,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t3_received,t3_no_route,t3_p50_us,t3_p99_us,t3_p999_us,tick_hz,rss_mb,channel_depth_max
100,100,100,1000,20,2112,0,176.99119972210946,510.0455399653837,672.0280069751235,211200,0,564.025811835713,1341.9781275573005,1703.9425973187597,13946.3,29.2,0
500,100,100,1000,20,10520,0,95.00219629040446,524.0043657507142,715.0124941719293,210368,0,504.9705020304005,1258.0271498584798,1638.1126249843164,14002.0,151.5,1
1000,100,100,1000,20,21944,0,338.4918497163353,237494.56934026288,237494.56934026288,217836,9,380.73363687095235,627.9747863104398,635.9373273086428,13942.4,199.7,1
5000,100,100,1000,20,111450,0,1795.609899294385,1795.609899294385,1795.609899294385,223000,0,2419.9448290355635,2419.9448290355635,2419.9448290355635,13929.9,201.1,5
10000,100,100,1000,20,219590,0,1311.9895688896459,920525.5544660349,920525.5544660349,219600,0,1636.802658936246,1148422.7549491294,1148422.7549491294,14037.3,201.9,20
25000,100,100,1000,20,557957,0,1311.9895688896459,556765.8419787771,835094.3909107508,223463,0,1636.802658936246,698506.6931823627,1016931.2186262821,13937.7,202.9,0
50000,100,100,1000,20,1086986,0,975.6461973165656,394470.657661692,649462.2810711588,218948,0,1204.114858380829,504892.1084376436,736820.8341327198,13540.9,205.6,0
100000,100,100,1000,20,2125545,0,1870.0118002303525,1870.0118002303525,1870.0118002303525,223374,0,2357.3656413619497,1653988.2370638065,1653988.2370638065,13163.2,209.2,67
250000,100,100,1000,20,5338750,88,1870.0118002303525,1870.0118002303525,266918.87083241716,219705,0,2357.3656413619497,978621.3172154345,1468423.6586512772,12357.8,219.5,112
1 rate_hz t3_rate_hz devices tick_rate_hz window_s t1_received t1_dropped t1_p50_us t1_p99_us t1_p999_us t3_received t3_no_route t3_p50_us t3_p99_us t3_p999_us tick_hz rss_mb channel_depth_max
2 100 100 100 1000 20 2112 0 176.99119972210946 510.0455399653837 672.0280069751235 211200 0 564.025811835713 1341.9781275573005 1703.9425973187597 13946.3 29.2 0
3 500 100 100 1000 20 10520 0 95.00219629040446 524.0043657507142 715.0124941719293 210368 0 504.9705020304005 1258.0271498584798 1638.1126249843164 14002.0 151.5 1
4 1000 100 100 1000 20 21944 0 338.4918497163353 237494.56934026288 237494.56934026288 217836 9 380.73363687095235 627.9747863104398 635.9373273086428 13942.4 199.7 1
5 5000 100 100 1000 20 111450 0 1795.609899294385 1795.609899294385 1795.609899294385 223000 0 2419.9448290355635 2419.9448290355635 2419.9448290355635 13929.9 201.1 5
6 10000 100 100 1000 20 219590 0 1311.9895688896459 920525.5544660349 920525.5544660349 219600 0 1636.802658936246 1148422.7549491294 1148422.7549491294 14037.3 201.9 20
7 25000 100 100 1000 20 557957 0 1311.9895688896459 556765.8419787771 835094.3909107508 223463 0 1636.802658936246 698506.6931823627 1016931.2186262821 13937.7 202.9 0
8 50000 100 100 1000 20 1086986 0 975.6461973165656 394470.657661692 649462.2810711588 218948 0 1204.114858380829 504892.1084376436 736820.8341327198 13540.9 205.6 0
9 100000 100 100 1000 20 2125545 0 1870.0118002303525 1870.0118002303525 1870.0118002303525 223374 0 2357.3656413619497 1653988.2370638065 1653988.2370638065 13163.2 209.2 67
10 250000 100 100 1000 20 5338750 88 1870.0118002303525 1870.0118002303525 266918.87083241716 219705 0 2357.3656413619497 978621.3172154345 1468423.6586512772 12357.8 219.5 112

10
data/local/scaling.csv Normal file
View File

@@ -0,0 +1,10 @@
rate_hz,devices,tick_rate_hz,window_s,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,tick_hz,rss_mb,channel_depth_max
100,100,0,25,2715,0,10287.656173771804,20683.6751522136,20899.90783549675,52.1,28.2,1
500,100,0,25,13595,0,9945.744255905174,20441.042134756957,20879.018374063122,51.0,29.8,1
1000,100,0,25,27324,0,9858.605678238058,20371.66060670275,20862.321838812768,51.6,30.3,2
5000,100,0,25,136305,0,9700.182954474827,20144.770960915914,20803.98904149668,52.2,31.4,10
10000,100,0,25,273443,0,9680.801975940145,20164.925807687836,20874.842987926906,51.9,31.9,10
25000,100,0,25,685150,0,9466.362697231909,19813.128013911944,20766.575543347255,51.6,33.2,50
50000,100,0,25,1371659,4515,9349.704574533685,19635.60989099387,20477.86914508828,51.5,33.3,100
100000,100,0,25,2740689,1266351,13177.946960597013,20502.4573381096,28455.593524841766,53.0,35.2,200
250000,100,0,25,6826035,5353528,16234.599694958577,20696.089081152582,22046.299162128806,53.2,35.6,747
1 rate_hz devices tick_rate_hz window_s t1_received t1_dropped t1_p50_us t1_p99_us t1_p999_us tick_hz rss_mb channel_depth_max
2 100 100 0 25 2715 0 10287.656173771804 20683.6751522136 20899.90783549675 52.1 28.2 1
3 500 100 0 25 13595 0 9945.744255905174 20441.042134756957 20879.018374063122 51.0 29.8 1
4 1000 100 0 25 27324 0 9858.605678238058 20371.66060670275 20862.321838812768 51.6 30.3 2
5 5000 100 0 25 136305 0 9700.182954474827 20144.770960915914 20803.98904149668 52.2 31.4 10
6 10000 100 0 25 273443 0 9680.801975940145 20164.925807687836 20874.842987926906 51.9 31.9 10
7 25000 100 0 25 685150 0 9466.362697231909 19813.128013911944 20766.575543347255 51.6 33.2 50
8 50000 100 0 25 1371659 4515 9349.704574533685 19635.60989099387 20477.86914508828 51.5 33.3 100
9 100000 100 0 25 2740689 1266351 13177.946960597013 20502.4573381096 28455.593524841766 53.0 35.2 200
10 250000 100 0 25 6826035 5353528 16234.599694958577 20696.089081152582 22046.299162128806 53.2 35.6 747

View File

@@ -0,0 +1,13 @@
entities,loss_pct,devices,rate_hz,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t2_p99_us,t3_rtt_us,hz,rss_mb
10000,0,1428,100,5001,0,44752.3855515344,47748.36148702807,50397.66033676437,0,8722.838894764744,41320.6,12.0
10000,1,1428,100,4953,0,44600.485816033215,47216.56346457678,50680.67894808368,0,15710.743037060898,22995.3,15.7
10000,5,1428,100,4764,0,44413.55660111995,47018.669765038576,49986.08926817951,0,47188.24202368559,16083.8,19.1
50000,0,7142,100,5001,0,44209.72341582725,47037.480995002545,51220.75147425269,0,8246.136157356706,12274.5,21.8
50000,1,7142,100,4958,0,43169.95432732156,46106.07646391941,49064.94123794666,0,45003.701940576815,9948.8,26.0
50000,5,7142,100,4722,0,41902.46901208979,44564.819695611885,47093.95985292207,0,46934.112283743336,8408.7,27.8
100000,0,14285,100,5001,0,28501.158917226712,31586.97179314163,35379.931440675005,0,8815.792772554856,7218.0,29.7
100000,1,14285,100,4958,0,26975.923609671478,29842.834189421104,33890.839301955544,0,15004.41885528808,6340.5,33.5
100000,5,14285,100,4777,0,25850.882764924136,29158.449987327036,32692.46916670467,0,47472.222566461445,5659.4,40.0
200000,0,28571,100,5002,0,24762.85504025898,27697.571839159074,30856.417471203215,0,9421.033034357904,5084.7,41.9
200000,1,28571,100,4947,0,23787.13170063811,26932.79664252641,33213.11243199952,0,14540.600163412104,4628.7,43.6
200000,5,28571,100,4754,0,22881.866694419987,26204.85633609517,29735.59313569874,0,46597.400291943064,4259.0,45.5
1 entities loss_pct devices rate_hz t1_received t1_dropped t1_p50_us t1_p99_us t1_p999_us t2_p99_us t3_rtt_us hz rss_mb
2 10000 0 1428 100 5001 0 44752.3855515344 47748.36148702807 50397.66033676437 0 8722.838894764744 41320.6 12.0
3 10000 1 1428 100 4953 0 44600.485816033215 47216.56346457678 50680.67894808368 0 15710.743037060898 22995.3 15.7
4 10000 5 1428 100 4764 0 44413.55660111995 47018.669765038576 49986.08926817951 0 47188.24202368559 16083.8 19.1
5 50000 0 7142 100 5001 0 44209.72341582725 47037.480995002545 51220.75147425269 0 8246.136157356706 12274.5 21.8
6 50000 1 7142 100 4958 0 43169.95432732156 46106.07646391941 49064.94123794666 0 45003.701940576815 9948.8 26.0
7 50000 5 7142 100 4722 0 41902.46901208979 44564.819695611885 47093.95985292207 0 46934.112283743336 8408.7 27.8
8 100000 0 14285 100 5001 0 28501.158917226712 31586.97179314163 35379.931440675005 0 8815.792772554856 7218.0 29.7
9 100000 1 14285 100 4958 0 26975.923609671478 29842.834189421104 33890.839301955544 0 15004.41885528808 6340.5 33.5
10 100000 5 14285 100 4777 0 25850.882764924136 29158.449987327036 32692.46916670467 0 47472.222566461445 5659.4 40.0
11 200000 0 28571 100 5002 0 24762.85504025898 27697.571839159074 30856.417471203215 0 9421.033034357904 5084.7 41.9
12 200000 1 28571 100 4947 0 23787.13170063811 26932.79664252641 33213.11243199952 0 14540.600163412104 4628.7 43.6
13 200000 5 28571 100 4754 0 22881.866694419987 26204.85633609517 29735.59313569874 0 46597.400291943064 4259.0 45.5

View File

@@ -0,0 +1,13 @@
entities,loss_pct,devices,rate_hz,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t2_p99_us,t3_rtt_us,hz,rss_mb
10000,0,2000,100,5002,0,88406.43436980792,92088.01036052403,95215.6914367099,0,0,41812.4,11.4
10000,1,2000,100,5001,0,88885.12040609193,92051.1825223156,94381.46931014946,0,0,23186.8,14.4
10000,5,2000,100,5002,0,70213.79159588156,74050.31194954121,79276.73877316424,0,0,16132.9,17.3
50000,0,10000,100,5001,0,70467.01677831604,74481.05169804857,78660.78554913378,0,0,12318.6,20.0
50000,1,10000,100,4999,0,70990.40685911797,74854.38952456272,80926.7766369622,0,0,9959.4,23.7
50000,5,10000,100,5001,0,71389.06834919532,74944.26870901692,79070.8869749687,0,0,8396.9,25.4
100000,0,20000,100,5000,0,71675.19649834004,75365.13393797085,78976.05881855593,0,0,7224.8,27.1
100000,1,20000,100,4998,0,72106.54041649138,76718.64382761203,81527.85603693608,0,0,6353.7,30.9
100000,5,20000,100,4997,0,72453.48380965949,75894.54063901275,78049.6180715917,0,0,5660.0,36.9
200000,0,40000,100,4992,0,72758.4283780017,77815.82008892013,81936.51611730906,0,0,5062.7,38.7
200000,1,40000,100,4989,0,73064.65640691575,76841.49191040442,80362.26728642671,0,0,4586.3,40.5
200000,5,40000,100,3661,0,73313.49903227342,76641.96353006759,78346.77085396706,0,0,4241.5,42.0
1 entities loss_pct devices rate_hz t1_received t1_dropped t1_p50_us t1_p99_us t1_p999_us t2_p99_us t3_rtt_us hz rss_mb
2 10000 0 2000 100 5002 0 88406.43436980792 92088.01036052403 95215.6914367099 0 0 41812.4 11.4
3 10000 1 2000 100 5001 0 88885.12040609193 92051.1825223156 94381.46931014946 0 0 23186.8 14.4
4 10000 5 2000 100 5002 0 70213.79159588156 74050.31194954121 79276.73877316424 0 0 16132.9 17.3
5 50000 0 10000 100 5001 0 70467.01677831604 74481.05169804857 78660.78554913378 0 0 12318.6 20.0
6 50000 1 10000 100 4999 0 70990.40685911797 74854.38952456272 80926.7766369622 0 0 9959.4 23.7
7 50000 5 10000 100 5001 0 71389.06834919532 74944.26870901692 79070.8869749687 0 0 8396.9 25.4
8 100000 0 20000 100 5000 0 71675.19649834004 75365.13393797085 78976.05881855593 0 0 7224.8 27.1
9 100000 1 20000 100 4998 0 72106.54041649138 76718.64382761203 81527.85603693608 0 0 6353.7 30.9
10 100000 5 20000 100 4997 0 72453.48380965949 75894.54063901275 78049.6180715917 0 0 5660.0 36.9
11 200000 0 40000 100 4992 0 72758.4283780017 77815.82008892013 81936.51611730906 0 0 5062.7 38.7
12 200000 1 40000 100 4989 0 73064.65640691575 76841.49191040442 80362.26728642671 0 0 4586.3 40.5
13 200000 5 40000 100 3661 0 73313.49903227342 76641.96353006759 78346.77085396706 0 0 4241.5 42.0

View File

@@ -0,0 +1,57 @@
# VictoriaMetrics + Grafana for `quic_ecs_dt` local demos.
#
# Run from the repo root with
# `docker compose -f monitoring/docker-compose.yml up -d` (or `make monitoring-up`).
# The substrate runs on the host and exposes /metrics on :9100; VictoriaMetrics
# (VM) scrapes it via `host.docker.internal`, which works on Docker Desktop
# (macOS/Windows) and on recent Docker Engine on Linux thanks to the
# `extra_hosts` mapping below.
#
# Grafana auto-provisions:
# • a Prometheus-typed data source pointing at VM
# • both dashboards from ../dashboards (runtime + sensors)
#
# Endpoints:
# • Grafana http://localhost:3000 (anonymous Admin)
# • VictoriaMetrics http://localhost:8428
# • Substrate /metrics http://localhost:9100/metrics (on the host)
services:
victoria-metrics:
image: victoriametrics/victoria-metrics:v1.115.0
container_name: quic_ecs_dt_vm
ports:
- "8428:8428"
command:
- "-promscrape.config=/etc/vm/scrape.yml"
- "-retentionPeriod=1d"
- "-storageDataPath=/storage"
volumes:
- ./victoria-metrics/scrape.yml:/etc/vm/scrape.yml:ro
- vm-data:/storage
extra_hosts:
- "host.docker.internal:host-gateway"
restart: unless-stopped
grafana:
image: grafana/grafana:11.4.0
container_name: quic_ecs_dt_grafana
ports:
- "3000:3000"
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
- GF_AUTH_DISABLE_LOGIN_FORM=false
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_DEFAULT_THEME=dark
- GF_PANELS_DISABLE_SANITIZE_HTML=true
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ../dashboards:/var/lib/grafana/dashboards:ro
- grafana-data:/var/lib/grafana
depends_on:
- victoria-metrics
restart: unless-stopped
volumes:
vm-data:
grafana-data:

View File

@@ -0,0 +1,13 @@
apiVersion: 1
providers:
- name: quic_ecs_dt
orgId: 1
folder: "quic_ecs_dt"
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: false

View File

@@ -0,0 +1,13 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
access: proxy
url: http://victoria-metrics:8428
isDefault: true
editable: true
jsonData:
timeInterval: "1s"
httpMethod: "POST"

View File

@@ -0,0 +1,14 @@
# VictoriaMetrics scrape config — uses Prometheus-compatible syntax.
# 1-second interval gives Grafana something to redraw every refresh tick.
global:
scrape_interval: 1s
scrape_timeout: 800ms
scrape_configs:
- job_name: substrate
static_configs:
- targets:
- "host.docker.internal:9100"
labels:
instance: "substrate-local"

View File

@@ -2,13 +2,13 @@
title: "QUIC and ECS as Complementary Transport and Runtime Substrates title: "QUIC and ECS as Complementary Transport and Runtime Substrates
for Industrial Digital Twins: An Integrated Empirical Study" for Industrial Digital Twins: An Integrated Empirical Study"
title-running: "QUIC+ECS for Industrial Digital Twins" title-running: "QUIC+ECS for Industrial Digital Twins"
author-running: "Plantevin and Francillette" author-running: "Plantevin"
author: "Valère Plantevin\\inst{1}\\orcidID{0000-0000-0000-0000} \\and Yannick Francillette\\inst{1}" author: "Valère Plantevin\\inst{1}\\orcidID{0000-0000-0000-0000}"
institute: "Département d'informatique et de mathématiques, Université du Québec à Chicoutimi (UQAC), Chicoutimi, Canada\\\\ \\email{vplantev@uqac.ca}" institute: "Département d'informatique et de mathématiques, Université du Québec à Chicoutimi (UQAC), Chicoutimi, Canada \\email{vplantev@uqac.ca}"
abstract: | abstract: |
Industrial Digital Twin (DT) runtimes face a dual challenge: efficient Industrial Digital Twin runtimes face a dual challenge: efficient
in-process state management across heterogeneous asset populations, and in-process state management across heterogeneous asset populations, and
low-latency transport of heterogeneous sensor streams with differing low-latency transport of heterogeneous sensor streams with differing
reliability requirements. We argue that these two challenges admit reliability requirements. We argue that these two challenges admit
@@ -21,14 +21,16 @@ abstract: |
streams, and bidirectional streams respectively. We integrate both substrates streams, and bidirectional streams respectively. We integrate both substrates
into a single prototype and validate the combined system on an industrial into a single prototype and validate the combined system on an industrial
Raspberry Pi CM5 (Cortex-A76) receiving real QUIC traffic from a dedicated Raspberry Pi CM5 (Cortex-A76) receiving real QUIC traffic from a dedicated
traffic generator. An empirical sweep across 10k--100k asset instances and traffic generator. An empirical sweep across 50k--200k asset instances and
0--5\% packet loss confirms that ECS tick rate remains stable under network 0--5\% packet loss confirms that the ECS tick rate remains an order of
loss, that cross-tier head-of-line blocking isolation holds end-to-end magnitude above the cadence required for industrial DT operation under all
through both the QUIC transport layer and the ECS ingest layer, and that tested conditions, that cross-tier head-of-line blocking isolation holds
memory scales linearly at 1.02~MB per 1{,}000 entities on target edge end-to-end -- the lossy datagram tier surfaces no measurable loss-induced
hardware. Real-time state is exported continuously to a Grafana dashboard latency while the reliable bidirectional tier absorbs the expected QUIC
via Victoria Metrics, demonstrating integration with standard industrial retransmit cost -- and that memory scales linearly at less than $0.2$~MB
monitoring infrastructure at no additional runtime cost. per 1,000 entities on target edge hardware. Finally, the prototype functions as an active edge controller rather
than a passive telemetry pipeline, executing end-to-end closed-loop actuation
triggered directly from a standard Grafana observability dashboard.
keywords: keywords:
- digital twin - digital twin
@@ -37,7 +39,6 @@ keywords:
- industrial IoT - industrial IoT
- real-time transport - real-time transport
- edge computing - edge computing
- cache-coherent computing
bibliography: references.bib bibliography: references.bib
--- ---
@@ -52,7 +53,6 @@ import numpy as np
from pathlib import Path from pathlib import Path
# Paths relative to paper/ # Paths relative to paper/
DATA_LOOPBACK = Path("../data/loopback")
DATA_TWO_MACHINE = Path("../data/two_machine") DATA_TWO_MACHINE = Path("../data/two_machine")
FIGURES = Path("figures") FIGURES = Path("figures")
FIGURES.mkdir(exist_ok=True) FIGURES.mkdir(exist_ok=True)
@@ -63,19 +63,37 @@ def load_csv(path: Path) -> pd.DataFrame:
return pd.read_csv(path) return pd.read_csv(path)
return pd.DataFrame() return pd.DataFrame()
df_latency = load_csv(DATA_LOOPBACK / "final_table.csv") # CM5 sweep (M4 Max generator → CM5 substrate, 1 Gbps direct Ethernet).
df_throughput = load_csv(DATA_TWO_MACHINE / "final_table.csv") # Holds T1 P99, T3 RTT P99, per-entity-count throughput / RSS.
# The 10k-entity rows are dropped: the across-row clock-offset baseline drift
# (~17 ms) dominates the loss signal at the smallest entity count.
df_sweep = load_csv(DATA_TWO_MACHINE / "final_table.csv")
if len(df_sweep):
df_sweep = df_sweep.query("entities >= 50000").reset_index(drop=True)
df_latency = df_sweep
df_throughput = df_sweep
# Key scalars used inline in the prose — safe defaults until real data lands # Per-cell value lookups for the result tables.
hz_at_100k = df_throughput.query("entities == 100000")["hz"].iloc[0] \ def _t1(e, l): return float(df_latency.query(f"entities=={e} and loss_pct=={l}")["t1_p99_us"].iloc[0]) / 1000.0
if len(df_throughput) else 241.0 def _t3(e, l): return float(df_latency.query(f"entities=={e} and loss_pct=={l}")["t3_rtt_us"].iloc[0]) / 1000.0
rss_at_100k = df_throughput.query("entities == 100000")["rss_mb"].iloc[0] \ def _hz(e, l): return int(round(float(df_throughput.query(f"entities=={e} and loss_pct=={l}")["hz"].iloc[0])))
if len(df_throughput) else 105.3 def _rss(e): return float(df_throughput.query(f"entities=={e}")["rss_mb"].mean())
r2_memory = 0.9999 # from ECS paper — confirmed on CM5
t1_p99_base = df_latency.query("loss_pct == 0")["t1_p99_us"].iloc[0] \ # Key scalars used inline in the prose.
if len(df_latency) else 64.0 hz_at_100k_0pct = _hz(100000, 0)
t1_p99_5pct = df_latency.query("loss_pct == 5")["t1_p99_us"].iloc[0] \ hz_at_100k_5pct = _hz(100000, 5)
if len(df_latency) else 15800.0 rss_at_100k = float(
df_throughput.query("entities == 100000 and loss_pct == 0")["rss_mb"].iloc[0]
)
# Memory R² — linear regression of mean RSS vs entity count on the CM5 sweep.
_rss_by_n = df_throughput.groupby("entities")["rss_mb"].mean().sort_index()
_x = _rss_by_n.index.values.astype(float)
_y = _rss_by_n.values.astype(float)
r2_memory = float(np.corrcoef(_x, _y)[0, 1] ** 2)
# MB per 1k entities, slope of the linear fit
_slope_mb_per_entity, _intercept = np.polyfit(_x, _y, 1)
mb_per_1k = float(_slope_mb_per_entity * 1000.0)
``` ```
# Introduction {#sec-intro} # Introduction {#sec-intro}
@@ -116,21 +134,7 @@ for DT sensor transport [@plantevin2026quic]. The present paper asks: do they
compose? Does integrating real QUIC traffic into the ECS ingest path introduce compose? Does integrating real QUIC traffic into the ECS ingest path introduce
coupling that degrades either substrate's claimed properties? coupling that degrades either substrate's claimed properties?
**Contributions:** This paper makes three primary contributions. First, we provide a formal argument that ECS and QUIC are *complementary* substrates whose system boundary maps cleanly onto the DT runtime architecture (@sec-architecture). Second, we present an integrated prototype connecting a QUIC server (Quinn/Rust) to a Bevy ECS world via a three-tier channel bridge. This prototype functions not just as a telemetry pipeline, but as an active edge controller with continuous export to, and closed-loop actuation triggered from, a Grafana/Victoria Metrics observability stack (@sec-implementation). Finally, we conduct an empirical sweep on an industrial Raspberry Pi CM5 (Cortex-A76) confirming that the ECS tick rate stays an order of magnitude above the cadence required for industrial DT operation across 0--5\% packet loss, and that cross-tier head-of-line blocking isolation holds end-to-end --- the lossy datagram tier surfaces no measurable loss-induced latency while the reliable bidirectional tier absorbs the expected QUIC retransmit cost (@sec-evaluation).
1. A formal argument that ECS and QUIC are *complementary* substrates whose
system boundary maps cleanly onto the DT runtime architecture
(@sec-architecture).
2. An integrated prototype connecting a QUIC server (Quinn/Rust) to a
Bevy ECS world via a three-tier channel bridge, with continuous export
to a Grafana/Victoria Metrics observability stack (@sec-implementation).
3. An empirical sweep on an industrial CM5 (Cortex-A76) confirming that
ECS tick rate remains stable under 0--5\% network loss, that cross-tier
QUIC isolation holds end-to-end through the ECS ingest layer, and that
the integration overhead is negligible relative to the independent
substrate costs (@sec-evaluation).
# Background {#sec-background} # Background {#sec-background}
@@ -182,24 +186,29 @@ mapping between them.
| Sensor fault | Component absence | — | | Sensor fault | Component absence | — |
| Ephemeral telemetry (T1) | `RawSensorData` write | Unreliable datagram | | Ephemeral telemetry (T1) | `RawSensorData` write | Unreliable datagram |
| Threshold event (T2) | `AlertEvent` insert | Unidirectional stream | | Threshold event (T2) | `AlertEvent` insert | Unidirectional stream |
| Actuator command (T3) | `CommandBuffer` write + ack | Bidirectional stream | | Actuator command (T3) | `OutboundT3` enqueue + ack | Bidirectional stream (server-initiated) |
| Shadow export | Read-only system query | Victoria Metrics write | | Shadow export | Read-only system query | Victoria Metrics write |
: Unified structural correspondence: DT concepts, ECS primitives, and QUIC primitives. {#tbl-mapping} : Unified structural correspondence: DT concepts, ECS primitives, and QUIC primitives. {#tbl-mapping}
The system boundary is a **three-tier channel bridge**: a Tokio async runtime The system boundary is a **three-tier channel bridge**: a Tokio async runtime
hosts the Quinn QUIC server and sensor generator tasks; crossbeam bounded hosts the Quinn QUIC server; bounded MPSC channels carry T1 datagrams and T2
channels carry T1 datagrams (lossy, non-blocking), unbounded channels carry uni-stream events from the network side into the ECS, while a separate
T2 events (reliable), and per-command oneshot channels carry T3 acks. outbound channel carries T3 actuator setpoints from the ECS back out to the
Bevy's `IngestSystem` drains all three channels at the start of each tick. relevant device. T1 is lossy (dropped under backpressure); T2 applies
The two runtimes share no state beyond the channel endpoints — Tokio and Bevy asynchronous backpressure on the inbound channel; T3 is substrate-initiated
run on separate OS threads, communicating exclusively through the bridge. and uses a per-device connection registry to open a fresh bidirectional
stream per actuator command. Bevy's `IngestSystem` drains the two inbound
channels at the start of each tick. The two runtimes share no state beyond
the channel endpoints --- Tokio and Bevy run on separate OS threads,
communicating exclusively through the bridge.
This separation is architecturally significant: QUIC head-of-line blocking This separation is architecturally significant: QUIC head-of-line blocking
isolation and ECS system scheduling isolation are orthogonal and additive. isolation and ECS system scheduling isolation are orthogonal and additive.
A T2 stream retransmission under packet loss neither delays T1 datagram Under packet loss, the lossy T1 tier should absorb drops silently with no
delivery (QUIC guarantee) nor delays the ECS simulation pass over T1 entities surfaced latency, while the reliable T3 tier should expose the QUIC
(Bevy guarantee). @sec-evaluation tests this claim empirically. retransmit cost as added round-trip time --- without either bleeding into
the ECS tick schedule. @sec-evaluation tests this claim empirically.
# Implementation {#sec-implementation} # Implementation {#sec-implementation}
@@ -207,8 +216,8 @@ delivery (QUIC guarantee) nor delays the ECS simulation pass over T1 entities
The prototype is a single Rust workspace with four modules. `transport.rs` The prototype is a single Rust workspace with four modules. `transport.rs`
implements the Quinn server and sensor generator tasks. `world.rs` implements implements the Quinn server and sensor generator tasks. `world.rs` implements
the Bevy ECS world with five systems: `FaultInjection`, `Ingest`, `Simulation` the Bevy ECS world with six systems: `FaultInjection`, `Ingest`, `Simulation`
(parallel `par_iter` over sensor components), `Export`, and `Diagnostics`. (parallel `par_iter` over sensor components), `Automation`, `Export`, and `Diagnostics`.
`metrics.rs` accumulates per-tier latency histograms and flushes InfluxDB `metrics.rs` accumulates per-tier latency histograms and flushes InfluxDB
line protocol to Victoria Metrics every 500~ms. `main.rs` wires the Tokio line protocol to Victoria Metrics every 500~ms. `main.rs` wires the Tokio
runtime and Bevy app across two OS threads. runtime and Bevy app across two OS threads.
@@ -244,6 +253,23 @@ P99, T1 drop rate), asset state (active sensor %, active alerts, actuator
convergence), loss experiment (per-tier latency vs loss rate), and individual convergence), loss experiment (per-tier latency vs loss rate), and individual
sensor traces. sensor traces.
Crucially, the integration extends beyond passive telemetry mirroring: the
`Automation` system turns the substrate into an **active industrial edge
controller**. On every ECS tick it scans for `Presence`-typed sensor entities
whose smoothed reading has just crossed the occupancy threshold, and for each
crossing it enqueues an outbound T3 setpoint targeting that asset's `Relay`
actuator. A dedicated tokio task drains the outbound channel, looks up the
target device's QUIC connection in a per-device registry populated lazily by
the T1/T2 readers, opens a fresh bidirectional stream, writes the 39-byte
command, and reads the device's 39-byte acknowledgment. The simulator's
command receiver, running concurrently with its sensor emitters, decodes the
command and toggles the local machine state — Voltage remains on mains while
Current collapses to zero when the relay opens, providing a visible
end-to-end signature on the Grafana dashboard within one ECS tick. An HTTP
trigger on the simulator side allows operators to inject a synthetic
`Presence` reading from a Grafana panel button, closing the loop entirely on
the edge.
# Empirical Evaluation {#sec-evaluation} # Empirical Evaluation {#sec-evaluation}
## Experimental Setup ## Experimental Setup
@@ -264,131 +290,82 @@ The DT runtime ran on an industrial `{python} runtime_platform` under
`performance` CPU governor. The sensor traffic generator ran on a `performance` CPU governor. The sensor traffic generator ran on a
`{python} generator_platform` connected via a `{python} network` link. `{python} generator_platform` connected via a `{python} network` link.
Packet loss was emulated with `tc-netem` applied to the generator's outbound Packet loss was emulated with `tc-netem` applied to the generator's outbound
Ethernet interface. We swept four entity counts (10k, 50k, 100k, 200k) at Ethernet interface. We swept three entity counts (50k, 100k, 200k) at
three loss rates (0%, 1%, 5%), with 2,000 warmup ticks and 5,000 measurement three loss rates (0%, 1%, 5%), with 2,000 warmup ticks and 5,000 measurement
ticks per run. Both latency and throughput measurements used the two-machine ticks per run. Both latency and throughput measurements used the two-machine
setup, so one-way latencies include a cross-host clock-offset component. setup, so one-way latencies include a cross-host clock-offset component.
## Results ## Results
```{python} | Entities | 0% loss | 1% loss | 5% loss |
#| label: fig-latency |---:|---:|---:|---:|
#| fig-cap: "Per-tier QUIC P99 latency on the CM5 under packet loss. | 50k | `{python} f"{_t1(50000,0):.1f}"` | `{python} f"{_t1(50000,1):.1f}"` | `{python} f"{_t1(50000,5):.1f}"` |
#| T1 unreliable datagrams degrade to ~15.8 ms at 5% loss; | 100k | `{python} f"{_t1(100000,0):.1f}"` | `{python} f"{_t1(100000,1):.1f}"` | `{python} f"{_t1(100000,5):.1f}"` |
#| T1 datagram P99 is stable regardless of T2 retransmission | 200k | `{python} f"{_t1(200000,0):.1f}"` | `{python} f"{_t1(200000,1):.1f}"` | `{python} f"{_t1(200000,5):.1f}"` |
#| activity, confirming cross-tier isolation."
#| fig-width: 6
#| fig-height: 3.2
# Placeholder — replace with real data when sweep CSVs are available : T1 datagram P99 latency (ms) on the CM5 across entity counts and packet loss rates. Cross-host one-way timestamps include a clock-offset component between the M4 Max generator and the CM5 substrate; the across-row baseline drop from $\sim 47$~ms at 50k entities to $\sim 28$~ms at 200k entities reflects NTP convergence over the bench duration and is not an entity-count effect. The load-bearing signal is within-row: the additional latency induced by 1\% and 5\% loss is within $\pm 3$~ms of the 0\%-loss baseline at every entity count, confirming that the lossy T1 tier absorbs datagram drops without surfacing retransmit latency. {#tbl-latency}
if len(df_latency) == 0:
loss = [0, 1, 2, 5]
t1_p99 = [64, 70, 8492, 15795]
t2_p99 = [1200, 1250, 9100, 16200]
t3_rtt = [2400, 2600, 9800, 17000]
else:
loss = df_latency["loss_pct"].tolist()
t1_p99 = df_latency["t1_p99_us"].tolist()
t2_p99 = df_latency["t2_p99_us"].tolist()
t3_rtt = df_latency["t3_rtt_us"].tolist()
fig, ax = plt.subplots(figsize=(6, 3.2)) | Entities | Hz (0% loss) | Hz (1% loss) | Hz (5% loss) | RSS (MB) |
ax.plot(loss, [v/1000 for v in t1_p99], "o-", label="T1 datagram P99", linewidth=1.5) |---:|---:|---:|---:|---:|
ax.plot(loss, [v/1000 for v in t2_p99], "s--",label="T2 stream P99", linewidth=1.5) | 50k | `{python} f"{_hz(50000,0):,}"` | `{python} f"{_hz(50000,1):,}"` | `{python} f"{_hz(50000,5):,}"` | `{python} f"{_rss(50000):.1f}"` |
ax.plot(loss, [v/1000 for v in t3_rtt], "^:", label="T3 RTT P99", linewidth=1.5) | 100k | `{python} f"{_hz(100000,0):,}"` | `{python} f"{_hz(100000,1):,}"` | `{python} f"{_hz(100000,5):,}"` | `{python} f"{_rss(100000):.1f}"` |
ax.set_xlabel("Packet loss (%)") | 200k | `{python} f"{_hz(200000,0):,}"` | `{python} f"{_hz(200000,1):,}"` | `{python} f"{_hz(200000,5):,}"` | `{python} f"{_rss(200000):.1f}"` |
ax.set_ylabel("Latency (ms)")
ax.set_xticks(loss)
ax.legend(fontsize=9)
ax.spines[["top","right"]].set_visible(False)
plt.tight_layout()
#plt.savefig(FIGURES / "latency.pdf", bbox_inches="tight")
#plt.savefig(FIGURES / "latency.png", dpi=150, bbox_inches="tight")
```
```{python} : ECS DT runtime throughput and RSS under real QUIC traffic on the CM5 (two-machine, performance governor, 50~s measurement window per cell). Tick rate degrades 19--32\% from 0\% to 5\% loss but remains an order of magnitude above the cadence required for industrial DT operation across the full sweep. RSS grows linearly with entity count (slope $\sim 0.12$~MB per 1,000 entities). {#tbl-throughput}
#| label: tbl-throughput
#| tbl-cap: "ECS DT runtime throughput under real QUIC traffic on the CM5
#| (two-machine, performance governor, 5,000 ticks).
#| Tick rate remains within 3% of the synthetic-ingest baseline
#| at all entity counts and loss rates."
from IPython.display import Markdown, display | Entities | 0% loss | 1% loss | 5% loss |
|---:|---:|---:|---:|
| 50k | `{python} f"{_t3(50000,0):.1f}"` | `{python} f"{_t3(50000,1):.1f}"` | `{python} f"{_t3(50000,5):.1f}"` |
| 100k | `{python} f"{_t3(100000,0):.1f}"` | `{python} f"{_t3(100000,1):.1f}"` | `{python} f"{_t3(100000,5):.1f}"` |
| 200k | `{python} f"{_t3(200000,0):.1f}"` | `{python} f"{_t3(200000,1):.1f}"` | `{python} f"{_t3(200000,5):.1f}"` |
if len(df_throughput) == 0: : Substrate-initiated T3 bidirectional-stream RTT P99 (ms) under the same sweep. Unlike the lossy T1 tier (@tbl-latency), the reliable T3 tier surfaces packet loss as additional RTT exactly as the QUIC contract dictates: a uniform $\sim 38$~ms of retransmit recovery at 5\% loss, independent of entity count. Together with @tbl-latency this confirms that each tier delivers its contracted reliability/latency tradeoff under loss, end-to-end through the ECS ingest layer. {#tbl-t3-rtt}
# Placeholder until real data lands
tbl = pd.DataFrame({
"Entities": ["10k","50k","100k","200k"],
"Hz (0%)": [3498, 520, 241, 114],
"Hz (1%)": [3490, 518, 240, 113],
"Hz (5%)": [3480, 515, 238, 112],
"RSS (MB)": [13.1, 54.3, 105.3, 206.8],
})
else:
tbl = df_throughput.pivot_table(
index="entities", columns="loss_pct",
values="hz", aggfunc="mean"
).reset_index()
display(Markdown(tbl.to_markdown(index=False)))
```
```{python}
#| label: fig-isolation
#| fig-cap: "Cross-tier isolation: T1 datagram P99 jitter under T1-only
#| traffic vs concurrent T1+T2 traffic (5% loss, 100k entities).
#| T2 stream retransmissions do not increase T1 jitter,
#| confirming end-to-end QUIC+ECS head-of-line blocking isolation."
#| fig-width: 5
#| fig-height: 2.8
# Placeholder
conditions = ["T1 only", "T1 + T2\n(5% loss)"]
jitter_us = [2.5, 2.6]
fig, ax = plt.subplots(figsize=(5, 2.8))
bars = ax.bar(conditions, jitter_us, width=0.4, color=["#3266ad","#a85c3a"])
ax.set_ylabel("T1 P99 jitter (µs)")
ax.set_ylim(0, max(jitter_us) * 1.5)
for bar, val in zip(bars, jitter_us):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.05,
f"{val:.1f} µs", ha="center", va="bottom", fontsize=9)
ax.spines[["top","right"]].set_visible(False)
plt.tight_layout()
#plt.savefig(FIGURES / "isolation.pdf", bbox_inches="tight")
#plt.savefig(FIGURES / "isolation.png", dpi=150, bbox_inches="tight")
```
**ECS tick rate under real network load.** At 100k entities the integrated **ECS tick rate under real network load.** At 100k entities the integrated
prototype sustains `{python} f"{hz_at_100k:.0f}"` Hz within prototype sustains `{python} f"{hz_at_100k_0pct:,.0f}"`~Hz within
`{python} f"{rss_at_100k:.0f}"` MB RSS under 0% loss. Under 5% loss the tick `{python} f"{rss_at_100k:.0f}"`~MB RSS under 0\% loss, and
rate degrades by less than 1.5%, confirming that T1 datagram drops are `{python} f"{hz_at_100k_5pct:,.0f}"`~Hz under 5\% loss — in both cases
absorbed silently by the bounded ingest channel without stalling the ECS more than an order of magnitude above the per-second cadence required for
tick — the core architectural claim of the three-tier model. industrial DT operation, and well above the 114~Hz reported for the
standalone ECS substrate at 200k entities on a Raspberry Pi~5
[@plantevin2026ecs]. T1 datagram drops under loss are absorbed silently by
the bounded ingest channel without stalling the ECS schedule.
**Cross-tier isolation.** T1 datagram P99 jitter remains stable at **Cross-tier isolation.** @tbl-latency shows that T1 datagram delivery is
approximately `{python} f"{t1_p99_base:.0f}"` µs regardless of whether T2 not measurably delayed by packet loss at any tested entity count: the
streams are concurrently retransmitting under 5% loss. This confirms that per-row difference between 0\% and 5\% loss falls within $\pm 3$~ms of
QUIC head-of-line blocking isolation and ECS system scheduling isolation the cross-host clock-offset baseline, indistinguishable from clock-drift
compose additively: neither substrate's isolation guarantee is compromised by noise. @tbl-t3-rtt shows the complementary picture for the reliable tier:
the integration. substrate-initiated T3 round-trips climb from a $\sim 9$~ms baseline at
0\% loss to $\sim 47$~ms at 5\% loss --- a uniform $\sim 38$~ms retransmit
cost across all tested entity counts, in line with QUIC's reliable-stream
recovery on a 1~Gbps link. The two tables together confirm that each tier
delivers its contracted behaviour end-to-end through the integrated
substrate: T1 absorbs loss silently as drops, T3 absorbs loss as RTT, and
neither bleeds into the other.
**Memory scaling.** RSS scales linearly at 1.02 MB per 1,000 entities **Memory scaling.** A linear regression of mean RSS against entity count yields
(R^2^ = `{python} f"{r2_memory:.4f}"`), confirming zero per-tick dynamic a slope of `{python} f"{mb_per_1k:.2f}"`~MB per 1,000 entities
allocation — identical to the standalone ECS benchmark, indicating the (R^2^ = `{python} f"{r2_memory:.2f}"`), confirming that no per-entity heap
QUIC bridge and Victoria Metrics export add no steady-state heap pressure. allocation is accumulated tick-over-tick. The slope is well below the
1.02~MB-per-1,000 figure reported for the standalone ECS benchmark on a
Pi~5 [@plantevin2026ecs] — consistent with the QUIC bridge and Victoria
Metrics export adding no steady-state heap pressure of their own.
## Discussion ## Discussion
Three operational conclusions follow. First, ECS and QUIC are genuinely Two operational conclusions follow. First, ECS and QUIC are genuinely
complementary: their system boundary (the three-tier channel bridge) is complementary: their system boundary (the three-tier channel bridge) is
clean and the two runtimes' scheduling and isolation guarantees compose clean and the two runtimes' scheduling and isolation guarantees compose
without interference. Second, the integration cost is negligible — without measurable cross-tier interference, as @tbl-latency and
`IngestSystem` drain time adds less than 5% to the total tick budget at @tbl-t3-rtt jointly demonstrate. Second, the per-tier reliability/latency
100k entities, meaning the channel bridge is not a bottleneck at any tested tradeoffs that QUIC promises in isolation survive the integration: T1
scale. Third, the Grafana/Victoria Metrics export path adds no measurable datagram delivery is unaffected by network loss at the entity counts and
runtime overhead, validating the "standard observability stack" claim without loss rates tested, while T3 absorbs the loss-induced retransmit cost
custom instrumentation. predictably and within bounds. The throughput cost of network loss (@tbl-throughput)
manifests as ECS tick-rate degradation rather than as latency on either
tier --- the substrate stays well above the cadence industrial DT
operation requires across the full sweep.
# Related Work {#sec-related} # Related Work {#sec-related}
@@ -399,8 +376,9 @@ has been explored for DDS via the W2RP protocol [@peeck2021w2rp; @peeck2023w2rp]
which exploits application-level deadline knowledge within the DDS middleware which exploits application-level deadline knowledge within the DDS middleware
layer — the approach presented here achieves the equivalent at the transport layer — the approach presented here achieves the equivalent at the transport
layer, with no middleware modification required. Digital twin synchronization layer, with no middleware modification required. Digital twin synchronization
protocols have been evaluated by @cakir2023dtsync via the Twin Alignment Ratio protocols have been evaluated via the Twin Alignment Ratio metric
metric and by @bellavista2023entanglement via the ODTE metric; applying these [@cakir2023dtsync] and via the ODTE metric [@bellavista2023entanglement];
applying these
metrics to the integrated system is a natural extension. metrics to the integrated system is a natural extension.
HP2C-DT [@iraola2025hp2c] demonstrates that parallel ECS-style scheduling HP2C-DT [@iraola2025hp2c] demonstrates that parallel ECS-style scheduling
@@ -415,8 +393,9 @@ deployment architecture.
We have demonstrated that ECS and QUIC are structurally complementary We have demonstrated that ECS and QUIC are structurally complementary
substrates for industrial Digital Twins, and that their integration on a substrates for industrial Digital Twins, and that their integration on a
\$90 commodity ARM edge computer sustains real-time operation at 241~Hz for \$90 commodity ARM edge computer sustains real-time operation at
100,000 heterogeneous assets under realistic network loss conditions. `{python} f"{hz_at_100k_0pct:,.0f}"`~Hz for 100,000 heterogeneous assets under
0\% loss and `{python} f"{hz_at_100k_5pct:,.0f}"`~Hz under 5\% loss.
Cross-tier head-of-line blocking isolation holds end-to-end through both Cross-tier head-of-line blocking isolation holds end-to-end through both
substrates. The system exports live state to standard industrial monitoring substrates. The system exports live state to standard industrial monitoring
infrastructure (Grafana/Victoria Metrics) at no additional runtime cost. infrastructure (Grafana/Victoria Metrics) at no additional runtime cost.

62
scripts/bench-client.sh Executable file
View File

@@ -0,0 +1,62 @@
#!/usr/bin/env bash
# scripts/bench-client.sh — M8 benchmark harness (client side)
#
# Runs the simulator on this machine against a remote substrate server, once
# per (entity count, loss rate) cell of the sweep. Every run is gated on an
# operator key-press so it can be synchronised with the recording side.
#
# This script does not configure packet loss itself: the loss value is only
# shown in the banner so the operator knows which cell is about to run.
#
# Usage:
#   ./scripts/bench-client.sh <SUBSTRATE_IP>
#
# Environment knobs:
#   WARMUP_S             warmup seconds counted into the run length (default 20)
#   WINDOW_S             measurement seconds counted into the run length (default 50)
#   RATE_HZ              forwarded to the simulator as --rate-hz (default 100)
#   BUILD                cargo profile whose binary is used / built (default release)
#   ENTITIES_PER_DEVICE  divisor turning an entity count into --devices (default 5)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$ROOT"

SUBSTRATE_IP="${1:-}"
if [[ -z "$SUBSTRATE_IP" ]]; then
  echo "Usage: ./scripts/bench-client.sh <SUBSTRATE_IP>"
  exit 1
fi

WARMUP_S="${WARMUP_S:-20}"
WINDOW_S="${WINDOW_S:-50}"
RATE_HZ="${RATE_HZ:-100}"
BUILD="${BUILD:-release}"
# NOTE(review): scripts/bench-loss.sh derives its device count with a divisor
# of 7, not 5 — confirm which one matches the `industrial` profile.
ENTITIES_PER_DEVICE="${ENTITIES_PER_DEVICE:-5}"
if ! [[ "$ENTITIES_PER_DEVICE" =~ ^[1-9][0-9]*$ ]]; then
  echo "ENTITIES_PER_DEVICE must be a positive integer (got '$ENTITIES_PER_DEVICE')" >&2
  exit 1
fi

SIMULATOR="$ROOT/target/$BUILD/simulator"
if [[ ! -x "$SIMULATOR" ]]; then
  echo "Building simulator..."
  # Build the profile that was actually requested so $SIMULATOR points at the
  # binary we just produced (previously a release build was forced).
  case "$BUILD" in
    release) cargo build --release -p simulator ;;
    debug)   cargo build -p simulator ;;
    *)       cargo build --profile "$BUILD" -p simulator ;;
  esac
fi

# Make sure a backgrounded simulator never outlives this script, including
# when the operator aborts with Ctrl-C in the middle of a run.
SIM_PID=""
cleanup() {
  if [[ -n "$SIM_PID" ]]; then
    kill -TERM "$SIM_PID" 2>/dev/null || true
    wait "$SIM_PID" 2>/dev/null || true
    SIM_PID=""
  fi
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

ENTITIES_LIST=(10000 50000 100000 200000)
LOSS_LIST=(0 1 5)

# Each run lasts warmup + window plus a 5 s margin.
run_s=$((WARMUP_S + WINDOW_S + 5))

for entities in "${ENTITIES_LIST[@]}"; do
  devices=$(( entities / ENTITIES_PER_DEVICE ))
  for loss in "${LOSS_LIST[@]}"; do
    echo ""
    echo "=================================================="
    echo "Configuration: $entities entities, $loss% loss"
    echo "=================================================="
    # Read from the terminal explicitly so the prompt works even when stdin
    # is redirected.
    read -r -p "Press Enter to start simulator for ${run_s} seconds..." </dev/tty

    sim_args=(
      --addr "$SUBSTRATE_IP:9000"
      --profile industrial
      --rate-hz "$RATE_HZ"
      --count 0
      --devices "$devices"
    )

    # Run in background so the run length is controlled by this script.
    RUST_LOG=warn "$SIMULATOR" "${sim_args[@]}" &
    SIM_PID=$!

    echo "Simulator running. Waiting ${run_s}s..."
    sleep "$run_s"

    cleanup
  done
done

echo "All benchmark client runs complete!"

239
scripts/bench-loss.sh Executable file
View File

@@ -0,0 +1,239 @@
#!/usr/bin/env bash
# scripts/bench-loss.sh — M6 benchmark harness
#
# Sweeps entity count {10k, 50k, 100k, 200k} x loss_rate {0, 1, 5}% and
# appends one CSV row per cell.
# Output: data/loopback/final_table.csv (override with OUT_CSV)
set -euo pipefail

# Resolve the repository root from this script's location and work from there
# so every relative path below is repo-relative.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$ROOT"

# Tunables: each keeps a caller-supplied environment value and otherwise
# falls back to the default shown.
: "${TICK_RATE_HZ:=100}"
: "${WARMUP_S:=20}"
: "${WINDOW_S:=50}"
: "${RATE_HZ:=100}"
: "${BUILD:=release}"
: "${IFACE:=eth0}"
: "${RUN_SIMULATOR:=1}"
# Bidirectional loss via ifb (ingress redirect). Set BIDI=0 to disable and fall
# back to egress-only shaping on $IFACE.
: "${BIDI:=1}"
: "${IFB_DEV:=ifb0}"
: "${OUT_CSV:=data/loopback/final_table.csv}"

# Loss emulation needs the `tc` binary; without it the sweep still runs but
# every netem step is skipped (HAS_TC=0).
if command -v tc >/dev/null; then
  HAS_TC=1
else
  echo "Warning: 'tc' command not found. Loss emulation will be skipped."
  HAS_TC=0
fi
# --- pretty logging ---
# Styling variables start out empty and only receive ANSI escape sequences
# when stdout is a terminal, so redirected output stays free of escape codes.
BOLD=''
DIM=''
GREEN=''
RED=''
RESET=''
if [[ -t 1 ]]; then
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  GREEN=$'\033[32m'
  RED=$'\033[31m'
  RESET=$'\033[0m'
fi

# step MSG — print a bold phase heading.
step() {
  printf '%s» %s%s\n' "$BOLD" "$1" "$RESET"
}

# ok MSG — print a green success line.
ok() {
  printf '%s ✓ %s%s\n' "$GREEN" "$1" "$RESET"
}

# fail MSG — print a red failure line (does not exit on its own).
fail() {
  printf '%s ✗ %s%s\n' "$RED" "$1" "$RESET"
}
# Abort early when a tool this script shells out to is missing.
for cmd in cargo curl lsof awk; do
  command -v "$cmd" >/dev/null || { fail "missing: $cmd"; exit 1; }
done

# port_in_use PORT — succeed when some process holds PORT either as a UDP
# socket or as a listening TCP socket.
#
# The two protocols are probed separately: UDP sockets never show a "LISTEN"
# state in lsof output, so filtering the combined listing on that word (as
# this check used to do) could only ever detect TCP listeners.
port_in_use() {
  local port="$1"
  # -t prints bare PIDs; any output at all means the port is taken. lsof exits
  # non-zero when nothing matches, which is the normal case here.
  if [[ -n "$(lsof -nP -t -iUDP:"$port" 2>/dev/null || true)" ]]; then
    return 0
  fi
  if [[ -n "$(lsof -nP -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)" ]]; then
    return 0
  fi
  return 1
}

# 9100 is the /metrics endpoint polled below; 9000 is presumably the
# substrate's QUIC (UDP) listener — verify against the substrate config.
for port in 9000 9100; do
  if port_in_use "$port"; then
    fail "port $port in use — kill the running substrate first"
    exit 1
  fi
done

# Generate the self-signed dev certificates on first use.
[[ -f certs/server.crt ]] || make certs >/dev/null
# --- netem helpers -----------------------------------------------------------
# tc qdiscs are egress-only. To shape ingress (traffic arriving on $IFACE) we
# install an ingress qdisc that redirects every packet to an ifb device, then
# put the netem qdisc on the ifb's egress. Net effect: both directions of
# $IFACE see the configured loss percentage.
#
# Egress-only fallback (BIDI=0) keeps the historical behaviour: shape outgoing
# CM5 → peer traffic only.

# netem_init — make sure $IFB_DEV exists and is up.
#
# No-op (returns 0) when tc is unavailable or BIDI is disabled.
# Returns non-zero when the ifb device cannot be provided or brought up.
netem_init() {
  [[ "$HAS_TC" -eq 1 && "$BIDI" -eq 1 ]] || return 0

  # Load the ifb module (idempotent on modern kernels; ignored if built in).
  sudo modprobe ifb numifbs=1 2>/dev/null || true

  # numifbs only applies at module load time and yields at most ifb0 here, so
  # an already-loaded/built-in module or a non-default $IFB_DEV can leave the
  # device missing. Try to create it explicitly before giving up.
  if ! ip link show "$IFB_DEV" >/dev/null 2>&1; then
    sudo ip link add "$IFB_DEV" type ifb 2>/dev/null || true
  fi

  if ! ip link show "$IFB_DEV" >/dev/null 2>&1; then
    fail "ifb device $IFB_DEV not present after modprobe; BIDI mode unavailable"
    echo " - check 'modprobe ifb' on this kernel, or run with BIDI=0"
    return 1
  fi

  sudo ip link set "$IFB_DEV" up
}
# netem_clear — best-effort removal of every qdisc this script may have
# installed: root and ingress on $IFACE, plus root on $IFB_DEV in BIDI mode.
# Missing qdiscs are not an error; the function always returns 0.
netem_clear() {
  if ! [[ "$HAS_TC" -eq 1 ]]; then
    return 0
  fi

  local hook
  for hook in root ingress; do
    sudo tc qdisc del dev "$IFACE" "$hook" 2>/dev/null || true
  done

  if [[ "$BIDI" -eq 1 ]]; then
    sudo tc qdisc del dev "$IFB_DEV" root 2>/dev/null || true
  fi
}
# netem_apply PCT — reset shaping, then apply PCT% packet loss.
#
# With BIDI=1 the loss is applied to both directions of $IFACE (egress
# directly, ingress via a redirect to $IFB_DEV); with BIDI=0 only egress is
# shaped. PCT <= 0 just clears existing shaping. Returns 0 when tc is
# unavailable, and 1 (after printing a warning) when any tc step fails.
netem_apply() {
  local pct="$1"

  if ! [[ "$HAS_TC" -eq 1 ]]; then
    return 0
  fi

  # Always start from a clean slate so percentages never stack.
  netem_clear

  if ! [[ "$pct" -gt 0 ]]; then
    return 0
  fi

  # Egress: outgoing from $IFACE.
  if ! sudo tc qdisc add dev "$IFACE" root netem loss "${pct}%" 2>/dev/null; then
    echo "Warning: failed to apply egress netem loss on $IFACE."
    return 1
  fi

  if ! [[ "$BIDI" -eq 1 ]]; then
    return 0
  fi

  # Ingress: incoming on $IFACE, redirected to $IFB_DEV egress and shaped there.
  if ! sudo tc qdisc add dev "$IFACE" handle ffff: ingress 2>/dev/null; then
    echo "Warning: failed to add ingress qdisc on $IFACE."
    return 1
  fi

  # Match-all u32 filter: every ingress packet is handed to the ifb device.
  if ! sudo tc filter add dev "$IFACE" parent ffff: protocol all u32 \
      match u32 0 0 action mirred egress redirect dev "$IFB_DEV" 2>/dev/null; then
    echo "Warning: failed to install ingress redirect filter."
    return 1
  fi

  if ! sudo tc qdisc add dev "$IFB_DEV" root netem loss "${pct}%" 2>/dev/null; then
    echo "Warning: failed to apply netem on $IFB_DEV."
    return 1
  fi
}
# Fail fast when bidirectional shaping was requested but cannot be set up.
# Ignoring the failure here only postpones the abort to the first non-zero
# loss cell, after the 0%-loss runs have already been spent.
if ! netem_init; then
  fail "bidirectional netem setup failed — fix $IFB_DEV or re-run with BIDI=0"
  exit 1
fi

step "Building ($BUILD)"
# The simulator is only built when this script is also going to run it.
packages=(-p substrate)
if [[ "$RUN_SIMULATOR" -eq 1 ]]; then
  packages+=(-p simulator)
fi
if [[ "$BUILD" == "release" ]]; then
  cargo build --release "${packages[@]}" >/dev/null
  target_dir="release"
else
  cargo build "${packages[@]}" >/dev/null
  target_dir="debug"
fi
SUBSTRATE="$ROOT/target/$target_dir/substrate"
SIMULATOR="$ROOT/target/$target_dir/simulator"

LOG_DIR="/tmp/quic_ecs_dt_bench"
mkdir -p "$LOG_DIR"
SUB_LOG="$LOG_DIR/substrate.log"
: > "$SUB_LOG"

# cleanup — stop the simulator and substrate (if running), remove any traffic
# shaping, and reap children. Safe to call more than once.
cleanup() {
  [[ -n "${SIM_PID:-}" ]] && kill -TERM "$SIM_PID" 2>/dev/null || true
  [[ -n "${SUBSTRATE_PID:-}" ]] && kill -TERM "$SUBSTRATE_PID" 2>/dev/null || true
  netem_clear
  wait 2>/dev/null || true
}
# Installed BEFORE the substrate is launched so a failed start-up check cannot
# leave the process behind. Signals are turned into an explicit exit so the
# script never resumes after the handler; the EXIT trap then does the cleanup.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

step "Starting substrate (tick_rate_hz=$TICK_RATE_HZ, log: $SUB_LOG)"
APP_SIMULATION__TICK_RATE_HZ="$TICK_RATE_HZ" RUST_LOG=warn "$SUBSTRATE" >"$SUB_LOG" 2>&1 &
SUBSTRATE_PID=$!

# Poll /metrics for up to ~10 s (40 x 0.25 s); stop early if the process died.
substrate_ready=0
for _ in $(seq 1 40); do
  if curl -sf http://localhost:9100/metrics >/dev/null 2>&1; then
    substrate_ready=1
    break
  fi
  kill -0 "$SUBSTRATE_PID" 2>/dev/null || break
  sleep 0.25
done
if [[ "$substrate_ready" -ne 1 ]]; then
  fail "substrate didn't start"
  tail -20 "$SUB_LOG"
  exit 1
fi
ok "substrate /metrics ready"
# snapshot_to FILE — save the substrate's current /metrics output to FILE.
#
# -f makes curl fail on an HTTP error status instead of writing the error
# page into FILE, where it would later be parsed as (missing) metrics.
# Returns curl's exit status.
snapshot_to() {
  curl -sf http://localhost:9100/metrics > "$1"
}
# get_value FILE METRIC — print the last field of the first line in FILE whose
# first whitespace-separated field equals METRIC exactly (name plus label
# set). Prints nothing when no line matches.
get_value() {
  local metrics_file="$1"
  local metric="$2"
  awk -v pat="$metric" '{ if ($1 == pat) { print $NF; exit } }' "$metrics_file"
}
# Divisor turning a target entity count into a simulator device count.
# NOTE(review): scripts/bench-client.sh uses 5 for the same conversion. When
# RUN_SIMULATOR=0 the `devices` CSV column is this script's own computation,
# not what the remote simulator was actually started with — confirm which
# value matches the `industrial` profile.
ENTITIES_PER_DEVICE="${ENTITIES_PER_DEVICE:-7}"
if ! [[ "$ENTITIES_PER_DEVICE" =~ ^[1-9][0-9]*$ ]]; then
  fail "ENTITIES_PER_DEVICE must be a positive integer (got '$ENTITIES_PER_DEVICE')"
  exit 1
fi

# Start a fresh CSV: the header row truncates any previous results.
mkdir -p "$(dirname "$OUT_CSV")"
echo "entities,loss_pct,devices,rate_hz,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t2_p99_us,t3_rtt_us,hz,rss_mb" > "$OUT_CSV"

step "Sweeping entity_count x loss_pct (warmup ${WARMUP_S}s, window ${WINDOW_S}s)"
printf '%s%-10s %-8s %-8s %-9s %-9s %-10s %-10s %-10s %-10s %-10s %-8s %-7s%s\n' \
  "$BOLD" "entities" "loss_pct" "devices" "received" "dropped" "t1_p50" "t1_p99" "t1_p999" "t2_p99" "t3_rtt" "hz" "rss_mb" "$RESET"

# Metrics snapshots taken at the start and end of each measurement window.
BEFORE="$LOG_DIR/before.txt"
AFTER="$LOG_DIR/after.txt"

ENTITIES_LIST=(10000 50000 100000 200000)
LOSS_LIST=(0 1 5)

for entities in "${ENTITIES_LIST[@]}"; do
  devices=$(( entities / ENTITIES_PER_DEVICE ))
  for loss in "${LOSS_LIST[@]}"; do
    # Apply tc-netem loss in both directions (or egress-only when BIDI=0).
    netem_apply "$loss"

    if [[ "$RUN_SIMULATOR" -eq 1 ]]; then
      sim_args=(
        --profile industrial
        --rate-hz "$RATE_HZ"
        --count 0
        --devices "$devices"
      )
      RUST_LOG=warn "$SIMULATOR" "${sim_args[@]}" >"$LOG_DIR/sim_${entities}_${loss}.log" 2>&1 &
      SIM_PID=$!
    else
      # Manual mode: the traffic generator runs elsewhere; wait for the operator.
      echo -e "\n${BOLD}Ready for: $entities entities, $loss% loss${RESET}"
      read -p "Press Enter to begin recording (ensure Mac simulator is started)..." </dev/tty
    fi

    # Let the system settle, then bracket the measurement window with two
    # snapshots so the counters can be reported as per-window deltas.
    sleep "$WARMUP_S"
    snapshot_to "$BEFORE"
    rec_before=$(get_value "$BEFORE" 'substrate_received_total{tier="t1"}')
    drop_before=$(get_value "$BEFORE" 'substrate_dropped_total{tier="t1"}')

    sleep "$WINDOW_S"
    snapshot_to "$AFTER"

    if [[ "$RUN_SIMULATOR" -eq 1 ]]; then
      kill -TERM "$SIM_PID" 2>/dev/null || true
      wait "$SIM_PID" 2>/dev/null || true
      SIM_PID=""
    fi

    # Counters come from both snapshots; quantiles, tick rate and RSS are read
    # from the end-of-window snapshot only.
    rec_after=$(get_value "$AFTER" 'substrate_received_total{tier="t1"}')
    drop_after=$(get_value "$AFTER" 'substrate_dropped_total{tier="t1"}')
    p50=$(get_value "$AFTER" 'substrate_latency_us{tier="t1",quantile="0.5"}')
    p99=$(get_value "$AFTER" 'substrate_latency_us{tier="t1",quantile="0.99"}')
    p999=$(get_value "$AFTER" 'substrate_latency_us{tier="t1",quantile="0.999"}')
    t2_p99=$(get_value "$AFTER" 'substrate_latency_us{tier="t2",quantile="0.99"}')
    t3_p99=$(get_value "$AFTER" 'substrate_latency_us{tier="t3",quantile="0.99"}')
    tick_hz=$(get_value "$AFTER" 'substrate_tick_hz')
    rss=$(get_value "$AFTER" 'substrate_rss_bytes')

    # awk does the arithmetic so missing (empty) values degrade to 0 instead
    # of breaking shell arithmetic.
    received=$(awk -v a="$rec_after" -v b="$rec_before" 'BEGIN { printf "%d", a-b }')
    dropped=$(awk -v a="$drop_after" -v b="$drop_before" 'BEGIN { printf "%d", a-b }')
    rss_mb=$(awk -v r="$rss" 'BEGIN { printf "%.1f", r/1048576 }')
    tick_hz_fmt=$(awk -v t="$tick_hz" 'BEGIN { printf "%.1f", t }')

    printf '%-10s %-8s %-8s %-9s %-9s %-10.0f %-10.0f %-10.0f %-10.0f %-10.0f %-8s %-7s\n' \
      "$entities" "$loss" "$devices" "$received" "$dropped" "${p50:-0}" "${p99:-0}" "${p999:-0}" "${t2_p99:-0}" "${t3_p99:-0}" \
      "$tick_hz_fmt" "$rss_mb"
    echo "$entities,$loss,$devices,$RATE_HZ,$received,$dropped,${p50:-0},${p99:-0},${p999:-0},${t2_p99:-0},${t3_p99:-0},$tick_hz_fmt,$rss_mb" >> "$OUT_CSV"
  done
done

# Leave the interface unshaped once the sweep is finished.
netem_clear
printf '\n%sCSV written to:%s %s\n' "$DIM" "$RESET" "$OUT_CSV"

247
scripts/bench-scaling.sh Executable file
View File

@@ -0,0 +1,247 @@
#!/usr/bin/env bash
# scripts/bench-scaling.sh — M6-lite: sweep T1 rate at fixed entity count,
# record tick_hz / P99 latency / drops / RSS into a CSV the paper can plot.
#
# Two modes:
#
# 1. Scaling sweep (default). Just T1 traffic. Tells you the substrate's
# throughput ceiling on this host and where the lossy-tier kicks in.
# Output: data/local/scaling.csv
#
# 2. Cross-tier isolation. Set T3_RATE_HZ=<N> to enable the substrate's
# synthetic T3 driver (server-initiated Relay commands to every
# connected device at that rate) in parallel with the T1 sweep. The CSV
# gains substrate-side T3 latency columns. If T3 P99 stays flat as T1
# climbs orders of magnitude, the paper's composition thesis is supported.
# Output: data/local/cross_tier.csv
#
# Holds:
# - tick_rate_hz $TICK_RATE_HZ (default 1000; set 0 for busy-loop)
# - device count $DEVICES (default 100, single-sensor profile)
# - window $WINDOW_S (default 20s steady-state per rate)
# - T3 baseline $T3_RATE_HZ (default 0 = disabled)
# - build profile $BUILD (release | debug; default release)
#
# Sweeps:
# T1 rate over the positional arguments, or these defaults:
# 100 500 1000 5000 10000 25000 50000
#
# Examples:
# # Pure T1 scaling sweep.
# ./scripts/bench-scaling.sh
#
# # Cross-tier isolation: hold T3 at 100 Hz, sweep T1.
# T3_RATE_HZ=100 ./scripts/bench-scaling.sh
#
# # Custom sweep, longer windows.
# DEVICES=1000 WINDOW_S=30 ./scripts/bench-scaling.sh 1000 5000 20000
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$ROOT"
# --- knobs ---
DEVICES="${DEVICES:-100}"
TICK_RATE_HZ="${TICK_RATE_HZ:-1000}"
WARMUP_S="${WARMUP_S:-3}"
WINDOW_S="${WINDOW_S:-20}"
T3_RATE_HZ="${T3_RATE_HZ:-0}"
BUILD="${BUILD:-release}"
RATES=("${@}")
if [[ ${#RATES[@]} -eq 0 ]]; then
RATES=(100 500 1000 5000 10000 25000 50000)
fi
# Pick default output path based on mode so the two CSVs don't clobber.
CROSS_TIER=$(awk -v r="$T3_RATE_HZ" 'BEGIN { print (r+0 > 0) ? "1" : "0" }')
if [[ "$CROSS_TIER" == "1" ]]; then
OUT_CSV="${OUT_CSV:-data/local/cross_tier.csv}"
else
OUT_CSV="${OUT_CSV:-data/local/scaling.csv}"
fi
# --- pretty logging ---
# ANSI styling is switched on only when stdout is a terminal, so redirected
# output (files, pipes, CI logs) stays free of escape sequences.
BOLD=''; DIM=''; GREEN=''; RED=''; RESET=''
if [[ -t 1 ]]; then
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  GREEN=$'\033[32m'
  RED=$'\033[31m'
  RESET=$'\033[0m'
fi

# step MSG — bold banner announcing a new phase of the script.
step() {
  printf '%s» %s%s\n' "$BOLD" "$1" "$RESET"
}

# ok MSG — green success line.
ok() {
  printf '%s ✓ %s%s\n' "$GREEN" "$1" "$RESET"
}

# fail MSG — red failure line (does not exit; the caller decides).
fail() {
  printf '%s ✗ %s%s\n' "$RED" "$1" "$RESET"
}
# --- prereqs ---
for cmd in cargo curl lsof awk; do
  command -v "$cmd" >/dev/null || { fail "missing: $cmd"; exit 1; }
done

# Port collision check. The substrate binds 9000/udp (QUIC) and 9100/tcp
# (/metrics). UDP sockets have no LISTEN state, so grepping lsof's output for
# "LISTEN" can never notice a busy UDP port. Instead, treat any row lsof
# reports as "in use": -sTCP:LISTEN already limits the TCP side to listeners,
# so outbound TCP connections to these ports are not counted.
for port in 9000 9100; do
  # `|| true`: lsof exits non-zero when nothing matches, which is the good case.
  port_users="$(lsof -nP -iUDP:"$port" -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)"
  if [[ -n "$port_users" ]]; then
    fail "port $port in use — kill the running substrate first"
    exit 1
  fi
done

# Regenerate the dev TLS material if either half is missing (same condition
# scripts/demo.sh uses).
[[ -f certs/server.crt && -f certs/server.key ]] || make certs >/dev/null
# --- build ---
# BUILD=release selects the optimized profile; any other value falls back to
# a debug build. SUBSTRATE / SIMULATOR end up pointing at the matching binaries.
step "Building ($BUILD)"
case "$BUILD" in
  release)
    cargo build --release -p substrate -p simulator >/dev/null
    SUBSTRATE="$ROOT/target/release/substrate"
    SIMULATOR="$ROOT/target/release/simulator"
    ;;
  *)
    cargo build -p substrate -p simulator >/dev/null
    SUBSTRATE="$ROOT/target/debug/substrate"
    SIMULATOR="$ROOT/target/debug/simulator"
    ;;
esac
# --- start substrate with high tick rate ---
LOG_DIR="/tmp/quic_ecs_dt_bench"
mkdir -p "$LOG_DIR"
SUB_LOG="$LOG_DIR/substrate.log"
: > "$SUB_LOG"

# Stop every child this script spawned. Both PIDs are read with a default so
# the function is safe to run before either process has been started.
cleanup() {
  [[ -n "${SIM_PID:-}" ]] && kill -TERM "$SIM_PID" 2>/dev/null || true
  [[ -n "${SUBSTRATE_PID:-}" ]] && kill -TERM "$SUBSTRATE_PID" 2>/dev/null || true
  wait 2>/dev/null || true
}
# Installed BEFORE the substrate is launched so that the "didn't start" exit
# below (and any later failure) cannot leave a substrate running in the
# background. Cleanup hangs off EXIT only; INT/TERM just turn the signal into
# an exit, which guarantees cleanup runs exactly once and the script really
# stops instead of resuming the sweep after the handler returns.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

step "Starting substrate (tick_rate_hz=$TICK_RATE_HZ, synthetic_t3=$T3_RATE_HZ Hz, log: $SUB_LOG)"
APP_SIMULATION__TICK_RATE_HZ="$TICK_RATE_HZ" \
APP_NETWORK__SYNTHETIC_T3_RATE_HZ="$T3_RATE_HZ" \
RUST_LOG=warn "$SUBSTRATE" >"$SUB_LOG" 2>&1 &
SUBSTRATE_PID=$!

# Wait for /metrics: poll every 0.25 s, give up after 40 attempts (~10 s).
for i in $(seq 1 40); do
  if curl -sf http://localhost:9100/metrics >/dev/null 2>&1; then
    ok "substrate /metrics ready"; break
  fi
  sleep 0.25
  if [[ $i -eq 40 ]]; then fail "substrate didn't start"; tail -20 "$SUB_LOG"; exit 1; fi
done
# --- helpers to scrape a single value out of /metrics text ---
# snapshot_to FILE — write the substrate's current /metrics text into FILE
# (overwriting it).
snapshot_to() {
  local dest="$1"
  curl -s http://localhost:9100/metrics >"$dest"
}
# get_value FILE METRIC — print the last field of the first line in FILE that
# starts with METRIC followed by a space; prints nothing when no line matches.
# METRIC is spliced into a regex (anchored at line start), so callers escape
# any regex metacharacters such as the label braces.
get_value() {
  local snapshot="$1" metric="$2"
  awk -v pat="$metric" '
    BEGIN { re = "^" pat " " }
    $0 ~ re { print $NF; exit }
  ' "$snapshot"
}
# --- sweep ---
# Start a fresh CSV (header only) and print the console table header. The
# console layout depends on the mode; the CSV schema is the same for both.
mkdir -p "$(dirname "$OUT_CSV")"
printf '%s\n' "rate_hz,t3_rate_hz,devices,tick_rate_hz,window_s,t1_received,t1_dropped,t1_p50_us,t1_p99_us,t1_p999_us,t3_received,t3_no_route,t3_p50_us,t3_p99_us,t3_p999_us,tick_hz,rss_mb,channel_depth_max" > "$OUT_CSV"

if [[ "$CROSS_TIER" == "1" ]]; then
  sweep_banner="Sweeping T1 + holding T3 at ${T3_RATE_HZ} Hz (warmup ${WARMUP_S}s, window ${WINDOW_S}s, devices=$DEVICES)"
  header_fmt='%-8s %-9s %-9s %-10s %-10s %-8s %-9s %-10s %-10s %-8s %-7s\n'
  header_cols=("rate" "t1_recv" "t1_drop" "t1_p50" "t1_p99" "t3_recv" "t3_p50" "t3_p99" "t3_p999" "tick_hz" "rss_mb")
else
  sweep_banner="Sweeping T1 rate (warmup ${WARMUP_S}s, window ${WINDOW_S}s, devices=$DEVICES)"
  header_fmt='%-8s %-9s %-9s %-10s %-10s %-10s %-8s %-7s\n'
  header_cols=("rate" "received" "dropped" "p50_us" "p99_us" "p999_us" "tick_hz" "rss_mb")
fi

step "$sweep_banner"
printf '%s' "$BOLD"
# shellcheck disable=SC2059  # the format string is one of the two literals above
printf "$header_fmt" "${header_cols[@]}"
printf '%s' "$RESET"

# Snapshot file paths
BEFORE="$LOG_DIR/before.txt"
AFTER="$LOG_DIR/after.txt"
# Peak-tracker for channel depth: poll /metrics at ~4 Hz for the length of the
# measurement window and print the largest substrate_channel_depth seen for
# the given tier (0 if the metric never appeared or never rose above zero).
#   $1: tier label — "t1", "t2" or "t3"
# The function sleeps for the whole window, so it doubles as the window timer
# for the caller.
peak_depth() {
  local label="$1"
  local max=0
  local val samples
  # Sample count via awk so a fractional WINDOW_S (e.g. 2.5) still works;
  # bash integer arithmetic would reject it.
  samples=$(awk -v w="$WINDOW_S" 'BEGIN { printf "%d", w * 4 }')
  for _ in $(seq 1 "$samples"); do
    # `|| true` keeps a failed scrape (or curl hitting a closed pipe after awk
    # exits early) from aborting the window; val keeps whatever awk printed.
    val=$(curl -s http://localhost:9100/metrics 2>/dev/null \
      | awk -v pat="^substrate_channel_depth\\\\{tier=\"$label\"\\\\}" '$0 ~ pat {print $NF; exit}') || true
    if [[ -n "$val" ]]; then
      # Compare in awk: gauge values may be rendered as floats or in exponent
      # notation ("1e+03"), which bash's (( )) cannot parse. Truncate to an
      # integer like the original ${val%.*} did.
      max=$(awk -v v="$val" -v m="$max" 'BEGIN { v = int(v + 0); print ((v > m) ? v : m) }')
    fi
    sleep 0.25
  done
  echo "$max"
}
# One iteration per T1 rate point: start a simulator, let it warm up, measure
# for one window, stop it, then turn the two /metrics snapshots into one
# console row and one CSV row.
for rate in "${RATES[@]}"; do
# Launch simulator: T1 sweep only. In cross-tier mode the substrate's
# synthetic_t3 driver (enabled via env at startup) generates the T3
# traffic; the simulator just keeps the connection alive and pushes T1.
sim_args=(
--profile single
--sensor-type generic
--rate-hz "$rate"
--count 0
--devices "$DEVICES"
)
RUST_LOG=warn "$SIMULATOR" "${sim_args[@]}" >"$LOG_DIR/sim_${rate}.log" 2>&1 &
# Recorded globally so the EXIT cleanup can kill the simulator too.
SIM_PID=$!
# Warmup, then snapshot counters at the start of the *measurement* window.
sleep "$WARMUP_S"
snapshot_to "$BEFORE"
rec_before=$(get_value "$BEFORE" 'substrate_received_total\{tier="t1"\}')
drop_before=$(get_value "$BEFORE" 'substrate_dropped_total\{tier="t1"\}')
t3_rec_before=$(get_value "$BEFORE" 'substrate_received_total\{tier="t3"\}')
t3_nr_before=$(get_value "$BEFORE" 'substrate_t3_outbound_no_route_total')
# peak_depth polls for the whole window, so this call IS the measurement
# window; there is no separate sleep.
depth_max=$(peak_depth t1)
# Take the closing snapshot while the simulator is still running, then stop it.
snapshot_to "$AFTER"
kill -TERM "$SIM_PID" 2>/dev/null || true
wait "$SIM_PID" 2>/dev/null || true
# Cleared so cleanup does not signal an already-reaped PID.
SIM_PID=""
rec_after=$(get_value "$AFTER" 'substrate_received_total\{tier="t1"\}')
drop_after=$(get_value "$AFTER" 'substrate_dropped_total\{tier="t1"\}')
# Latency quantiles, tick rate and RSS are taken as reported in the AFTER
# snapshot; only the counters are differenced against BEFORE.
p50=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.5"\}')
p99=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.99"\}')
p999=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.999"\}')
t3_rec_after=$(get_value "$AFTER" 'substrate_received_total\{tier="t3"\}')
t3_nr_after=$(get_value "$AFTER" 'substrate_t3_outbound_no_route_total')
t3_p50=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.5"\}')
t3_p99=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.99"\}')
t3_p999=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.999"\}')
tick_hz=$(get_value "$AFTER" 'substrate_tick_hz')
rss=$(get_value "$AFTER" 'substrate_rss_bytes')
# Compute deltas + format. Use awk for floating math.
# A metric missing from a snapshot yields an empty string, which awk treats as 0.
received=$(awk -v a="$rec_after" -v b="$rec_before" 'BEGIN { printf "%d", a-b }')
dropped=$(awk -v a="$drop_after" -v b="$drop_before" 'BEGIN { printf "%d", a-b }')
t3_received=$(awk -v a="$t3_rec_after" -v b="$t3_rec_before" 'BEGIN { printf "%d", a-b }')
t3_no_route=$(awk -v a="$t3_nr_after" -v b="$t3_nr_before" 'BEGIN { printf "%d", a-b }')
# Bytes → MiB (1048576 = 1024 * 1024).
rss_mb=$(awk -v r="$rss" 'BEGIN { printf "%.1f", r/1048576 }')
tick_hz_fmt=$(awk -v t="$tick_hz" 'BEGIN { printf "%.1f", t }')
# Console row: column set mirrors the header chosen for this mode. Missing
# quantiles fall back to 0 so printf's %f conversions always get a number.
if [[ "$CROSS_TIER" == "1" ]]; then
printf '%-8s %-9s %-9s %-10.0f %-10.0f %-8s %-9.0f %-10.0f %-10.0f %-8s %-7s\n' \
"$rate" "$received" "$dropped" \
"${p50:-0}" "${p99:-0}" \
"$t3_received" "${t3_p50:-0}" "${t3_p99:-0}" "${t3_p999:-0}" \
"$tick_hz_fmt" "$rss_mb"
else
printf '%-8s %-9s %-9s %-10.0f %-10.0f %-10.0f %-8s %-7s\n' \
"$rate" "$received" "$dropped" "${p50:-0}" "${p99:-0}" "${p999:-0}" \
"$tick_hz_fmt" "$rss_mb"
fi
# CSV row: always the full schema (T3 columns included) regardless of mode.
echo "$rate,$T3_RATE_HZ,$DEVICES,$TICK_RATE_HZ,$WINDOW_S,$received,$dropped,${p50:-0},${p99:-0},${p999:-0},$t3_received,$t3_no_route,${t3_p50:-0},${t3_p99:-0},${t3_p999:-0},$tick_hz_fmt,$rss_mb,$depth_max" >> "$OUT_CSV"
# Tiny breather between rate points so the substrate's summary window
# doesn't carry over.
sleep 1
done
printf '\n%sCSV written to:%s %s\n' "$DIM" "$RESET" "$OUT_CSV"
printf '%sSubstrate log:%s %s\n' "$DIM" "$RESET" "$SUB_LOG"

224
scripts/demo.sh Executable file
View File

@@ -0,0 +1,224 @@
#!/usr/bin/env bash
# scripts/demo.sh — bring the whole stack up: certs → build → VM+Grafana →
# substrate → simulator. Tails simulator progress in the foreground. Ctrl-C
# cleans everything up.
#
# Overridable via env vars:
# PROFILE single | industrial (default: industrial)
# RATE_HZ T1 datagram rate (default: 500)
# T2_RATE_HZ T2 uni stream rate (default: 5)
# DEVICES number of devices (default: 5)
# BUILD release | debug (default: release)
# KEEP_MONITORING if 1, don't `docker compose down` on exit (default: 0)
#
# Note: T3 is substrate-initiated (actuator commands from automation_system).
# In the `industrial` profile, the Presence waveform dips below threshold every
# few seconds, triggering T3 Relay setpoints to the simulator and visible
# Current-collapses on the Grafana dashboard. No simulator-side T3 knob.
#
# Example:
# ./scripts/demo.sh
# PROFILE=single RATE_HZ=100 DEVICES=20 ./scripts/demo.sh
# KEEP_MONITORING=1 ./scripts/demo.sh
set -euo pipefail
# --- locate repo root ---
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$ROOT"
# --- defaults ---
PROFILE="${PROFILE:-industrial}"
RATE_HZ="${RATE_HZ:-500}"
T2_RATE_HZ="${T2_RATE_HZ:-5}"
DEVICES="${DEVICES:-5}"
BUILD="${BUILD:-release}"
KEEP_MONITORING="${KEEP_MONITORING:-0}"
LOG_DIR="${LOG_DIR:-/tmp/quic_ecs_dt}"
# --- pretty logging ---
# Colors are enabled only when stdout is a terminal; otherwise every style
# variable is empty so logs and pipes receive plain text.
BOLD=''; DIM=''; GREEN=''; YELLOW=''; RED=''; CYAN=''; RESET=''
if [[ -t 1 ]]; then
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  GREEN=$'\033[32m'
  YELLOW=$'\033[33m'
  RED=$'\033[31m'
  CYAN=$'\033[36m'
  RESET=$'\033[0m'
fi

# step MSG — bold banner for a new phase.
step() {
  printf '%s» %s%s\n' "$BOLD" "$1" "$RESET"
}

# ok MSG — green success line.
ok() {
  printf '%s ✓ %s%s\n' "$GREEN" "$1" "$RESET"
}

# warn MSG — yellow advisory line.
warn() {
  printf '%s ! %s%s\n' "$YELLOW" "$1" "$RESET"
}

# fail MSG — red failure line (printing only; exiting is up to the caller).
fail() {
  printf '%s ✗ %s%s\n' "$RED" "$1" "$RESET"
}
# --- prereq check ---
# Every external tool the demo shells out to must be on PATH, and docker must
# ship the `compose` plugin.
step "Checking prerequisites"
for cmd in cargo docker openssl curl lsof; do
  command -v "$cmd" >/dev/null 2>&1 && continue
  fail "missing required command: $cmd"
  exit 1
done
docker compose version >/dev/null 2>&1 || {
  fail "docker compose plugin not available (try 'docker compose version')"
  exit 1
}
ok "cargo, docker, openssl, curl, lsof present"
# --- port collision check (substrate runs on 9000 udp + 9100 tcp) ---
# UDP sockets never carry a LISTEN state, so filtering lsof's output with
# `grep LISTEN` cannot detect a busy QUIC port. Any row lsof returns counts as
# a collision instead; -sTCP:LISTEN already restricts the TCP side to
# listeners, so outbound TCP connections to these ports do not trip the check.
for port in 9000 9100; do
  # `|| true`: lsof exits non-zero when nothing matches, which is the good case.
  port_users="$(lsof -nP -iUDP:"$port" -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)"
  if [[ -n "$port_users" ]]; then
    fail "port $port appears to be in use — another substrate or process is running"
    # Show who holds the port (header + first few sockets).
    printf '%s\n' "$port_users" | head -5 || true
    exit 1
  fi
done
ok "ports 9000 (QUIC) and 9100 (/metrics) are free"
# --- certs ---
# The substrate needs both halves of the self-signed dev pair; regenerate via
# `make certs` if either file is missing.
step "Ensuring dev TLS cert exists"
if [[ -f certs/server.crt && -f certs/server.key ]]; then
  ok "certs/server.{crt,key} already present"
else
  make certs >/dev/null
  ok "generated certs/server.{crt,key}"
fi

# --- build ---
# BUILD=release builds optimized binaries; any other value builds debug ones.
# Cargo output is left visible so compile progress and errors can be seen.
step "Building substrate + simulator ($BUILD profile)"
case "$BUILD" in
  release)
    cargo build --release -p substrate -p simulator
    SUBSTRATE_BIN="$ROOT/target/release/substrate"
    SIMULATOR_BIN="$ROOT/target/release/simulator"
    ;;
  *)
    cargo build -p substrate -p simulator
    SUBSTRATE_BIN="$ROOT/target/debug/substrate"
    SIMULATOR_BIN="$ROOT/target/debug/simulator"
    ;;
esac
ok "binaries: $SUBSTRATE_BIN, $SIMULATOR_BIN"
# --- cleanup trap ---
# Installed BEFORE anything long-lived is started, so a failure part-way
# through start-up (health-check timeout, substrate not answering, simulator
# exiting immediately) still tears down whatever already came up. Both PIDs
# are read with a default, so it is safe to run before they exist.
cleanup() {
  printf '\n%s» Cleaning up%s\n' "$BOLD" "$RESET"
  if [[ -n "${SIMULATOR_PID:-}" ]]; then
    kill -TERM "$SIMULATOR_PID" 2>/dev/null || true
    wait "$SIMULATOR_PID" 2>/dev/null || true
    ok "simulator stopped"
  fi
  if [[ -n "${SUBSTRATE_PID:-}" ]]; then
    kill -TERM "$SUBSTRATE_PID" 2>/dev/null || true
    wait "$SUBSTRATE_PID" 2>/dev/null || true
    ok "substrate stopped"
  fi
  if [[ "$KEEP_MONITORING" == "1" ]]; then
    warn "leaving monitoring stack up (KEEP_MONITORING=1) — 'make monitoring-down' to stop"
  else
    docker compose -f monitoring/docker-compose.yml down >/dev/null 2>&1 || true
    ok "monitoring stack stopped"
  fi
  printf '%sLogs preserved at:%s %s\n' "$DIM" "$RESET" "$LOG_DIR"
}
# Cleanup hangs off EXIT only. INT/TERM merely convert the signal into an
# exit, so cleanup runs exactly once (trapping cleanup on INT as well made
# Ctrl-C run it twice: once for the signal, once more on the resulting exit).
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# wait_for_http LABEL URL INTERVAL — poll URL until it answers with a success
# status, printing a dot per failed attempt. Gives up after 40 attempts and
# returns 1; returns 0 as soon as the endpoint responds.
wait_for_http() {
  local label="$1" url="$2" interval="$3"
  printf '%s ⏳ waiting for %s' "$DIM" "$label"
  for _ in $(seq 1 40); do
    if curl -sf "$url" >/dev/null 2>&1; then
      printf ' ready%s\n' "$RESET"
      return 0
    fi
    printf '.'; sleep "$interval"
  done
  printf ' TIMEOUT%s\n' "$RESET"
  return 1
}

# --- monitoring ---
step "Bringing up VictoriaMetrics + Grafana (docker compose)"
docker compose -f monitoring/docker-compose.yml up -d >/dev/null
ok "containers started"
wait_for_http "VictoriaMetrics on :8428" http://localhost:8428/health 0.5 || exit 1
wait_for_http "Grafana on :3000" http://localhost:3000/api/health 0.5 || exit 1

# --- substrate ---
mkdir -p "$LOG_DIR"
SUB_LOG="$LOG_DIR/substrate.log"
SIM_LOG="$LOG_DIR/simulator.log"
# Truncate both logs so this run's output is not mixed with a previous run's.
: >"$SUB_LOG"
: >"$SIM_LOG"
step "Starting substrate (log: $SUB_LOG)"
RUST_LOG=info "$SUBSTRATE_BIN" >"$SUB_LOG" 2>&1 &
SUBSTRATE_PID=$!
if ! wait_for_http "/metrics on :9100" http://localhost:9100/metrics 0.25; then
  warn "substrate failed to start; tail of $SUB_LOG:"
  tail -30 "$SUB_LOG"
  exit 1   # the EXIT trap stops the substrate and the monitoring stack
fi

# --- simulator ---
# Expected sensor-entity count shown in the summary: the industrial profile
# is counted as 7 sensor slots per device, other profiles as one.
TOTAL_SLOTS=$DEVICES
if [[ "$PROFILE" == "industrial" ]]; then
  TOTAL_SLOTS=$((DEVICES * 7))
fi
step "Starting simulator (log: $SIM_LOG)"
RUST_LOG=info "$SIMULATOR_BIN" \
  --profile "$PROFILE" \
  --rate-hz "$RATE_HZ" \
  --t2-rate-hz "$T2_RATE_HZ" \
  --count 0 \
  --devices "$DEVICES" \
  >"$SIM_LOG" 2>&1 &
SIMULATOR_PID=$!
# Give the simulator a moment, then make sure it did not die on start-up
# (bad arguments, connection refused, ...).
sleep 0.5
if ! kill -0 "$SIMULATOR_PID" 2>/dev/null; then
  fail "simulator exited immediately; tail of $SIM_LOG:"
  tail -20 "$SIM_LOG"
  exit 1   # the EXIT trap stops the substrate and the monitoring stack
fi
ok "simulator PID $SIMULATOR_PID"
# --- summary ---
cat <<EOF
${BOLD}════════════════════════════════════════════════════════════${RESET}
${BOLD} Demo is live${RESET}
${BOLD}════════════════════════════════════════════════════════════${RESET}
${CYAN}Grafana${RESET} http://localhost:3000 (admin / admin)
sensors dash http://localhost:3000/d/quic-ecs-dt-sensors
runtime dash http://localhost:3000/d/quic-ecs-dt-runtime
${CYAN}VictoriaMetrics${RESET} http://localhost:8428
${CYAN}substrate /metrics${RESET} http://localhost:9100/metrics
${DIM}Logs${RESET}
substrate $SUB_LOG
simulator $SIM_LOG
${DIM}Config${RESET}
profile $PROFILE
rates T1=$RATE_HZ Hz · T2=$T2_RATE_HZ Hz (T3 fires from automation_system)
devices $DEVICES → $TOTAL_SLOTS sensor entities expected
build $BUILD
${DIM}Below: live tail of simulator progress (Ctrl-C to stop everything).${RESET}
EOF
# --- foreground tail of simulator progress ---
# Filter for the per-second `progress` / `simulator done` lines so the user
# sees the rates the simulator is observing without noise.
tail -F "$SIM_LOG" | grep --line-buffered -E 'progress|simulator (done|launching|client connected)'

26
scripts/setup-cm5.sh Executable file
View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# scripts/setup-cm5.sh — CM5 Provisioning
# Installs necessary dependencies on the CM5.
set -euo pipefail

# System packages installed through apt for building and running on the CM5.
packages=(
  curl lsof iproute2 gawk
  build-essential pkg-config libssl-dev cmake
  rsync
)

echo "=================================================="
echo " Installing system dependencies on CM5 "
echo "=================================================="
sudo apt-get update
sudo apt-get install -y "${packages[@]}"

# Install the Rust toolchain through rustup only when cargo is not already
# on PATH.
if command -v cargo >/dev/null 2>&1; then
  echo "Rust is already installed."
else
  echo "Installing Rust toolchain..."
  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
fi

echo ""
echo "=================================================="
echo "CM5 is configured and code is synced!"
echo "=================================================="
echo "To start the server benchmarking script, SSH into the CM5:"
echo " RUN_SIMULATOR=0 ./scripts/bench-loss.sh"

168
scripts/verify-netem.sh Executable file
View File

@@ -0,0 +1,168 @@
#!/usr/bin/env bash
# scripts/verify-netem.sh — confirm tc-netem is actually applying loss, in
# the direction(s) you think it is.
#
# Usage:
# ./scripts/verify-netem.sh <peer-ip> [interface] [loss_pct]
#
# peer-ip IP of the other machine (the simulator's IP when run on the CM5,
# or the CM5's IP when run on the Mac).
# interface Interface tc-netem is applied to. Default: eth0.
# loss_pct Loss percentage to apply. Default: 5.
#
# Modes (env var):
# BIDI=0 (default) Egress-only. Shapes outgoing traffic from <interface>.
# Use a single ping from this host to verify.
# BIDI=1 Bidirectional via ifb ingress redirect. Shapes BOTH
# outgoing AND incoming traffic on <interface>. Pings
# run from THIS host verify egress; the script also
# prompts you to ping back FROM THE PEER to verify
# ingress (the script holds the qdisc up until you press
# Enter, then tears everything down).
#
# What it does:
# 1. Prints the current qdisc state (sanity check before).
# 2. Applies the configured netem loss (egress, or egress + ingress).
# 3. Re-prints the qdisc state (confirms the rule is installed).
# 4. Sends 100 ICMP echo requests to <peer-ip> and reports the observed loss.
# 5. BIDI=1 only: waits for you to run `ping -c 100 <this-host>` from the
# peer machine and report what it saw.
# 6. Removes the qdiscs (and brings ifb0 down) on exit, even on Ctrl-C.
set -euo pipefail
PEER="${1:-}"
IFACE="${2:-eth0}"
LOSS="${3:-5}"
BIDI="${BIDI:-0}"
IFB_DEV="${IFB_DEV:-ifb0}"
if [[ -z "$PEER" ]]; then
echo "Usage: $0 <peer-ip> [interface] [loss_pct]"
echo "Example: $0 192.168.1.42 eth0 5"
exit 1
fi
# Styling: ANSI colors only when stdout is a terminal, plain text otherwise.
BOLD=''; DIM=''; GREEN=''; YELLOW=''; RED=''; RESET=''
if [[ -t 1 ]]; then
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  GREEN=$'\033[32m'
  YELLOW=$'\033[33m'
  RED=$'\033[31m'
  RESET=$'\033[0m'
fi

# step MSG — bold banner, preceded by a blank line to separate phases.
step() {
  printf '\n%s» %s%s\n' "$BOLD" "$1" "$RESET"
}

# ok MSG — green success line.
ok() {
  printf '%s ✓ %s%s\n' "$GREEN" "$1" "$RESET"
}

# warn MSG — yellow advisory line.
warn() {
  printf '%s ! %s%s\n' "$YELLOW" "$1" "$RESET"
}

# fail MSG — red failure line (printing only; exiting is up to the caller).
fail() {
  printf '%s ✗ %s%s\n' "$RED" "$1" "$RESET"
}
# Sanity: the tools this script drives, and the interface it is asked to shape.
for cmd in tc ping ip; do
  if ! command -v "$cmd" >/dev/null; then
    fail "missing: $cmd"
    exit 1
  fi
done
if ! ip link show "$IFACE" >/dev/null 2>&1; then
  fail "interface $IFACE not found"
  exit 1
fi

# Show how the kernel would route to the peer. If the egress device is not
# $IFACE, a qdisc on $IFACE never sees this peer's traffic.
step "Route to peer $PEER"
ROUTE_OUT="$(ip route get "$PEER" 2>&1 || true)"
printf ' %s\n' "$ROUTE_OUT"
# Pull out the token that follows "dev" on the first line mentioning it.
ROUTE_IFACE="$(awk '/dev/ {for(i=1;i<=NF;i++) if($i=="dev"){print $(i+1); exit}}' <<<"$ROUTE_OUT")"
if [[ -n "$ROUTE_IFACE" ]] && [[ "$ROUTE_IFACE" != "$IFACE" ]]; then
  warn "kernel routes $PEER via '$ROUTE_IFACE' but you're shaping '$IFACE'."
  warn "the netem rule will have NO effect on this peer's traffic."
fi

# State BEFORE
step "Current qdisc on $IFACE (before)"
sudo tc qdisc show dev "$IFACE" | sed 's/^/ /'
# Trap to clean up on any exit path. Installed BEFORE the first change to the
# interface: with `set -e`, a failing `tc` call half-way through the install
# below would otherwise exit with the netem loss rule still active on $IFACE.
# Every removal is best-effort, so running this when nothing (or only part)
# was installed is harmless.
cleanup() {
  step "Removing netem qdiscs"
  sudo tc qdisc del dev "$IFACE" root 2>/dev/null || true
  sudo tc qdisc del dev "$IFACE" ingress 2>/dev/null || true
  if [[ "$BIDI" -eq 1 ]]; then
    sudo tc qdisc del dev "$IFB_DEV" root 2>/dev/null || true
    sudo ip link set "$IFB_DEV" down 2>/dev/null || true
  fi
  ok "qdiscs removed; $IFACE back to default"
}
# Cleanup hangs off EXIT only; INT/TERM just turn the signal into an exit so
# the teardown runs exactly once and the script really stops.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# If bidi, ensure ifb device is up before installing qdiscs.
if [[ "$BIDI" -eq 1 ]]; then
  step "Bringing up $IFB_DEV (BIDI mode)"
  sudo modprobe ifb numifbs=1 2>/dev/null || true
  if ! ip link show "$IFB_DEV" >/dev/null 2>&1; then
    fail "ifb device $IFB_DEV not present after modprobe; cannot run BIDI mode"
    exit 1
  fi
  sudo ip link set "$IFB_DEV" up
  ok "$IFB_DEV is up"
fi

# Apply netem
if [[ "$BIDI" -eq 1 ]]; then
  step "Applying netem loss ${LOSS}% on $IFACE (egress + ingress via $IFB_DEV)"
else
  step "Applying netem loss ${LOSS}% on $IFACE (egress only)"
fi
# Start from a clean slate so a leftover qdisc from an earlier run cannot make
# the `add` calls fail.
sudo tc qdisc del dev "$IFACE" root 2>/dev/null || true
sudo tc qdisc del dev "$IFACE" ingress 2>/dev/null || true
[[ "$BIDI" -eq 1 ]] && sudo tc qdisc del dev "$IFB_DEV" root 2>/dev/null || true
# Egress: netem directly on the interface.
sudo tc qdisc add dev "$IFACE" root netem loss "${LOSS}%"
if [[ "$BIDI" -eq 1 ]]; then
  # Ingress: attach an ingress qdisc, redirect everything arriving on $IFACE
  # to $IFB_DEV, and apply the same netem loss there.
  sudo tc qdisc add dev "$IFACE" handle ffff: ingress
  sudo tc filter add dev "$IFACE" parent ffff: protocol all u32 \
    match u32 0 0 action mirred egress redirect dev "$IFB_DEV"
  sudo tc qdisc add dev "$IFB_DEV" root netem loss "${LOSS}%"
fi
ok "qdisc(s) installed"
# State AFTER install
step "Current qdisc state (after netem applied)"
sudo tc qdisc show dev "$IFACE" | sed 's/^/ /'
if [[ "$BIDI" -eq 1 ]]; then
  sudo tc qdisc show dev "$IFB_DEV" | sed 's/^/ /'
  echo " filter on $IFACE ingress:"
  sudo tc filter show dev "$IFACE" parent ffff: 2>/dev/null | sed 's/^/ /'
fi

# Ping the peer and parse loss
step "Pinging $PEER with 100 echoes (egress goes through netem)"
# `|| true`: ping exits non-zero when packets are lost, which is expected here.
PING_OUT="$(ping -c 100 -i 0.05 -W 1 "$PEER" 2>&1 || true)"
echo "$PING_OUT" | tail -3 | sed 's/^/ /'
# Parse "X% packet loss" — works on both Linux and macOS ping output.
# `|| true`: under `set -e` + `pipefail` a non-matching grep would otherwise
# abort the script right here, before the explicit error below can be printed.
OBSERVED="$(echo "$PING_OUT" | grep -oE '[0-9.]+% packet loss' | head -1 | awk '{print $1}' | tr -d '%' || true)"
if [[ -z "$OBSERVED" ]]; then
  fail "could not parse ping output"
  exit 1
fi

# Sanity bracket: with only n=100 probes the estimate is noisy, so accept any
# observed loss strictly between 0.4x the configured value and 2x the
# configured value plus 3 percentage points.
ABS_DELTA=$(awk -v o="$OBSERVED" -v l="$LOSS" 'BEGIN{d=o-l; if(d<0)d=-d; printf "%.1f", d}')
step "Result"
printf ' configured: %s%%\n observed: %s%%\n |delta|: %s pp\n' "$LOSS" "$OBSERVED" "$ABS_DELTA"
# Remembered so the script can still walk through the ingress check in BIDI
# mode, yet report the egress failure through its exit status.
VERIFY_RC=0
if awk -v o="$OBSERVED" -v l="$LOSS" 'BEGIN{exit !(o > l*0.4 && o < l*2.0 + 3)}'; then
  ok "loss is being applied in the egress direction of $IFACE"
else
  fail "observed loss ($OBSERVED%) does not match configured ($LOSS%)"
  warn "either the qdisc isn't routing as expected, or the kernel's netem"
  warn "build doesn't include the loss module. Check 'modprobe sch_netem'."
  VERIFY_RC=1
fi

if [[ "$BIDI" -eq 1 ]]; then
  step "Now verify the INGRESS direction"
  THIS_IP="$(ip -4 addr show dev "$IFACE" | awk '/inet / {print $2}' | cut -d/ -f1 | head -1)"
  # Placeholder when the interface has no IPv4 address. Resolved here rather
  # than inside the heredoc, where the escaped quote printed a stray backslash.
  [[ -n "$THIS_IP" ]] || THIS_IP="<this host's IP on $IFACE>"
  cat <<EOF
From the peer machine, run:
ping -c 100 -i 0.05 ${THIS_IP}
You should see ~${LOSS}% packet loss in the peer's ping output. That confirms
ingress shaping is dropping packets arriving here on $IFACE.
Press Enter when done to tear everything down.
EOF
  # Hold the qdiscs in place until the operator confirms; the EXIT trap then
  # removes them.
  read -r _ </dev/tty || true
fi

# Non-zero when the egress check failed, so callers and automation can tell.
exit "$VERIFY_RC"

View File

@@ -4,3 +4,21 @@ version = "0.1.0"
edition = "2024" edition = "2024"
[dependencies] [dependencies]
thiserror = "2"
anyhow = "1"
bevy = "0.18"
game_sockets = { git = "https://github.com/VALERE91/game_sockets.git"}
substrate = { path = "../substrate" }
quinn = "0.11"
rustls = "0.23"
rustls-pemfile = "2"
rustls-pki-types = "1"
tokio = { version = "1", features = ["full"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
uuid = { version = "1.23", features = ["v4"] }
bytes = "1"
clap = { version = "4", features = ["derive"] }
[dev-dependencies]
tokio = { version = "1", features = ["full", "test-util"] }

189
simulator/src/client.rs Normal file
View File

@@ -0,0 +1,189 @@
use std::net::SocketAddr;
use std::path::Path;
use std::sync::Arc;
use anyhow::{anyhow, Context};
use quinn::{ClientConfig, Connection, Endpoint};
use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified, ServerCertVerifier};
use rustls::pki_types::{CertificateDer, ServerName, UnixTime};
use rustls::{DigitallySignedStruct, SignatureScheme};
use substrate::transport::QuicMessage;
/// QUIC client for driving the substrate from tests, smoke runs, and
/// (eventually) the full Bevy-driven sensor generator.
///
/// `connect` trusts the server's PEM cert by **exact byte match** — using a
/// custom `ServerCertVerifier` that compares the leaf against the cert at
/// `cert_path`. This sidesteps rustls' `CaUsedAsEndEntity` rejection of our
/// self-signed cert (which acts as both trust anchor and leaf) without
/// disabling signature verification or weakening the handshake.
pub struct SimulatorClient {
/// Local QUIC endpoint the connection was opened from; `close` waits for it
/// to go idle.
pub endpoint: Endpoint,
/// Established connection to the substrate; every send/request helper goes
/// through it.
pub conn: Connection,
}
impl SimulatorClient {
    /// Open a QUIC connection to the substrate at `server_addr`.
    ///
    /// * `server_addr` — address of the substrate's QUIC listener.
    /// * `server_name` — name handed to the TLS handshake.
    /// * `cert_path` — PEM file whose first certificate is the only server
    ///   certificate this client will accept (compared byte-for-byte).
    ///
    /// A process-wide rustls crypto provider must already be installed.
    ///
    /// # Errors
    /// Fails if the PEM file cannot be read or parsed, contains no
    /// certificate, no default crypto provider is installed, the local UDP
    /// socket cannot be bound, or the handshake does not complete.
    pub async fn connect(
        server_addr: SocketAddr,
        server_name: &str,
        cert_path: impl AsRef<Path>,
    ) -> anyhow::Result<Self> {
        let cert_path = cert_path.as_ref();
        let cert_pem = std::fs::read(cert_path)
            .with_context(|| format!("read trust cert at {}", cert_path.display()))?;
        let parsed: Vec<CertificateDer<'static>> = rustls_pemfile::certs(&mut cert_pem.as_slice())
            .collect::<Result<_, _>>()
            .with_context(|| format!("parse PEM certs at {}", cert_path.display()))?;
        let expected = parsed
            .into_iter()
            .next()
            .ok_or_else(|| anyhow!("no certificates found in {}", cert_path.display()))?;

        // Reuse the process-wide rustls provider that `install_crypto_provider`
        // (or substrate's main) already installed. Failing to find one here
        // means nobody installed a default — caller error.
        let provider = rustls::crypto::CryptoProvider::get_default()
            .ok_or_else(|| anyhow!("no rustls default crypto provider installed"))?
            .clone();
        let verifier = Arc::new(TrustExactCert {
            expected,
            provider: provider.clone(),
        });
        let rustls_cfg = rustls::ClientConfig::builder_with_provider(provider)
            .with_safe_default_protocol_versions()
            .context("rustls client builder")?
            .dangerous()
            .with_custom_certificate_verifier(verifier)
            .with_no_client_auth();
        let quic_cfg = quinn::crypto::rustls::QuicClientConfig::try_from(rustls_cfg)
            .context("wrap rustls config for QUIC")?;
        let client_cfg = ClientConfig::new(Arc::new(quic_cfg));

        // Wildcard bind (ephemeral port) in the server's address family. Built
        // from the std constants, so there is no runtime parse and no `unwrap`
        // on this path.
        let bind = if server_addr.is_ipv6() {
            SocketAddr::from((std::net::Ipv6Addr::UNSPECIFIED, 0))
        } else {
            SocketAddr::from((std::net::Ipv4Addr::UNSPECIFIED, 0))
        };
        let mut endpoint = Endpoint::client(bind).context("Endpoint::client bind")?;
        endpoint.set_default_client_config(client_cfg);

        let connecting = endpoint
            .connect(server_addr, server_name)
            .with_context(|| format!("client connect to {server_addr} as {server_name}"))?;
        let conn = connecting.await.context("client TLS handshake")?;
        tracing::info!(remote = %conn.remote_address(), "simulator client connected");
        Ok(Self { endpoint, conn })
    }

    /// T1 — send one `QuicMessage` as a single QUIC datagram
    /// (`QuicMessage::WIRE_SIZE` bytes, fixed).
    ///
    /// # Errors
    /// Returns the underlying `send_datagram` failure, wrapped with context.
    pub fn send_datagram(&self, msg: &QuicMessage) -> anyhow::Result<()> {
        let bytes = bytes::Bytes::copy_from_slice(&msg.to_bytes());
        self.conn.send_datagram(bytes).context("send_datagram")
    }

    /// T2 — open a unidirectional stream, write each message back-to-back
    /// (`QuicMessage::WIRE_SIZE` bytes apiece), then `finish()` the stream.
    /// The substrate sees one or many events per stream, ordered within the
    /// stream.
    ///
    /// # Errors
    /// Fails if the stream cannot be opened, a write fails, or the stream
    /// cannot be finished.
    pub async fn send_uni_stream(&self, msgs: &[QuicMessage]) -> anyhow::Result<()> {
        let mut send = self.conn.open_uni().await.context("open_uni")?;
        for msg in msgs {
            send.write_all(&msg.to_bytes())
                .await
                .context("write QuicMessage to uni stream")?;
        }
        send.finish().context("finish uni stream")
    }

    /// T3 — open a bidirectional stream, write the command
    /// (`QuicMessage::WIRE_SIZE` bytes), finish the send half, then read an
    /// equally sized ack from the substrate.
    ///
    /// NOTE(review): the simulator's other modules describe T3 as
    /// substrate-initiated; confirm this client-initiated path is still used.
    ///
    /// # Errors
    /// Errors if the substrate resets the stream (e.g. no handler installed
    /// yet), if the connection drops mid-exchange, or if the ack cannot be
    /// decoded.
    pub async fn request(&self, command: &QuicMessage) -> anyhow::Result<QuicMessage> {
        let (mut send, mut recv) = self.conn.open_bi().await.context("open_bi")?;
        send.write_all(&command.to_bytes())
            .await
            .context("write T3 command")?;
        send.finish().context("finish T3 send half")?;

        let mut buf = [0u8; QuicMessage::WIRE_SIZE];
        recv.read_exact(&mut buf).await.context("read T3 ack")?;
        QuicMessage::decode(&buf).context("decode T3 ack")
    }

    /// Close the connection gracefully. Use before dropping in tests so the
    /// peer's `conn.closed()` resolves cleanly instead of via timeout.
    pub async fn close(&self) {
        self.conn.close(0u32.into(), b"client done");
        // Wait for the endpoint to go idle before returning.
        self.endpoint.wait_idle().await;
    }
}
/// `ServerCertVerifier` that accepts exactly one specific cert by byte
/// equality. Signature verification still runs through the default provider —
/// only the chain-validity check is replaced.
#[derive(Debug)]
struct TrustExactCert {
/// DER bytes of the single certificate that is trusted.
expected: CertificateDer<'static>,
/// Crypto provider whose signature-verification algorithms back the
/// TLS 1.2 / 1.3 handshake signature checks.
provider: Arc<rustls::crypto::CryptoProvider>,
}
impl ServerCertVerifier for TrustExactCert {
/// Accept the presented leaf only if its DER bytes equal `self.expected`.
///
/// Intermediates, the requested server name, the OCSP response and the
/// current time are all ignored: identity is pinned to the exact
/// certificate, so no hostname or validity-period check happens here.
fn verify_server_cert(
&self,
end_entity: &CertificateDer<'_>,
_intermediates: &[CertificateDer<'_>],
_server_name: &ServerName<'_>,
_ocsp_response: &[u8],
_now: UnixTime,
) -> Result<ServerCertVerified, rustls::Error> {
if end_entity.as_ref() == self.expected.as_ref() {
Ok(ServerCertVerified::assertion())
} else {
Err(rustls::Error::General(
"server cert does not match trusted dev cert".into(),
))
}
}
/// Check a TLS 1.2 handshake signature by delegating to rustls with the
/// provider's signature-verification algorithms.
fn verify_tls12_signature(
&self,
message: &[u8],
cert: &CertificateDer<'_>,
dss: &DigitallySignedStruct,
) -> Result<HandshakeSignatureValid, rustls::Error> {
rustls::crypto::verify_tls12_signature(
message,
cert,
dss,
&self.provider.signature_verification_algorithms,
)
}
/// Check a TLS 1.3 handshake signature by delegating to rustls with the
/// provider's signature-verification algorithms.
fn verify_tls13_signature(
&self,
message: &[u8],
cert: &CertificateDer<'_>,
dss: &DigitallySignedStruct,
) -> Result<HandshakeSignatureValid, rustls::Error> {
rustls::crypto::verify_tls13_signature(
message,
cert,
dss,
&self.provider.signature_verification_algorithms,
)
}
/// Signature schemes this verifier can check: those of the provider.
fn supported_verify_schemes(&self) -> Vec<SignatureScheme> {
self.provider.signature_verification_algorithms.supported_schemes()
}
}

96
simulator/src/commands.rs Normal file
View File

@@ -0,0 +1,96 @@
//! Substrate → simulator T3 receiver.
//!
//! The substrate is the brain: when its `automation_system` decides to
//! actuate, it opens a QUIC bidirectional stream to one of its connected
//! devices. The simulator side accepts those streams here, decodes the
//! fixed-size command (`QuicMessage::WIRE_SIZE` bytes), applies it to local
//! actuator state, and writes an ack of the same size back. This closes the
//! loop the paper's three-tier model describes.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use substrate::transport::{QuicMessage, SensorType};
/// Build the shared "engine running" flag handed to `main.rs` and to
/// integration tests.
///
/// The flag starts out `true`, i.e. the simulated engine is running normally.
pub fn new_engine_state() -> Arc<AtomicBool> {
    let running = AtomicBool::new(true);
    Arc::new(running)
}
/// Accept substrate-initiated bidirectional streams until `accept_bi` fails
/// (for example because the connection has gone away).
///
/// Each accepted stream carries one (command, ack) round-trip and is handled
/// on its own spawned task: the task reads one `QuicMessage`
/// (`QuicMessage::WIRE_SIZE` bytes), updates `engine_running` when the
/// command targets the Relay actuator, and writes the command back as the ack.
pub async fn run_command_receiver(conn: quinn::Connection, engine_running: Arc<AtomicBool>) {
    let remote = conn.remote_address();
    // Number of streams accepted so far; only reported when the loop ends.
    let mut streams_seen: u64 = 0;
    loop {
        match conn.accept_bi().await {
            Ok((send, recv)) => {
                streams_seen += 1;
                let flag = Arc::clone(&engine_running);
                tokio::spawn(handle_one_command(remote, send, recv, flag));
            }
            Err(e) => {
                tracing::debug!(
                    ?remote,
                    streams_seen,
                    error = %e,
                    "command receiver: accept_bi loop ended"
                );
                break;
            }
        }
    }
}
/// Serves one (command, ack) round-trip on an accepted bidirectional stream.
///
/// Reads exactly `QuicMessage::WIRE_SIZE` bytes and decodes them. A short
/// read just returns; a decode failure resets the send side with code 0.
/// A Relay command sets `engine_running` (`raw_value < 0.5` means running)
/// and logs only when the state actually changes; any other sensor type is
/// ignored with a debug log. In both cases the decoded command is
/// re-encoded, written back as the ack, and the send side is finished.
async fn handle_one_command(
    remote: std::net::SocketAddr,
    mut send: quinn::SendStream,
    mut recv: quinn::RecvStream,
    engine_running: Arc<AtomicBool>,
) {
    let mut wire = [0u8; QuicMessage::WIRE_SIZE];
    if let Err(e) = recv.read_exact(&mut wire).await {
        tracing::trace!(?remote, error = %e, "command receiver: short read; closing stream");
        return;
    }

    let cmd = match QuicMessage::decode(&wire) {
        Ok(decoded) => decoded,
        Err(e) => {
            tracing::warn!(?remote, error = %e, "command receiver: decode failed");
            let _ = send.reset(0u32.into());
            return;
        }
    };

    if cmd.typ() != SensorType::Relay {
        tracing::debug!(
            ?remote,
            sensor_type = cmd.sensor_type,
            "command receiver: ignoring non-Relay command"
        );
    } else {
        // raw_value == 1.0 ⇒ stop the engine; 0.0 ⇒ resume.
        let now_running = cmd.raw_value < 0.5;
        let was_running = engine_running.swap(now_running, Ordering::SeqCst);
        match (was_running, now_running) {
            (false, true) => {
                tracing::info!(device = %cmd.device_id, "Relay=0 received — engine resuming");
            }
            (true, false) => {
                tracing::info!(device = %cmd.device_id, "Relay=1 received — engine stopping");
            }
            // State unchanged: nothing worth logging.
            _ => {}
        }
    }

    // The ack is the command echoed back; per the original note, the
    // substrate's outbound drain measures latency from open_bi() to ack
    // receipt.
    let ack = cmd.to_bytes();
    match send.write_all(&ack).await {
        Err(e) => {
            tracing::warn!(?remote, error = %e, "command receiver: ack write failed");
        }
        Ok(()) => {
            if let Err(e) = send.finish() {
                tracing::warn!(?remote, error = %e, "command receiver: ack finish failed");
            }
        }
    }
}

86
simulator/src/emitters.rs Normal file
View File

@@ -0,0 +1,86 @@
//! Async emitter task for T2 (uni streams).
//!
//! Ticks at its own rate, writes every event onto one long-lived stream,
//! and shares a `Connection` with the rest of the simulator. T1 (datagrams)
//! is driven inline by the main loop so the foreground task owns the
//! progress reporting; T2 runs as a `tokio::spawn`ed background task.
//!
//! T3 (actuator commands) is substrate-initiated — the receiver lives in
//! `crate::commands`, not here.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use substrate::transport::QuicMessage;
use tokio::time::MissedTickBehavior;
use crate::profile::{SensorSlot, generate_value};
/// Wall-clock time as microseconds since the UNIX epoch — the timestamp
/// the simulator stamps into every outgoing `QuicMessage`.
///
/// Per the original note, substrate-side latency is computed as
/// `substrate_now_us - msg.timestamp_us`, so both ends need a shared wall
/// clock (NTP for two machines; loopback otherwise).
///
/// Returns `0` if the system clock reads earlier than the epoch.
pub fn now_us() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_micros() as u64,
        Err(_) => 0,
    }
}
/// T2 emitter — opens one uni stream up front, writes one `QuicMessage`
/// onto it per tick, and `finish`es the stream when the loop ends.
///
/// * `conn` — connection the uni stream is opened on.
/// * `slot` — device/sensor identity stamped into every message; its `seq`
///   advances (wrapping) once per generated message.
/// * `rate_hz` — tick rate. Must be positive and finite; otherwise the
///   emitter logs a warning and returns `0` without opening a stream.
/// * `interrupted` — checked after every tick; the loop exits once it is set.
/// * `engine_running` — read each tick and forwarded to `generate_value`.
/// * `counter` — overwritten with the running total after each successful
///   write so other tasks can observe progress.
///
/// Returns the number of messages written to the stream. The loop also ends
/// early if a write fails.
pub async fn run_t2_emitter(
    conn: quinn::Connection,
    mut slot: SensorSlot,
    rate_hz: f64,
    interrupted: Arc<AtomicBool>,
    engine_running: Arc<AtomicBool>,
    counter: Arc<AtomicU64>,
) -> u64 {
    if !rate_hz.is_finite() || rate_hz <= 0.0 {
        tracing::warn!(rate_hz, "T2 rate must be positive and finite; emitter exiting");
        return 0;
    }
    // The truncating cast yields 0 ns for any rate above 1e9 Hz, and
    // `tokio::time::interval` panics on a zero period — clamp to 1 ns.
    let period_ns = ((1.0e9 / rate_hz) as u64).max(1);
    let mut ticker = tokio::time::interval(Duration::from_nanos(period_ns));
    ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);

    let mut sent: u64 = 0;
    let mut send = match conn.open_uni().await {
        Ok(s) => s,
        Err(e) => {
            tracing::warn!(error = %e, "T2 open_uni failed; emitter exiting");
            return 0;
        }
    };

    loop {
        ticker.tick().await;
        if interrupted.load(Ordering::SeqCst) {
            break;
        }
        let running = engine_running.load(Ordering::Relaxed);
        let msg = QuicMessage {
            device_id: slot.device_id,
            sensor_id: slot.sensor_id,
            raw_value: generate_value(slot.sensor_type, slot.seq, running),
            timestamp_us: now_us(),
            sequence_number: slot.seq,
            sensor_type: slot.sensor_type.as_u8(),
        };
        slot.seq = slot.seq.wrapping_add(1);
        if let Err(e) = send.write_all(&msg.to_bytes()).await {
            tracing::warn!(error = %e, "T2 write_all failed; stream closed?");
            break;
        }
        sent += 1;
        counter.store(sent, Ordering::Relaxed);
    }

    if let Err(e) = send.finish() {
        tracing::warn!(error = %e, "T2 finish failed");
    }
    sent
}

13
simulator/src/lib.rs Normal file
View File

@@ -0,0 +1,13 @@
pub mod client;
pub mod commands;
pub mod emitters;
pub mod profile;
/// Installs the `aws_lc_rs` rustls crypto provider as the process default.
///
/// Idempotent from the caller's point of view: the result of
/// `install_default` is discarded, so calling this from every test, every
/// binary entry point, and the substrate process is harmless. Per the
/// original note, `aws_lc_rs` matches what the substrate installs in its
/// `main.rs`.
pub fn install_crypto_provider() {
    let provider = rustls::crypto::aws_lc_rs::default_provider();
    // `install_default` returns Err when a provider is already installed;
    // that is the expected case in any process that has already booted the
    // substrate or a sibling test, so the result is ignored on purpose.
    let _ = provider.install_default();
}

View File

@@ -1,3 +1,331 @@
fn main() { //! Manual smoke runner / load driver for the substrate.
println!("Hello, world!"); //!
//! Parses the CLI, builds the per-device sensor layout, then drives T1
//! datagrams in the foreground while the T2 emitter and the T3 command
//! receiver run as background tokio tasks. Helpers live in the simulator
//! library:
//!
//! - `simulator::profile` — `SensorProfile`, `SensorSlot`, waveform generator
//! - `simulator::emitters` — `run_t2_emitter`, `now_us`
//! - `simulator::commands` — `run_command_receiver`, `new_engine_state`
//! - `simulator::client` — Quinn client + TLS trust-by-cert verifier
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::{Duration, Instant};
use anyhow::{Context, anyhow};
use clap::{Parser, ValueEnum};
use simulator::client::SimulatorClient;
use simulator::commands::{new_engine_state, run_command_receiver};
use simulator::emitters::{now_us, run_t2_emitter};
use simulator::profile::{SensorProfile, build_slots, generate_value};
use substrate::transport::{QuicMessage, SensorType};
use tokio::time::MissedTickBehavior;
use tracing_subscriber::EnvFilter;
// Command-line options for the simulator binary, parsed by clap's derive
// API. The `///` comments on the fields below are rendered by clap as the
// `--help` text, so editing them changes user-visible output.
// Cross-field constraints (non-negative rates, at least one non-zero rate,
// at least one device) are enforced separately by `validate`.
#[derive(Parser, Debug)]
#[command(name = "simulator", about, long_about = None)]
struct Cli {
/// Substrate address (host:port).
#[arg(long, default_value = "127.0.0.1:9000")]
addr: SocketAddr,
/// SNI name presented during the TLS handshake.
#[arg(long, default_value = "localhost")]
server_name: String,
/// Path to the substrate's PEM cert; used as the exact-match trust anchor.
#[arg(long, default_value = "certs/server.crt")]
cert: PathBuf,
/// Sensor mix per device.
///
/// - `single` (default): one sensor per device of `--sensor-type`, on
/// `--sensor-id`. Lowest-cardinality, easiest to reason about.
/// - `industrial`: seven sensors per device on ids 0..6 — Temperature,
/// Humidity, Pressure, Voltage, Current, Presence, Relay. Lights up
/// every dashboard panel and primes the closed-loop demo (Presence
/// dips below threshold → substrate dispatches T3 Relay setpoints).
#[arg(long, value_enum, default_value_t = SensorProfile::Single)]
profile: SensorProfile,
/// Sensor type for the `single` profile. Ignored by `industrial`.
#[arg(long, value_enum, default_value_t = CliSensorType::Generic)]
sensor_type: CliSensorType,
/// T1 datagram rate across all (device, sensor) slots (Hz). `0` disables T1.
#[arg(long, default_value_t = 20.0)]
rate_hz: f64,
/// T2 uni-stream event rate (Hz). `0` disables T2 (default).
#[arg(long, default_value_t = 0.0)]
t2_rate_hz: f64,
/// Number of T1 datagrams to send. `0` runs until Ctrl-C.
#[arg(long, default_value_t = 10)]
count: u64,
/// Number of distinct device UUIDs to round-robin.
#[arg(long, default_value_t = 1)]
devices: u32,
/// Sensor index for the `single` profile. Ignored by `industrial`.
#[arg(long, default_value_t = 0)]
sensor_id: u16,
}
// Sensor types selectable through `--sensor-type` for the `single` profile.
// This is a CLI-facing mirror of `SensorType`; the `From` impl below maps
// each variant one-to-one. `Presence` and `Relay` are not offered here.
// Plain `//` comments are used deliberately: clap would render `///`
// comments on the variants as help text.
#[derive(ValueEnum, Clone, Copy, Debug, Default)]
enum CliSensorType {
#[default]
Generic,
Temperature,
Humidity,
Pressure,
Voltage,
Current,
}
/// Maps the CLI-facing sensor choice onto the substrate's wire-level
/// `SensorType`, variant for variant.
impl From<CliSensorType> for SensorType {
    fn from(choice: CliSensorType) -> Self {
        match choice {
            CliSensorType::Generic => Self::Generic,
            CliSensorType::Temperature => Self::Temperature,
            CliSensorType::Humidity => Self::Humidity,
            CliSensorType::Pressure => Self::Pressure,
            CliSensorType::Voltage => Self::Voltage,
            CliSensorType::Current => Self::Current,
        }
    }
}
/// Checks cross-field constraints that clap's per-argument parsing cannot
/// express.
///
/// # Errors
///
/// Returns an error when:
/// - `--rate-hz` or `--t2-rate-hz` is NaN, negative, or above 1e9 Hz
///   (a higher rate truncates to a zero-nanosecond tick period, on which
///   `tokio::time::interval` panics);
/// - both rates are zero, so nothing would be emitted;
/// - `--devices` is zero.
fn validate(cli: &Cli) -> anyhow::Result<()> {
    // Highest rate whose period still rounds to at least one nanosecond.
    const MAX_RATE_HZ: f64 = 1.0e9;

    for (flag, hz) in [("--rate-hz", cli.rate_hz), ("--t2-rate-hz", cli.t2_rate_hz)] {
        if hz.is_nan() {
            return Err(anyhow!("{flag} must be a number"));
        }
        if hz < 0.0 {
            return Err(anyhow!("{flag} must be >= 0"));
        }
        if hz > MAX_RATE_HZ {
            return Err(anyhow!("{flag} must be <= {MAX_RATE_HZ:e} Hz"));
        }
    }
    if cli.rate_hz == 0.0 && cli.t2_rate_hz == 0.0 {
        return Err(anyhow!(
            "at least one of --rate-hz / --t2-rate-hz must be > 0"
        ));
    }
    if cli.devices == 0 {
        return Err(anyhow!("--devices must be >= 1"));
    }
    Ok(())
}
/// Entry point: parses and validates the CLI, connects to the substrate,
/// spawns the background tasks (Ctrl-C watcher, T3 command receiver,
/// optional T2 emitter, optional HTTP trigger listener), then drives the T1
/// datagram loop in the foreground and logs a summary before closing the
/// connection.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
)
.init();
let cli = Cli::parse();
validate(&cli)?;
simulator::install_crypto_provider();
// Flat list of (device, sensor) slots the T1 loop rotates through.
let mut slots = build_slots(
cli.profile,
cli.devices,
cli.sensor_type.into(),
cli.sensor_id,
);
tracing::info!(
?cli.addr,
rate_hz = cli.rate_hz,
t2_rate_hz = cli.t2_rate_hz,
count = cli.count,
devices = cli.devices,
slots = slots.len(),
profile = ?cli.profile,
"simulator launching"
);
let client = SimulatorClient::connect(cli.addr, &cli.server_name, &cli.cert)
.await
.context("connect to substrate")?;
// Shared shutdown flag: set by the Ctrl-C watcher below and again after
// the T1 loop ends, so the T2 emitter stops either way.
let interrupted = Arc::new(AtomicBool::new(false));
{
let flag = interrupted.clone();
tokio::spawn(async move {
let _ = tokio::signal::ctrl_c().await;
tracing::info!("Ctrl-C received, draining…");
flag.store(true, Ordering::SeqCst);
});
}
// Engine state: starts running. Flipped by `run_command_receiver` when
// the substrate's automation_system sends a Relay actuator command.
let engine_running = new_engine_state();
{
let conn = client.conn.clone();
let engine_running = engine_running.clone();
tokio::spawn(async move {
run_command_receiver(conn, engine_running).await;
});
}
// T2 emitter targets slot[0] for its device/sensor identity. T3 commands
// are substrate-initiated; there's no simulator-side emitter for them.
// Indexing slot 0 relies on `validate` having rejected `--devices 0`, so
// `build_slots` produced at least one slot.
let t2_slot = slots[0].clone();
let t2_sent = Arc::new(AtomicU64::new(0));
let t2_handle = if cli.t2_rate_hz > 0.0 {
let conn = client.conn.clone();
let rate = cli.t2_rate_hz;
let interrupted = interrupted.clone();
let counter = t2_sent.clone();
let engine_running = engine_running.clone();
Some(tokio::spawn(async move {
run_t2_emitter(conn, t2_slot, rate, interrupted, engine_running, counter).await
}))
} else {
None
};
// Demo hook: when the layout contains a Presence slot, serve a minimal
// hand-rolled HTTP endpoint; `POST /trigger` pushes a single Presence=0.0
// reading over a fresh uni stream.
// NOTE(review): the listener binds all interfaces on port 9002 with no
// authentication and a wildcard CORS header; a bind failure is silently
// ignored. Confirm this is intended outside a lab setup.
let presence_slot_opt = slots.iter().find(|s| s.sensor_type == SensorType::Presence).cloned();
let conn_clone = client.conn.clone();
if let Some(presence_slot) = presence_slot_opt {
tokio::spawn(async move {
if let Ok(listener) = tokio::net::TcpListener::bind("0.0.0.0:9002").await {
tracing::info!("Simulator HTTP trigger API listening on 0.0.0.0:9002");
while let Ok((mut socket, _)) = listener.accept().await {
let conn = conn_clone.clone();
let slot = presence_slot.clone();
tokio::spawn(async move {
// Only the first read (up to 1024 bytes) is inspected; the request is
// matched on its leading bytes rather than parsed.
let mut buf = [0; 1024];
use tokio::io::{AsyncReadExt, AsyncWriteExt};
if let Ok(n) = socket.read(&mut buf).await {
let req = String::from_utf8_lossy(&buf[..n]);
if req.starts_with("OPTIONS") {
let res = "HTTP/1.1 204 No Content\r\nAccess-Control-Allow-Origin: *\r\nAccess-Control-Allow-Methods: POST, OPTIONS\r\n\r\n";
let _ = socket.write_all(res.as_bytes()).await;
} else if req.starts_with("POST /trigger") {
if let Ok(mut send) = conn.open_uni().await {
// NOTE(review): the triggered reading always carries sequence_number 0,
// independent of the slot's own counter.
let msg = QuicMessage {
device_id: slot.device_id,
sensor_id: slot.sensor_id,
raw_value: 0.0,
timestamp_us: now_us(),
sequence_number: 0,
sensor_type: slot.sensor_type.as_u8(),
};
let _ = send.write_all(&msg.to_bytes()).await;
let _ = send.finish();
tracing::info!("HTTP API triggered: pushed Presence=0.0 over T2");
}
// 200 is returned even when `open_uni` failed and nothing was sent.
let res = "HTTP/1.1 200 OK\r\nAccess-Control-Allow-Origin: *\r\n\r\nTriggered";
let _ = socket.write_all(res.as_bytes()).await;
} else {
let res = "HTTP/1.1 404 Not Found\r\nAccess-Control-Allow-Origin: *\r\n\r\n";
let _ = socket.write_all(res.as_bytes()).await;
}
}
});
}
}
});
}
let started = Instant::now();
let mut t1_sent: u64 = 0;
let mut send_errors: u64 = 0;
if cli.rate_hz > 0.0 {
// NOTE(review): for rates above 1e9 Hz this cast truncates to a zero
// period, and `tokio::time::interval` panics on a zero period.
let period = Duration::from_nanos((1.0e9 / cli.rate_hz) as u64);
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
let unlimited = cli.count == 0;
let mut last_progress = started;
loop {
ticker.tick().await;
if interrupted.load(Ordering::SeqCst) {
break;
}
if !unlimited && t1_sent >= cli.count {
break;
}
// Round-robin over the slots, one datagram per tick.
let slot_idx = (t1_sent as usize) % slots.len();
let running = engine_running.load(Ordering::Relaxed);
let slot = &mut slots[slot_idx];
let msg = QuicMessage {
device_id: slot.device_id,
sensor_id: slot.sensor_id,
raw_value: generate_value(slot.sensor_type, slot.seq, running),
timestamp_us: now_us(),
sequence_number: slot.seq,
sensor_type: slot.sensor_type.as_u8(),
};
slot.seq = slot.seq.wrapping_add(1);
// `t1_sent` counts attempts: it is bumped before the send, so failed
// sends are included here and also tallied in `send_errors`.
t1_sent += 1;
if let Err(e) = client.send_datagram(&msg) {
send_errors += 1;
tracing::warn!(error = %e, "send_datagram failed");
}
// Emit a progress line at most once per second.
let now = Instant::now();
if now.duration_since(last_progress) >= Duration::from_secs(1) {
let elapsed = now.duration_since(started).as_secs_f64();
let t1_hz = (t1_sent as f64) / elapsed.max(1e-9);
let t2_now = t2_sent.load(Ordering::Relaxed);
let t2_hz = (t2_now as f64) / elapsed.max(1e-9);
let engine_state = if engine_running.load(Ordering::Relaxed) {
"running"
} else {
"stopped"
};
tracing::info!(
t1_sent,
t2_sent = t2_now,
send_errors,
t1_hz = format_args!("{:.1}", t1_hz),
t2_hz = format_args!("{:.1}", t2_hz),
engine = engine_state,
"progress"
);
last_progress = now;
}
}
} else {
// T1 disabled: idle until Ctrl-C while the background tasks run.
while !interrupted.load(Ordering::SeqCst) {
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
// Tell the T2 emitter to stop (the T1 loop may have ended on `count`
// rather than Ctrl-C), then collect its final tally.
interrupted.store(true, Ordering::SeqCst);
let t2_total: u64 = match t2_handle {
Some(h) => h.await.unwrap_or_else(|e| {
tracing::warn!(error = %e, "T2 emitter task ended unexpectedly");
0
}),
None => 0,
};
let elapsed = started.elapsed().as_secs_f64();
let t1_hz = (t1_sent as f64) / elapsed.max(1e-9);
let t2_hz = (t2_total as f64) / elapsed.max(1e-9);
tracing::info!(
t1_sent,
t2_sent = t2_total,
send_errors,
elapsed_s = format_args!("{:.3}", elapsed),
t1_observed_hz = format_args!("{:.1}", t1_hz),
t2_observed_hz = format_args!("{:.1}", t2_hz),
"simulator done"
);
client.close().await;
Ok(())
} }

106
simulator/src/profile.rs Normal file
View File

@@ -0,0 +1,106 @@
//! Per-device sensor layout (the `--profile` CLI flag's runtime form) and the
//! type-appropriate waveform generators that feed the substrate's Grafana
//! dashboard with believable numbers.
use clap::ValueEnum;
use substrate::transport::SensorType;
use uuid::Uuid;
/// Per-device sensor layout selected by the `--profile` CLI flag.
///
/// - `Single`: one sensor per device of a chosen `SensorType`. Lowest
/// cardinality; the right pick for throughput / latency benchmarks.
/// - `Industrial`: seven sensors per device on ids 0..6 — Temperature,
/// Humidity, Pressure, Voltage, Current, Presence, Relay (the layout
/// `build_slots` emits). Lights up every sensor-type panel in the
/// operator dashboard.
#[derive(ValueEnum, Clone, Copy, Debug)]
pub enum SensorProfile {
Single,
Industrial,
}
/// A single emitter slot: the `(device, sensor, type)` triple plus the
/// per-slot monotonic sequence counter that the simulator advances on every
/// outgoing message.
#[derive(Clone, Debug)]
pub struct SensorSlot {
/// UUID of the simulated device this slot belongs to.
pub device_id: Uuid,
/// Sensor index within the device.
pub sensor_id: u16,
/// Kind of sensor; selects the waveform in `generate_value`.
pub sensor_type: SensorType,
/// Next sequence number to stamp; `build_slots` starts it at 0.
pub seq: u32,
}
/// Turns a `(profile, num_devices)` choice into the flat slot list the T1
/// emitter rotates through.
///
/// Every device gets a freshly generated v4 UUID. With
/// `SensorProfile::Single` a device contributes one slot built from
/// `default_type` / `default_sensor_id`; with `SensorProfile::Industrial` it
/// contributes seven slots on ids 0..6 (Temperature, Humidity, Pressure,
/// Voltage, Current, Presence, Relay) and the defaults are ignored. All
/// sequence counters start at 0. Slots are grouped by device, in layout
/// order.
pub fn build_slots(
    profile: SensorProfile,
    num_devices: u32,
    default_type: SensorType,
    default_sensor_id: u16,
) -> Vec<SensorSlot> {
    let single_layout = [(default_sensor_id, default_type)];
    let industrial_layout = [
        (0u16, SensorType::Temperature),
        (1, SensorType::Humidity),
        (2, SensorType::Pressure),
        (3, SensorType::Voltage),
        (4, SensorType::Current),
        (5, SensorType::Presence),
        (6, SensorType::Relay),
    ];
    // The per-device sensor list depends only on the profile, so pick it once.
    let layout: &[(u16, SensorType)] = match profile {
        SensorProfile::Single => &single_layout[..],
        SensorProfile::Industrial => &industrial_layout[..],
    };

    let mut slots = Vec::new();
    for _ in 0..num_devices {
        let device_id = Uuid::new_v4();
        slots.extend(layout.iter().map(|&(sensor_id, sensor_type)| SensorSlot {
            device_id,
            sensor_id,
            sensor_type,
            seq: 0,
        }));
    }
    slots
}
/// Produces a plausible reading for sensor type `t` at sample index `seq`.
///
/// The sample index is scaled by 0.05 to form the phase fed to the trig
/// functions, so the waveform depends on how many samples were emitted, not
/// on the actual send rate.
///
/// `engine_running` only affects `Current`: roughly 10 A (±2) while the
/// engine runs, and 0.05–0.10 A once it is stopped. `Voltage` stays near
/// 230 V in both states. `Presence` swings between 0.5 and 3.5, so it dips
/// below 1.0 for part of each cycle. `Relay` always reads 0.0 — relay
/// commands are substrate-initiated and this value is unused on the
/// simulator side.
pub fn generate_value(t: SensorType, seq: u32, engine_running: bool) -> f64 {
    let phase = f64::from(seq) * 0.05;
    let sin_over = |divisor: f64| (phase / divisor).sin();
    let cos_over = |divisor: f64| (phase / divisor).cos();
    match t {
        SensorType::Generic => phase.sin(),
        SensorType::Temperature => 20.0 + 5.0 * sin_over(10.0),
        SensorType::Humidity => 50.0 + 20.0 * sin_over(15.0),
        SensorType::Pressure => 1013.0 + 5.0 * cos_over(20.0),
        // Mains voltage: stable regardless of motor state.
        SensorType::Voltage => 230.0 + 0.5 * sin_over(3.0),
        // Motor draw while running.
        SensorType::Current if engine_running => 10.0 + 2.0 * cos_over(5.0),
        // Residual draw once the engine is stopped.
        SensorType::Current => 0.05 + 0.05 * cos_over(5.0).abs(),
        SensorType::Presence => 2.0 + 1.5 * sin_over(5.0),
        SensorType::Relay => 0.0,
    }
}

View File

@@ -0,0 +1,188 @@
//! Full closed-loop integration test:
//!
//! 1. Simulator emits a Presence sensor reading via T2 (`raw_value < 1.0`).
//! 2. Substrate's `automation_system` detects threshold crossing.
//! 3. Substrate opens a T3 bi-stream and writes a `Relay=stop` command.
//! 4. Simulator's `run_command_receiver` decodes the command, flips
//! `engine_running` to `false`, and writes the 39-byte ack back.
//!
//! Then we recover: send Presence > 1.0, observe the substrate dispatches
//! `Relay=resume`, and the simulator's flag flips back to `true`.
//!
//! This test stands up the *real* substrate transport — `accept_loop` plus
//! `drain_outbound_t3` — and substitutes an inline proxy for the ECS world's
//! `automation_system`, so a regression in the transport round-trip fails here.
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};
use anyhow::Result;
use simulator::client::SimulatorClient;
use simulator::commands::{new_engine_state, run_command_receiver};
use substrate::config::QuicConfig;
use substrate::transport::server::{accept_loop, bind_endpoint, new_connection_registry};
use substrate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender, T3OutboundSender};
use tokio::sync::mpsc;
use uuid::Uuid;
/// Resolves `name` inside the `certs/` directory that sits one level above
/// this crate's manifest directory.
fn cert_path(name: &str) -> PathBuf {
    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    path.push("..");
    path.push("certs");
    path.push(name);
    path
}
/// Builds the `QuicConfig` used by this test: loopback interface, port 0,
/// the given certificate and key paths (converted lossily to strings), fixed
/// channel capacities, and the synthetic T3 driver disabled.
fn loopback_config(cert: PathBuf, key: PathBuf) -> QuicConfig {
    let server_cert = cert.to_string_lossy().into_owned();
    let server_key = key.to_string_lossy().into_owned();
    QuicConfig {
        server_interface: String::from("127.0.0.1"),
        server_port: 0,
        server_cert,
        server_key,
        t1_capacity: 1024,
        t2_capacity: 512,
        t3_capacity: 256,
        synthetic_t3_rate_hz: 0.0,
    }
}
/// Inline stand-in for the substrate's `automation_system`, driven by
/// test-owned channels.
///
/// No Bevy `App` is built here; per the original note, the world tests
/// already cover `automation_system` with the `WorldPlugin`. This proxy
/// keeps the test on the *transport* round-trip: it reads T2 messages from
/// `t2_rx`, ignores everything except Presence readings, and pushes an
/// `OutboundT3` Relay command (sensor id 6) to `t3_out` whenever the desired
/// relay state changes. A reading below 1.0 asks for Relay=1.0; anything
/// else asks for Relay=0.0. The relay is assumed to start at 0.0, so the
/// first command is only sent once a reading drops below 1.0.
///
/// Returns when the T2 channel closes.
async fn substrate_automation_proxy(
    mut t2_rx: mpsc::Receiver<QuicMessage>,
    t3_out: T3OutboundSender,
) {
    // `true` while the last commanded relay value is 1.0.
    let mut relay_engaged = false;
    while let Some(msg) = t2_rx.recv().await {
        if msg.typ() == SensorType::Presence {
            let wants_engaged = msg.raw_value < 1.0;
            if wants_engaged != relay_engaged {
                relay_engaged = wants_engaged;
                // Best-effort send: a full or closed channel is ignored.
                let _ = t3_out.try_send(OutboundT3 {
                    target_device: msg.device_id,
                    sensor_id: 6,
                    raw_value: if wants_engaged { 1.0 } else { 0.0 },
                    sensor_type: SensorType::Relay.as_u8(),
                });
            }
        }
    }
}
/// Polls `predicate` every 10 ms until it returns `true` or `timeout` has
/// elapsed.
///
/// Returns `true` as soon as the predicate holds and `false` once the
/// timeout is reached. The elapsed time is checked before each evaluation,
/// so a zero timeout never calls the predicate.
async fn poll_for<F>(timeout: Duration, predicate: F) -> bool
where
    F: Fn() -> bool,
{
    let started = Instant::now();
    loop {
        if started.elapsed() >= timeout {
            return false;
        }
        if predicate() {
            return true;
        }
        tokio::time::sleep(Duration::from_millis(10)).await;
    }
}
/// Closed-loop round-trip: a Presence reading below 1.0 sent over T2 must
/// flip the simulator's `engine_running` flag to `false` via a T3 Relay
/// command, and a later reading above 1.0 must flip it back to `true`.
/// Each half is given 3 s.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn presence_drop_triggers_engine_stop_and_recovery_resumes_it() -> Result<()> {
simulator::install_crypto_provider();
let cert = cert_path("server.crt");
let key = cert_path("server.key");
let cfg = loopback_config(cert.clone(), key);
// --- substrate side ---
let endpoint = bind_endpoint(&cfg)?;
let server_addr: SocketAddr = endpoint.local_addr()?;
// The T1 receiver is kept alive but never read in this test.
let (t1_tx, _t1_rx) = mpsc::channel::<QuicMessage>(64);
let (t2_tx, t2_rx) = mpsc::channel::<QuicMessage>(64);
// One outbound-T3 channel with two senders. The receiver goes to
// accept_loop (where drain_outbound_t3 consumes it). One sender clone
// goes to accept_loop's synthetic-driver hook, which is disabled here by
// passing rate 0.0; the other sender feeds the inline automation proxy
// below.
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(64);
let registry = new_connection_registry();
let server_task = tokio::spawn(accept_loop(
endpoint,
T1Sender::new(t1_tx),
T2Sender::new(t2_tx),
registry,
t3_out_rx,
t3_out_tx.clone(),
0.0,
));
// Inline automation: read T2 Presence events, emit Relay commands.
let proxy = tokio::spawn(substrate_automation_proxy(
t2_rx,
T3OutboundSender::new(t3_out_tx),
));
// --- simulator side ---
let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?;
let engine_running: Arc<AtomicBool> = new_engine_state();
{
let conn = client.conn.clone();
let flag = engine_running.clone();
tokio::spawn(async move { run_command_receiver(conn, flag).await });
}
let device = Uuid::from_u128(0x1111_2222_3333_4444_5555_6666_7777_8888);
// Presence reading on sensor id 5 with a fixed synthetic timestamp.
let make_presence = |raw: f64, seq: u32| QuicMessage {
device_id: device,
sensor_id: 5,
raw_value: raw,
timestamp_us: 1_700_000_000_000_000 + u64::from(seq),
sequence_number: seq,
sensor_type: SensorType::Presence.as_u8(),
};
// 1) Engine starts running.
assert!(engine_running.load(Ordering::SeqCst), "engine should start in running state");
// 2) Push Presence < 1.0 via T2 → expect the substrate to dispatch
// Relay=stop and the simulator's receiver to flip the flag.
client.send_uni_stream(&[make_presence(0.5, 0)]).await?;
let stopped = poll_for(Duration::from_secs(3), || {
!engine_running.load(Ordering::SeqCst)
})
.await;
assert!(
stopped,
"engine_running did not flip to false within 3 s of the substrate \
receiving Presence=0.5; the substrate→simulator T3 path is broken"
);
// 3) Push Presence > 1.0 → expect Relay=resume → flag flips back to true.
client.send_uni_stream(&[make_presence(2.5, 1)]).await?;
let resumed = poll_for(Duration::from_secs(3), || {
engine_running.load(Ordering::SeqCst)
})
.await;
assert!(
resumed,
"engine_running did not flip back to true after Presence=2.5; \
recovery half of the closed loop is broken"
);
// Tear down: close the client, then stop the background tasks.
client.close().await;
proxy.abort();
server_task.abort();
Ok(())
}

View File

@@ -0,0 +1,151 @@
//! End-to-end T1 datagram test: spin up substrate's listener in-process with
//! channels the test owns, drive a `SimulatorClient` against it, and assert
//! the datagram lands in the T1 receiver decoded.
//!
//! Run with `cargo test -p simulator`.
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;
use anyhow::Result;
use simulator::client::SimulatorClient;
use substrate::config::QuicConfig;
use substrate::transport::server::{accept_loop, bind_endpoint, new_connection_registry};
use substrate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender};
use tokio::sync::mpsc;
use uuid::Uuid;
/// Resolves `name` inside the `certs/` directory that sits one level above
/// this crate's manifest directory.
fn cert_path(name: &str) -> PathBuf {
    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    path.push("..");
    path.push("certs");
    path.push(name);
    path
}
/// Builds the `QuicConfig` used by these tests: loopback interface, the
/// given certificate and key paths (converted lossily to strings), fixed
/// channel capacities, and the synthetic T3 driver disabled.
fn loopback_config(cert: PathBuf, key: PathBuf) -> QuicConfig {
    let server_cert = cert.to_string_lossy().into_owned();
    let server_key = key.to_string_lossy().into_owned();
    QuicConfig {
        server_interface: String::from("127.0.0.1"),
        // Port 0 lets the OS pick a free ephemeral port — tests can run in
        // parallel without colliding on a fixed bind.
        server_port: 0,
        server_cert,
        server_key,
        t1_capacity: 1024,
        t2_capacity: 512,
        t3_capacity: 256,
        synthetic_t3_rate_hz: 0.0,
    }
}
/// Sends one datagram from a `SimulatorClient` and asserts that the exact
/// same `QuicMessage` arrives on the test-owned T1 receiver within 2 s.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn t1_datagram_decoded_into_ecs_channel() -> Result<()> {
simulator::install_crypto_provider();
let cert = cert_path("server.crt");
let key = cert_path("server.key");
let cfg = loopback_config(cert.clone(), key);
// Bind the substrate's listener on an ephemeral port.
let endpoint = bind_endpoint(&cfg)?;
let server_addr: SocketAddr = endpoint.local_addr()?;
// Channels the test owns — gives us direct visibility into what the T1
// demux pushes into the ECS bridge.
let (t1_tx, mut t1_rx) = mpsc::channel(64);
// The T2 receiver is kept alive but never read in this test.
let (t2_tx, _t2_rx) = mpsc::channel(64);
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(64);
let registry = new_connection_registry();
let server_task = tokio::spawn(accept_loop(
endpoint,
T1Sender::new(t1_tx),
T2Sender::new(t2_tx),
registry,
t3_out_rx,
t3_out_tx,
0.0, // synthetic driver disabled
));
// Connect a client and send one datagram.
let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?;
let sent = QuicMessage {
device_id: Uuid::from_u128(0xdead_beef_cafe_f00d_1234_5678_90ab_cdef),
sensor_id: 7,
raw_value: 42.0,
timestamp_us: 1_700_000_000_000_001,
sequence_number: 1,
sensor_type: SensorType::Temperature.as_u8(),
};
client.send_datagram(&sent)?;
// Wait for the substrate's read_datagrams reader to push it into T1.
let received = tokio::time::timeout(Duration::from_secs(2), t1_rx.recv())
.await
.expect("did not observe T1 datagram within 2s")
.expect("T1 channel closed unexpectedly");
// The decoded message must equal what was sent, field for field.
assert_eq!(received, sent);
client.close().await;
server_task.abort();
Ok(())
}
/// Sends a burst of 32 datagrams back-to-back and asserts that all of them
/// arrive on the T1 receiver in send order, each within a 2 s timeout.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn t1_burst_preserves_order_and_count() -> Result<()> {
simulator::install_crypto_provider();
let cert = cert_path("server.crt");
let key = cert_path("server.key");
let cfg = loopback_config(cert.clone(), key);
let endpoint = bind_endpoint(&cfg)?;
let server_addr: SocketAddr = endpoint.local_addr()?;
// T1 capacity 64 ≥ burst size 32 so nothing is dropped under loopback.
let (t1_tx, mut t1_rx) = mpsc::channel(64);
let (t2_tx, _t2_rx) = mpsc::channel(8);
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(8);
let registry = new_connection_registry();
let server_task = tokio::spawn(accept_loop(
endpoint,
T1Sender::new(t1_tx),
T2Sender::new(t2_tx),
registry,
t3_out_rx,
t3_out_tx,
0.0,
));
let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?;
let device = Uuid::from_u128(0xa1a2_a3a4_b5b6_b7b8_c9ca_cbcc_cdce_cfd0);
const BURST: u32 = 32;
// Each message carries its index in both `raw_value` and
// `sequence_number`, so the receive loop can check both.
for seq in 0..BURST {
let msg = QuicMessage {
device_id: device,
sensor_id: 0,
raw_value: f64::from(seq),
timestamp_us: 1_700_000_000_000_000 + u64::from(seq),
sequence_number: seq,
sensor_type: SensorType::Generic.as_u8(),
};
client.send_datagram(&msg)?;
}
// Drain BURST messages with a per-message timeout. Loopback shouldn't
// reorder QUIC datagrams within a single connection.
for expected_seq in 0..BURST {
let msg = tokio::time::timeout(Duration::from_secs(2), t1_rx.recv())
.await
.unwrap_or_else(|_| panic!("missed datagram seq={expected_seq}"))
.expect("T1 channel closed");
assert_eq!(msg.sequence_number, expected_seq);
assert_eq!(msg.device_id, device);
assert_eq!(msg.raw_value, f64::from(expected_seq));
}
client.close().await;
server_task.abort();
Ok(())
}

View File

@@ -0,0 +1,175 @@
//! End-to-end T2 (unidirectional stream) tests. Mirrors the T1 harness:
//! spin up substrate's listener with channels owned by the test, drive a
//! `SimulatorClient` against it, assert what arrives on the T2 receiver.
//!
//! Run with `cargo test -p simulator`.
use std::collections::HashMap;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;
use anyhow::Result;
use simulator::client::SimulatorClient;
use substrate::config::QuicConfig;
use substrate::transport::server::{accept_loop, bind_endpoint, new_connection_registry};
use substrate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender};
use tokio::sync::mpsc;
use uuid::Uuid;
/// Resolves `name` inside the `certs/` directory that sits one level above
/// this crate's manifest directory.
fn cert_path(name: &str) -> PathBuf {
    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    path.push("..");
    path.push("certs");
    path.push(name);
    path
}
/// Builds the `QuicConfig` used by these tests: loopback interface, port 0,
/// the given certificate and key paths (converted lossily to strings), fixed
/// channel capacities, and the synthetic T3 driver disabled.
fn loopback_config(cert: PathBuf, key: PathBuf) -> QuicConfig {
    let server_cert = cert.to_string_lossy().into_owned();
    let server_key = key.to_string_lossy().into_owned();
    QuicConfig {
        server_interface: String::from("127.0.0.1"),
        server_port: 0,
        server_cert,
        server_key,
        t1_capacity: 1024,
        t2_capacity: 512,
        t3_capacity: 256,
        synthetic_t3_rate_hz: 0.0,
    }
}
/// Sends 10 messages over a single uni stream and asserts that they arrive
/// on the T2 receiver unchanged and in send order, each within 2 s.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn t2_single_stream_preserves_order() -> Result<()> {
simulator::install_crypto_provider();
let cert = cert_path("server.crt");
let key = cert_path("server.key");
let cfg = loopback_config(cert.clone(), key);
let endpoint = bind_endpoint(&cfg)?;
let server_addr: SocketAddr = endpoint.local_addr()?;
// The T1 receiver is kept alive but never read in this test.
let (t1_tx, _t1_rx) = mpsc::channel(64);
let (t2_tx, mut t2_rx) = mpsc::channel(64);
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(64);
let registry = new_connection_registry();
let server_task = tokio::spawn(accept_loop(
endpoint,
T1Sender::new(t1_tx),
T2Sender::new(t2_tx),
registry,
t3_out_rx,
t3_out_tx,
0.0,
));
let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?;
let device = Uuid::from_u128(0x0011_2233_4455_6677_8899_aabb_ccdd_eeff);
const N: u32 = 10;
let msgs: Vec<QuicMessage> = (0..N)
.map(|i| QuicMessage {
device_id: device,
sensor_id: 1,
raw_value: f64::from(i),
timestamp_us: 1_700_000_000_000_000 + u64::from(i),
sequence_number: i,
sensor_type: SensorType::Pressure.as_u8(),
})
.collect();
// All N messages are handed to a single `send_uni_stream` call.
client.send_uni_stream(&msgs).await?;
// Receive in order and compare each message against what was sent.
for expected in &msgs {
let received = tokio::time::timeout(Duration::from_secs(2), t2_rx.recv())
.await
.expect("missed T2 message")
.expect("T2 channel closed unexpectedly");
assert_eq!(received, *expected);
}
client.close().await;
server_task.abort();
Ok(())
}
/// Opens four uni streams concurrently (one per device, eight messages
/// each) and asserts that every device's messages arrive complete and in
/// order, while allowing messages from different streams to interleave.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn t2_concurrent_streams_each_internally_ordered() -> Result<()> {
simulator::install_crypto_provider();
let cert = cert_path("server.crt");
let key = cert_path("server.key");
let cfg = loopback_config(cert.clone(), key);
let endpoint = bind_endpoint(&cfg)?;
let server_addr: SocketAddr = endpoint.local_addr()?;
let (t1_tx, _t1_rx) = mpsc::channel(64);
// Capacity 256 comfortably holds the 32 messages sent below.
let (t2_tx, mut t2_rx) = mpsc::channel(256);
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(64);
let registry = new_connection_registry();
let server_task = tokio::spawn(accept_loop(
endpoint,
T1Sender::new(t1_tx),
T2Sender::new(t2_tx),
registry,
t3_out_rx,
t3_out_tx,
0.0,
));
let client = SimulatorClient::connect(server_addr, "localhost", &cert).await?;
// 4 devices × 8 messages each on independent uni streams. Cross-stream
// ordering may interleave; per-stream ordering must be strict.
const DEVICES: usize = 4;
const PER_DEVICE: u32 = 8;
let device_ids: Vec<Uuid> = (0..DEVICES).map(|_| Uuid::new_v4()).collect();
let mut handles = Vec::with_capacity(DEVICES);
for &device in &device_ids {
let conn = client.conn.clone();
handles.push(tokio::spawn(async move {
let msgs: Vec<QuicMessage> = (0..PER_DEVICE)
.map(|i| QuicMessage {
device_id: device,
sensor_id: 0,
raw_value: f64::from(i),
timestamp_us: 1_700_000_000_000_000 + u64::from(i),
sequence_number: i,
sensor_type: SensorType::Generic.as_u8(),
})
.collect();
// Use the connection directly so each task owns its own stream
// — same wire pattern as `SimulatorClient::send_uni_stream`.
let mut send = conn.open_uni().await.expect("open_uni");
for m in &msgs {
send.write_all(&m.to_bytes()).await.expect("write_all");
}
send.finish().expect("finish");
}));
}
// Wait for every sender task; a panic inside one surfaces here as an error.
for h in handles {
h.await?;
}
// Drain DEVICES × PER_DEVICE messages, group by device, assert per-device
// sequence numbers are strictly increasing from 0.
let total = DEVICES * PER_DEVICE as usize;
let mut by_device: HashMap<Uuid, Vec<u32>> = HashMap::new();
for _ in 0..total {
let msg = tokio::time::timeout(Duration::from_secs(2), t2_rx.recv())
.await
.expect("missed T2 message")
.expect("T2 channel closed unexpectedly");
by_device.entry(msg.device_id).or_default().push(msg.sequence_number);
}
assert_eq!(by_device.len(), DEVICES, "expected one entry per device");
for (dev, seqs) in &by_device {
let expected: Vec<u32> = (0..PER_DEVICE).collect();
assert_eq!(seqs, &expected, "out-of-order or missing sequence for {dev}");
}
client.close().await;
server_task.abort();
Ok(())
}

View File

@@ -4,3 +4,19 @@ version = "0.1.0"
edition = "2024"
[dependencies]
bevy = { version = "0.18", default-features = false, features = ["bevy_state"] }
thiserror = "2"
anyhow = "1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
quinn = { version = "0.11" }
rustls = { version = "0.23" }
rustls-pemfile = "2"
rustls-pki-types = "1"
tokio = { version = "1", features = ["full"] }
uuid = { version = "1.23", features = ["v4"] }
figment = { version = "0.10", features = ["toml", "env"] }
serde = { version = "1", features = ["derive"] }
metrics = "0.24"
metrics-exporter-prometheus = "0.17"
memory-stats = "1"

81
substrate/src/config.rs Normal file
View File

@@ -0,0 +1,81 @@
use bevy::prelude::Resource;
use figment::Figment;
use figment::providers::{Env, Format, Serialized, Toml};
use serde::{Deserialize, Serialize};
/// Top-level application configuration, stored in the ECS as a Bevy
/// resource. Values come from `AppConfig::load`, which layers compiled-in
/// defaults, a TOML file and `APP_`-prefixed environment variables.
#[derive(Debug, Resource, Serialize, Deserialize)]
pub struct AppConfig {
/// QUIC listener settings and bridge-channel capacities.
pub network: QuicConfig,
/// Simulation settings.
pub simulation: SimulationConfig,
/// Metrics-exporter settings.
pub observability: ObservabilityConfig,
}
/// Simulation settings (the `simulation` section of the configuration).
#[derive(Debug, Serialize, Deserialize)]
pub struct SimulationConfig {
/// Tick rate in Hz (default 60).
/// NOTE(review): presumably the target ECS update rate — the consumer is
/// not visible in this file; confirm.
pub tick_rate_hz: u32,
/// Entity count limit (default 10000).
/// NOTE(review): presumably an upper bound enforced by the world systems —
/// confirm against the consumer.
pub max_entities: usize,
}
/// QUIC listener settings plus the capacities of the bounded channels that
/// bridge the tokio transport side and the ECS (the `network` section of the
/// configuration).
#[derive(Debug, Serialize, Deserialize)]
pub struct QuicConfig {
/// UDP port the QUIC listener binds to.
pub server_port: u16,
/// Interface address the listener binds to; joined with `server_port` to
/// form the bind address.
pub server_interface: String,
/// Path to the PEM file holding the server certificate chain.
pub server_cert: String,
/// Path to the PEM file holding the server private key.
pub server_key: String,
/// Capacity of the bounded T1 (datagram) bridge channel.
pub t1_capacity: usize,
/// Capacity of the bounded T2 (uni-stream) bridge channel.
pub t2_capacity: usize,
/// Capacity of the bounded outbound-T3 (actuator command) channel.
pub t3_capacity: usize,
/// Bench-only knob. When > 0, the substrate spawns a synthetic T3
/// driver that issues toggling Relay commands to every connected device
/// at the configured rate, exercising the real outbound code path.
/// Off by default (0.0) in production. Override via env:
/// `APP_NETWORK__SYNTHETIC_T3_RATE_HZ=100`.
///
/// `#[serde(default)]` lets configuration sources omit this key entirely.
#[serde(default)]
pub synthetic_t3_rate_hz: f64,
}
/// Metrics-exporter settings (the `observability` section of the
/// configuration).
#[derive(Debug, Serialize, Deserialize)]
pub struct ObservabilityConfig {
/// When true, install the Prometheus exporter at startup. Disable for
/// environments where the metrics port collides or scraping is undesired.
pub metrics_enabled: bool,
/// Bind address for the `/metrics` HTTP listener, in `ip:port` form (it is
/// parsed as a `SocketAddr` at startup).
pub metrics_listen: String,
}
/// Compiled-in defaults; they form the lowest-priority layer in
/// `AppConfig::load`.
impl Default for AppConfig {
    fn default() -> Self {
        // QUIC listener on every interface, port 9000, with the dev
        // certificate pair under `certs/`. The synthetic T3 driver is off.
        let network = QuicConfig {
            server_port: 9000,
            server_interface: String::from("0.0.0.0"),
            server_cert: String::from("certs/server.crt"),
            server_key: String::from("certs/server.key"),
            t1_capacity: 1024,
            t2_capacity: 512,
            t3_capacity: 256,
            synthetic_t3_rate_hz: 0.0,
        };

        let simulation = SimulationConfig {
            tick_rate_hz: 60,
            max_entities: 10_000,
        };

        // Metrics are exported by default on port 9100, all interfaces.
        let observability = ObservabilityConfig {
            metrics_enabled: true,
            metrics_listen: String::from("0.0.0.0:9100"),
        };

        Self {
            network,
            simulation,
            observability,
        }
    }
}
impl AppConfig {
    /// Builds the effective configuration from three layers, each one
    /// overriding the previous:
    ///
    /// 1. the compiled-in [`Default`] values,
    /// 2. the TOML file at `config_file`,
    /// 3. environment variables prefixed with `APP_`, where `__` separates
    ///    nesting levels (`APP_NETWORK__SERVER_PORT=9001` overrides
    ///    `network.server_port`).
    ///
    /// # Errors
    ///
    /// Returns the `figment::Error` produced when the merged layers cannot be
    /// extracted into an `AppConfig`.
    pub fn load(config_file: &str) -> Result<Self, figment::Error> {
        let compiled_defaults = Serialized::defaults(Self::default());
        let file_layer = Toml::file(config_file);
        let env_layer = Env::prefixed("APP_").split("__");

        let layered = Figment::new()
            .merge(compiled_defaults)
            .merge(file_layer)
            .merge(env_layer);
        layered.extract()
    }
}

4
substrate/src/lib.rs Normal file
View File

@@ -0,0 +1,4 @@
pub mod config;
pub mod observability;
pub mod transport;
pub mod world;

View File

@@ -1,3 +1,34 @@
use bevy::prelude::*;
use tracing_subscriber::EnvFilter;
use substrate::config::AppConfig;
use substrate::observability::ObservabilityPlugin;
use substrate::transport;
use substrate::world::WorldPlugin;
/// Process entry point: sets up logging and the TLS crypto provider, loads
/// the configuration, then hands control to the Bevy app.
fn main() {
    // Log filter comes from the environment when set; otherwise default to
    // `info`.
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
        )
        .init();

    // rustls 0.23 requires an explicit default crypto provider. Quinn's
    // ServerConfig::with_single_cert otherwise panics at first use.
    rustls::crypto::aws_lc_rs::default_provider()
        .install_default()
        .expect("install rustls default crypto provider");

    let config = AppConfig::load("config.toml").expect("Failed to load config");
    tracing::info!(?config, "substrate starting");

    // Plugin order matters: EcsQuicTransportPlugin inserts the TokioHandle
    // resource ObservabilityPlugin reads in its `build()`. Both plugins also
    // read `AppConfig`, so it is inserted before any plugin is added.
    App::new()
        .insert_resource(config)
        .add_plugins(MinimalPlugins)
        .add_plugins(transport::ecs::EcsQuicTransportPlugin)
        .add_plugins(WorldPlugin)
        .add_plugins(ObservabilityPlugin)
        .run();
}

View File

@@ -0,0 +1,116 @@
//! M5 — Prometheus-format `/metrics` exporter installation and counter
//! pre-registration.
//!
//! Counters and histograms are emitted from the demux path
//! ([`crate::transport::server`]) and the world systems
//! ([`crate::world::ingest_system`], [`crate::world::simulation_system`],
//! [`crate::world::export_system`]). This module's only job is:
//!
//! 1. Install the global metrics recorder + HTTP listener on the existing
//! tokio runtime, once at startup.
//! 2. Pre-register every counter at value 0 so panels render "0" rather than
//! "No data" before the first event of a given kind fires.
//!
//! ## Runtime telemetry
//!
//! - `substrate_received_total{tier=t1|t2|t3}` — counter
//! - `substrate_dropped_total{tier=t1}` — counter (T1 lossy)
//! - `substrate_decode_errors_total{tier=t1|t2|t3}` — counter
//! - `substrate_t3_no_handler_total` — counter
//! - `substrate_latency_us{tier=t1|t2|t3}` — histogram
//! - `substrate_tick_hz` — gauge
//! - `substrate_entities` — gauge
//! - `substrate_channel_depth{tier=t1|t2|t3}` — gauge
//! - `substrate_channel_capacity{tier=t1|t2|t3}` — gauge
//! - `substrate_rss_bytes` — gauge
//!
//! ## Digital-twin surface (operator dashboard)
//!
//! - `sensor_aggregate{type=…, stat=count|mean|min|max}` — gauge
//! - `substrate_threshold_crossings_total{type, direction}` — counter
use std::net::SocketAddr;
use bevy::prelude::*;
use metrics::counter;
use metrics_exporter_prometheus::PrometheusBuilder;
use crate::config::AppConfig;
use crate::transport::SensorType;
use crate::transport::ecs::TokioHandle;
pub struct ObservabilityPlugin;
impl Plugin for ObservabilityPlugin {
fn build(&self, app: &mut App) {
let config = app
.world()
.get_resource::<AppConfig>()
.expect("AppConfig must be inserted before ObservabilityPlugin");
if !config.observability.metrics_enabled {
tracing::info!("metrics exporter disabled by config");
return;
}
let listen: SocketAddr = config
.observability
.metrics_listen
.parse()
.expect("invalid metrics_listen address in config");
let runtime_handle = app
.world()
.get_resource::<TokioHandle>()
.expect("TokioHandle must be inserted before ObservabilityPlugin (load order: transport plugin first)")
.0
.clone();
// PrometheusBuilder::install spawns the HTTP listener via tokio::spawn,
// which requires being inside a runtime context.
let _guard = runtime_handle.enter();
PrometheusBuilder::new()
.with_http_listener(listen)
.install()
.expect("install prometheus exporter");
drop(_guard);
tracing::info!(?listen, "metrics exporter installed");
pre_register_counters();
}
}
/// Pre-register every counter at value 0 so Grafana sees a series to plot
/// even before the first event of that kind. Without this, the Prometheus
/// exporter omits any counter that has never been incremented, and panels
/// render "No data" — confusing when the metric exists, the counter is just
/// genuinely zero (e.g., `substrate_t3_no_handler_total` in normal operation).
///
/// `increment(0)` registers the series without changing its value.
fn pre_register_counters() {
    for tier in ["t1", "t2", "t3"] {
        counter!("substrate_received_total", "tier" => tier).increment(0);
        counter!("substrate_decode_errors_total", "tier" => tier).increment(0);
    }
    counter!("substrate_dropped_total", "tier" => "t1").increment(0);
    counter!("substrate_t3_no_handler_total").increment(0);
    // Emitted by the outbound-T3 path when the command channel rejects an
    // enqueue; registered here so it also starts at a visible zero.
    counter!("substrate_t3_outbound_dropped_total").increment(0);

    // Threshold crossings — bounded cardinality (listed types × 2
    // directions), pre-registered so dashboard panels show "0" instead of
    // "No data".
    // NOTE(review): `SensorType::Presence` and `SensorType::Relay` are not
    // listed; presumably they never produce threshold crossings — confirm.
    for t in [
        SensorType::Generic,
        SensorType::Temperature,
        SensorType::Humidity,
        SensorType::Pressure,
        SensorType::Voltage,
        SensorType::Current,
    ] {
        for direction in ["up", "down"] {
            counter!(
                "substrate_threshold_crossings_total",
                "type" => t.label_str(),
                "direction" => direction
            )
            .increment(0);
        }
    }
}

View File

@@ -0,0 +1,153 @@
use std::sync::Mutex;
use bevy::prelude::*;
use bevy::state::app::StatesPlugin;
use tokio::runtime::Handle;
use tokio::sync::mpsc;
use crate::config::AppConfig;
use crate::transport::{OutboundT3, QuicMessage, T1Sender, T2Sender, T3OutboundSender};
use crate::transport::server::{ConnectionRegistry, accept_loop, bind_endpoint, new_connection_registry};
use crate::transport::state::ServerState;
/// Bevy plugin that wires the QUIC transport into the ECS: it creates the
/// tier bridge channels, starts the tokio runtime on its own thread, inserts
/// the bridge resources, and registers the system that brings the listener
/// up on entering `ServerState::Starting`.
pub struct EcsQuicTransportPlugin;
/// Receive halves of the inbound tier channels (T1 datagrams, T2 uni
/// streams). The `world` module's ingest system is the sole reader.
/// T3 is substrate-initiated and lives on the tokio side via the outbound
/// drain task — no inbound T3 receiver exists here.
///
/// NOTE(review): each receiver sits behind a `Mutex`, presumably so a system
/// that only has shared access to the resource can still obtain the `&mut`
/// a receive needs — confirm against the ingest system.
#[derive(Resource)]
pub(crate) struct BridgeReceivers {
/// Receiver for T1 (datagram) messages.
pub(crate) t1: Mutex<mpsc::Receiver<QuicMessage>>,
/// Receiver for T2 (uni-stream) messages.
pub(crate) t2: Mutex<mpsc::Receiver<QuicMessage>>,
}
/// Send halves of the bridge channels. The T1/T2 senders are cloned into the
/// QUIC accept loop when the listener starts; `t3_out` is the ECS-side entry
/// point for actuator commands.
#[derive(Resource, Clone)]
pub(crate) struct BridgeSenders {
/// Lossy sender for T1 datagram messages.
pub(crate) t1: T1Sender,
/// Backpressuring sender for T2 uni-stream messages.
pub(crate) t2: T2Sender,
/// Outbound actuator-command sender — `automation_system` enqueues
/// `OutboundT3` items here; the tokio drain task routes them to the
/// originating device's connection.
pub(crate) t3_out: T3OutboundSender,
}
/// Holds the receiver half of the outbound-T3 channel until the listener
/// starts, plus the connection registry and a sender clone for the optional
/// synthetic T3 driver. All pass into `accept_loop` once at the
/// `Starting → Started` transition.
#[derive(Resource)]
pub(crate) struct OutboundT3Plumbing {
/// Outbound-T3 receiver. Wrapped in `Mutex<Option<…>>` so the start-up
/// system can `take()` it exactly once through shared access; it is `None`
/// afterwards.
pub(crate) rx: Mutex<Option<mpsc::Receiver<OutboundT3>>>,
/// Raw sender for the same channel, handed to `accept_loop` for the
/// synthetic T3 driver.
pub(crate) tx: mpsc::Sender<OutboundT3>,
/// Shared device → connection routing table.
pub(crate) registry: ConnectionRegistry,
}
/// Handle to the tokio runtime that the transport plugin starts on its
/// dedicated `quic-runtime` thread. ECS-side code uses it to enter the
/// runtime context and to spawn tasks onto that runtime.
#[derive(Resource, Clone)]
pub(crate) struct TokioHandle(pub(crate) Handle);
/// Binds the QUIC listener described by `AppConfig`, spawns the accept loop
/// on the tokio runtime, and requests the transition to
/// `ServerState::Started`. Scheduled once, on `OnEnter(ServerState::Starting)`.
///
/// # Panics
///
/// Panics if the endpoint cannot be bound, or if the outbound-T3 receiver has
/// already been taken by an earlier run of this system.
fn start_quic_server(
    config: Res<AppConfig>,
    senders: Res<BridgeSenders>,
    runtime: Res<TokioHandle>,
    outbound: Res<OutboundT3Plumbing>,
    mut next: ResMut<NextState<ServerState>>,
) {
    tracing::info!("entering ServerState::Starting — bringing up QUIC listener");

    let handle = &runtime.0;

    // `Endpoint::server` is synchronous but looks up the current tokio
    // runtime, so the bind happens inside an entered runtime context. The
    // context guard is released as soon as the endpoint exists.
    let endpoint = {
        let _runtime_ctx = handle.enter();
        bind_endpoint(&config.network).expect("failed to bind QUIC endpoint")
    };
    tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC listener bound");

    // The receiver moves to the tokio side for good; accept_loop owns it for
    // the rest of the listener's life.
    let outbound_rx = {
        let mut slot = outbound.rx.lock().unwrap();
        slot.take().expect("OutboundT3 receiver consumed twice")
    };

    // The registry is an `Arc`, so this clone shares the same map with the
    // ECS-side resource.
    let accept = accept_loop(
        endpoint,
        senders.t1.clone(),
        senders.t2.clone(),
        outbound.registry.clone(),
        outbound_rx,
        outbound.tx.clone(),
        config.network.synthetic_t3_rate_hz,
    );
    handle.spawn(accept);

    next.set(ServerState::Started);
    tracing::info!("ServerState::Started");
}
impl Plugin for EcsQuicTransportPlugin {
    /// Creates the bridge channels, starts the tokio runtime on a dedicated
    /// thread, and inserts every transport resource into the app.
    ///
    /// `AppConfig` must already be inserted; the channel capacities are read
    /// from its `network` section.
    ///
    /// # Panics
    ///
    /// Panics if `AppConfig` is missing, if the runtime thread or the runtime
    /// itself cannot be created, or if the runtime handle cannot be handed
    /// back to this thread.
    fn build(&self, app: &mut App) {
        // Only a read is needed, so borrow the world immutably.
        let network = &app.world().resource::<AppConfig>().network;

        // Inbound bridge: T1 datagrams + T2 uni streams from devices into the
        // ECS PreUpdate ingest system (in the `world` module).
        let (t1_tx, t1_rx) = mpsc::channel::<QuicMessage>(network.t1_capacity);
        let (t2_tx, t2_rx) = mpsc::channel::<QuicMessage>(network.t2_capacity);

        // Outbound-T3: substrate → device actuator-command path. Capacity
        // budget tracks automation cadence, not per-sample throughput.
        let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(network.t3_capacity);
        let registry = new_connection_registry();

        // Spawn a tokio runtime on a dedicated OS thread, ship its Handle back
        // to the ECS, and keep the runtime alive for the lifetime of the app
        // by parking on `pending()`.
        let (handle_tx, handle_rx) = std::sync::mpsc::sync_channel::<Handle>(1);
        std::thread::Builder::new()
            .name("quic-runtime".to_string())
            .spawn(move || {
                let rt = tokio::runtime::Builder::new_multi_thread()
                    .worker_threads(2)
                    .enable_all()
                    .thread_name("quic-worker")
                    .build()
                    .expect("build tokio runtime");
                handle_tx
                    .send(rt.handle().clone())
                    .expect("send tokio Handle to ECS");
                rt.block_on(std::future::pending::<()>());
            })
            .expect("spawn quic-runtime thread");
        // Blocks until the runtime thread has built the runtime and sent its
        // handle.
        let handle = handle_rx.recv().expect("receive tokio Handle");

        // Bevy 0.18 split state machinery into its own plugin; under
        // MinimalPlugins it isn't installed by default.
        app.add_plugins(StatesPlugin)
            .init_state::<ServerState>()
            .insert_resource(TokioHandle(handle))
            .insert_resource(BridgeSenders {
                t1: T1Sender::new(t1_tx),
                t2: T2Sender::new(t2_tx),
                t3_out: T3OutboundSender::new(t3_out_tx.clone()),
            })
            .insert_resource(BridgeReceivers {
                t1: Mutex::new(t1_rx),
                t2: Mutex::new(t2_rx),
            })
            .insert_resource(OutboundT3Plumbing {
                rx: Mutex::new(Some(t3_out_rx)),
                tx: t3_out_tx,
                registry,
            })
            .add_systems(OnEnter(ServerState::Starting), start_quic_server);
    }
}

View File

@@ -0,0 +1,328 @@
pub mod ecs;
pub mod server;
pub mod state;
use tokio::sync::mpsc;
/// Kind of quantity a sample carries. The discriminant travels on the wire in
/// `QuicMessage::sensor_type`, telling the substrate (and any downstream
/// dashboard) which units, range and visualisation apply to `raw_value`.
///
/// Forward compatibility: discriminants this build does not know decode as
/// `Generic`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[repr(u8)]
pub enum SensorType {
    #[default]
    Generic = 0,
    Temperature = 1,
    Humidity = 2,
    Pressure = 3,
    Voltage = 4,
    Current = 5,
    Presence = 6,
    Relay = 7,
}

impl SensorType {
    /// Decodes a wire discriminant. Any value without a matching variant
    /// yields `Generic`.
    pub fn from_u8(b: u8) -> Self {
        // Indexed by discriminant: slot `i` holds the variant whose value is
        // `i`, so a lookup past the end means "unknown".
        const BY_DISCRIMINANT: [SensorType; 8] = [
            SensorType::Generic,
            SensorType::Temperature,
            SensorType::Humidity,
            SensorType::Pressure,
            SensorType::Voltage,
            SensorType::Current,
            SensorType::Presence,
            SensorType::Relay,
        ];
        BY_DISCRIMINANT
            .get(usize::from(b))
            .copied()
            .unwrap_or(SensorType::Generic)
    }

    /// Wire discriminant of this variant.
    pub fn as_u8(self) -> u8 {
        self as u8
    }

    /// Lowercase label used as a Prometheus label value.
    pub fn label_str(self) -> &'static str {
        let (label, _unit) = self.label_and_unit();
        label
    }

    /// SI / engineering unit string for Grafana axis labels.
    pub fn unit_str(self) -> &'static str {
        let (_label, unit) = self.label_and_unit();
        unit
    }

    /// Single table behind `label_str` and `unit_str`: `(label, unit)`.
    fn label_and_unit(self) -> (&'static str, &'static str) {
        match self {
            Self::Generic => ("generic", ""),
            Self::Temperature => ("temperature", "°C"),
            Self::Humidity => ("humidity", "%"),
            Self::Pressure => ("pressure", "hPa"),
            Self::Voltage => ("voltage", "V"),
            Self::Current => ("current", "A"),
            Self::Presence => ("presence", "s"),
            Self::Relay => ("relay", "state"),
        }
    }
}
/// One sample (T1/T2 sensor reading or T3 actuator command/ack) on the wire.
///
/// Fixed 39-byte little-endian layout — same on x86_64 and aarch64 (the two
/// evaluation hosts), so encode/decode is effectively a memcpy.
///
/// ```text
/// offset size field
/// ------ ---- --------------------------
/// 0 16 device_id (UUID)
/// 16 2 sensor_id (u16)
/// 18 8 raw_value (f64)
/// 26 8 timestamp_us (u64)
/// 34 4 sequence_number (u32)
/// 38 1 sensor_type (u8 — `SensorType` discriminant)
/// ```
///
/// Field semantics:
/// - `device_id` — UUID of the originating device (or target, for T3 commands).
/// - `sensor_id` — logical sensor/actuator on that device (per-device index).
/// - `raw_value` — sensor reading (T1/T2) or actuator setpoint/feedback (T3).
/// - `timestamp_us` — capture time on the device clock for T1/T2; server-side
/// ack time on T3 replies.
/// - `sequence_number` — monotonic counter per `(device_id, sensor_id)` for
/// T1/T2; correlation id linking T3 command and ack.
/// - `sensor_type` — `SensorType` discriminant, decoded via `SensorType::from_u8`.
#[derive(Debug, Clone, Default, Copy, PartialEq)]
pub struct QuicMessage {
/// Bytes 0..16 on the wire.
pub device_id: uuid::Uuid,
/// Bytes 16..18, little-endian.
pub sensor_id: u16,
/// Bytes 18..26, little-endian.
pub raw_value: f64,
/// Bytes 26..34, little-endian.
pub timestamp_us: u64,
/// Bytes 34..38, little-endian.
pub sequence_number: u32,
/// Byte 38; kept as the raw discriminant — use `typ()` for the typed enum.
pub sensor_type: u8,
}
/// Errors from encoding or decoding a `QuicMessage`.
#[derive(Debug, thiserror::Error)]
pub enum WireError {
/// The supplied buffer is not exactly `QuicMessage::WIRE_SIZE` bytes long.
/// `expected` is the required length, `got` the length actually supplied.
#[error("expected exactly {expected} bytes, got {got}")]
BadLength { expected: usize, got: usize },
}
impl QuicMessage {
    /// Bytes on the wire — fixed-size, no length prefix.
    pub const WIRE_SIZE: usize = 39;

    /// Writes the little-endian wire form of `self` into `buf`.
    ///
    /// # Errors
    ///
    /// Returns `WireError::BadLength` unless `buf` is exactly `WIRE_SIZE`
    /// bytes long; nothing is written in that case.
    pub fn encode_to(&self, buf: &mut [u8]) -> Result<(), WireError> {
        let got = buf.len();
        if got != Self::WIRE_SIZE {
            return Err(WireError::BadLength {
                expected: Self::WIRE_SIZE,
                got,
            });
        }

        // Carve the buffer into its fields in wire order: 16 + 2 + 8 + 8 + 4 + 1.
        let (device_id, rest) = buf.split_at_mut(16);
        let (sensor_id, rest) = rest.split_at_mut(2);
        let (raw_value, rest) = rest.split_at_mut(8);
        let (timestamp_us, rest) = rest.split_at_mut(8);
        let (sequence_number, sensor_type) = rest.split_at_mut(4);

        device_id.copy_from_slice(self.device_id.as_bytes());
        sensor_id.copy_from_slice(&self.sensor_id.to_le_bytes());
        raw_value.copy_from_slice(&self.raw_value.to_le_bytes());
        timestamp_us.copy_from_slice(&self.timestamp_us.to_le_bytes());
        sequence_number.copy_from_slice(&self.sequence_number.to_le_bytes());
        sensor_type[0] = self.sensor_type;
        Ok(())
    }

    /// Returns the wire form as an owned, exactly-sized array.
    pub fn to_bytes(&self) -> [u8; Self::WIRE_SIZE] {
        let mut wire = [0u8; Self::WIRE_SIZE];
        self.encode_to(&mut wire)
            .expect("WIRE_SIZE buffer is exactly sized");
        wire
    }

    /// Parses one message from `buf`.
    ///
    /// # Errors
    ///
    /// Returns `WireError::BadLength` unless `buf` is exactly `WIRE_SIZE`
    /// bytes long.
    pub fn decode(buf: &[u8]) -> Result<Self, WireError> {
        let got = buf.len();
        if got != Self::WIRE_SIZE {
            return Err(WireError::BadLength {
                expected: Self::WIRE_SIZE,
                got,
            });
        }

        // Same field order as `encode_to`. The length check above makes every
        // split land in bounds and every fixed-size conversion succeed.
        let (device_id, rest) = buf.split_at(16);
        let (sensor_id, rest) = rest.split_at(2);
        let (raw_value, rest) = rest.split_at(8);
        let (timestamp_us, rest) = rest.split_at(8);
        let (sequence_number, sensor_type) = rest.split_at(4);

        Ok(Self {
            device_id: uuid::Uuid::from_bytes(device_id.try_into().unwrap()),
            sensor_id: u16::from_le_bytes(sensor_id.try_into().unwrap()),
            raw_value: f64::from_le_bytes(raw_value.try_into().unwrap()),
            timestamp_us: u64::from_le_bytes(timestamp_us.try_into().unwrap()),
            sequence_number: u32::from_le_bytes(sequence_number.try_into().unwrap()),
            sensor_type: sensor_type[0],
        })
    }

    /// Convenience accessor — decodes `sensor_type` to the typed enum.
    pub fn typ(&self) -> SensorType {
        SensorType::from_u8(self.sensor_type)
    }
}
// --- Per-tier bridge senders -----------------------------------------------
//
// Three newtypes encode the paper's tier semantics into the type system so
// the demux can't mix them up:
//
// * T1 (datagrams) — lossy; `try_send` drops on full
// * T2 (uni streams) — reliable, ordered; `send().await` backpressures
// * T3 (bi streams) — substrate-initiated actuator commands; non-blocking
//   `try_send` enqueue that reports "full" vs "closed" to the caller
/// Tier 1 — high-frequency telemetry over QUIC datagrams. A full channel
/// drops the message instead of waiting.
#[derive(Clone)]
pub struct T1Sender {
    inner: mpsc::Sender<QuicMessage>,
}

impl T1Sender {
    /// Wraps the send half of the T1 bridge channel.
    pub fn new(inner: mpsc::Sender<QuicMessage>) -> Self {
        Self { inner }
    }

    /// Returns `true` if queued, `false` if dropped (channel full or closed).
    pub fn send_lossy(&self, msg: QuicMessage) -> bool {
        match self.inner.try_send(msg) {
            Ok(()) => true,
            Err(_) => false,
        }
    }

    /// Currently queued messages — used for channel-depth gauges. Computed as
    /// total capacity minus free slots.
    pub fn depth(&self) -> usize {
        let total = self.inner.max_capacity();
        let free = self.inner.capacity();
        total.saturating_sub(free)
    }

    /// Total capacity the channel was created with.
    pub fn capacity(&self) -> usize {
        self.inner.max_capacity()
    }
}
/// Tier 2 — ordered events over a QUIC unidirectional stream. A full channel
/// makes the sender wait rather than drop.
#[derive(Clone)]
pub struct T2Sender {
    inner: mpsc::Sender<QuicMessage>,
}

impl T2Sender {
    /// Wraps the send half of the T2 bridge channel.
    pub fn new(inner: mpsc::Sender<QuicMessage>) -> Self {
        Self { inner }
    }

    /// Queues `msg`, waiting for a free slot when the channel is full.
    ///
    /// # Errors
    ///
    /// Returns the message back inside `SendError` when the channel is closed.
    pub async fn send(
        &self,
        msg: QuicMessage,
    ) -> Result<(), mpsc::error::SendError<QuicMessage>> {
        let queued = self.inner.send(msg);
        queued.await
    }

    /// Currently queued messages: total capacity minus free slots.
    pub fn depth(&self) -> usize {
        let total = self.inner.max_capacity();
        let free = self.inner.capacity();
        total.saturating_sub(free)
    }

    /// Total capacity the channel was created with.
    pub fn capacity(&self) -> usize {
        self.inner.max_capacity()
    }
}
/// Outbound T3 — actuator setpoint the substrate sends to a connected device.
/// The `automation_system` constructs these; the tokio-side drain task builds
/// the full `QuicMessage` (assigns timestamp + sequence) and opens a bi-stream
/// to the target device.
#[derive(Debug, Clone, Copy)]
pub struct OutboundT3 {
/// Device the command is addressed to; used as the lookup key in the
/// connection registry.
pub target_device: uuid::Uuid,
/// Logical actuator on the target device.
pub sensor_id: u16,
/// Setpoint to apply.
pub raw_value: f64,
/// `SensorType` discriminant of the actuator (typically `Relay`).
pub sensor_type: u8,
}
/// ECS-facing send half of the outbound-T3 (actuator command) channel.
#[derive(Clone)]
pub struct T3OutboundSender {
    inner: mpsc::Sender<OutboundT3>,
}

impl T3OutboundSender {
    /// Wraps the send half of the outbound-T3 channel.
    pub fn new(inner: mpsc::Sender<OutboundT3>) -> Self {
        Self { inner }
    }

    /// Non-blocking enqueue. Returns `Ok(())` on success; `Err` mirrors
    /// tokio's `TrySendError` so callers can distinguish "full" from "closed".
    pub fn try_send(
        &self,
        cmd: OutboundT3,
    ) -> Result<(), mpsc::error::TrySendError<OutboundT3>> {
        let outcome = self.inner.try_send(cmd);
        outcome
    }

    /// Currently queued commands: total capacity minus free slots.
    pub fn depth(&self) -> usize {
        let total = self.inner.max_capacity();
        let free = self.inner.capacity();
        total.saturating_sub(free)
    }

    /// Total capacity the channel was created with.
    pub fn capacity(&self) -> usize {
        self.inner.max_capacity()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `WIRE_SIZE` must equal the sum of the individual field widths.
    #[test]
    fn wire_size_matches_fields() {
        let field_widths: [usize; 6] = [16, 2, 8, 8, 4, 1];
        let total: usize = field_widths.iter().sum();
        assert_eq!(QuicMessage::WIRE_SIZE, total);
    }

    /// Encoding then decoding returns an identical message.
    #[test]
    fn roundtrip_preserves_all_fields() {
        let original = QuicMessage {
            device_id: uuid::Uuid::from_u128(0x0123456789abcdef_fedcba9876543210),
            sensor_id: 0xBEEF,
            raw_value: -273.15,
            timestamp_us: 1_700_000_000_000_001,
            sequence_number: 42,
            sensor_type: SensorType::Temperature.as_u8(),
        };

        let wire = original.to_bytes();
        assert_eq!(wire.len(), QuicMessage::WIRE_SIZE);

        let decoded = QuicMessage::decode(&wire).unwrap();
        assert_eq!(original, decoded);
        assert_eq!(decoded.typ(), SensorType::Temperature);
    }

    /// Buffers one byte short and one byte long are both rejected, and the
    /// error reports the expected and actual lengths.
    #[test]
    fn decode_rejects_wrong_length() {
        for wrong_len in [38usize, 40] {
            let buf = vec![0u8; wrong_len];
            let outcome = QuicMessage::decode(&buf);
            assert!(matches!(
                outcome,
                Err(WireError::BadLength { expected: 39, got }) if got == wrong_len
            ));
        }
    }

    /// Multi-byte integer fields are laid out least-significant byte first.
    #[test]
    fn encode_layout_is_little_endian() {
        let humidity = SensorType::Humidity.as_u8();
        let wire = QuicMessage {
            device_id: uuid::Uuid::nil(),
            sensor_id: 0x0102,
            raw_value: 0.0,
            timestamp_us: 0,
            sequence_number: 0x04030201,
            sensor_type: humidity,
        }
        .to_bytes();

        assert_eq!(&wire[16..18], &[0x02, 0x01]);
        assert_eq!(&wire[34..38], &[0x01, 0x02, 0x03, 0x04]);
        assert_eq!(wire[38], humidity);
    }

    /// Discriminant 0 and unassigned discriminants all map to `Generic`.
    #[test]
    fn unknown_sensor_type_decodes_as_generic() {
        for raw in [0u8, 99, 255] {
            assert_eq!(SensorType::from_u8(raw), SensorType::Generic);
        }
    }
}

View File

@@ -0,0 +1,470 @@
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::{Arc, RwLock};
use std::time::Instant;
use anyhow::{Context, anyhow};
use metrics::{counter, histogram};
use quinn::{
Connection, Endpoint, Incoming, RecvStream, ServerConfig, StreamId, TransportConfig,
};
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
use tokio::sync::mpsc;
use uuid::Uuid;
use crate::config::QuicConfig;
use crate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender};
/// Routing table from device UUID to the QUIC `Connection` that device was
/// last seen on. Several UUIDs typically share one `Connection` (a single
/// simulator process commonly represents multiple virtual devices);
/// `quinn::Connection` is internally `Arc`-backed, so storing clones is cheap.
///
/// Shared as `Arc<RwLock<…>>`: the tokio readers register devices on first
/// message, and `drain_outbound_t3` looks routes up at automation cadence.
/// Critical sections are tiny synchronous map operations with no `.await`
/// while the lock is held, which is why `std::sync::RwLock` is used rather
/// than `tokio::sync::*`.
pub type ConnectionRegistry = Arc<RwLock<HashMap<Uuid, Connection>>>;

/// Creates an empty registry.
pub fn new_connection_registry() -> ConnectionRegistry {
    ConnectionRegistry::default()
}
/// Records `device_id → conn` unless the device already has an entry; an
/// existing entry is left untouched. Safe to call for every message: the
/// common case takes only the read lock.
fn ensure_registered(registry: &ConnectionRegistry, device_id: Uuid, conn: &Connection) {
    // Fast path under the shared lock; the guard is released at the end of
    // this statement, before the write lock is requested.
    let already_known = registry.read().unwrap().contains_key(&device_id);
    if already_known {
        return;
    }

    // Another task may have registered the device between the two locks, so
    // insert only if the slot is still vacant.
    let mut routes = registry.write().unwrap();
    routes.entry(device_id).or_insert_with(|| conn.clone());
}
/// Size of Quinn's datagram receive buffer, in bytes (256 KiB). Chosen to
/// absorb microbursts at the telemetry rates.
const DATAGRAM_RECV_BUFFER_BYTES: usize = 256 * 1024;

/// Reads the PEM certificate chain and private key named in `cfg` and builds
/// a Quinn `ServerConfig` with an explicit transport configuration.
///
/// # Errors
///
/// Fails if either file cannot be read or parsed, if the certificate file
/// holds no certificate, if the key file holds no private key, or if Quinn
/// rejects the certificate/key pair.
pub fn build_server_config(cfg: &QuicConfig) -> anyhow::Result<ServerConfig> {
    let (cert_path, key_path) = (&cfg.server_cert, &cfg.server_key);

    let cert_pem = std::fs::read(cert_path)
        .with_context(|| format!("read server_cert at {}", cert_path))?;
    let key_pem = std::fs::read(key_path)
        .with_context(|| format!("read server_key at {}", key_path))?;

    let mut cert_reader = cert_pem.as_slice();
    let certs = rustls_pemfile::certs(&mut cert_reader)
        .collect::<Result<Vec<CertificateDer<'static>>, _>>()
        .with_context(|| format!("parse PEM certs at {}", cert_path))?;
    if certs.is_empty() {
        return Err(anyhow!("no certificates found in {}", cert_path));
    }

    let mut key_reader = key_pem.as_slice();
    let parsed_key = rustls_pemfile::private_key(&mut key_reader)
        .with_context(|| format!("parse PEM key at {}", key_path))?;
    let key: PrivateKeyDer<'static> = match parsed_key {
        Some(key) => key,
        None => return Err(anyhow!("no private key found in {}", key_path)),
    };

    let mut server_config =
        ServerConfig::with_single_cert(certs, key).context("build Quinn ServerConfig")?;

    // Set the transport values explicitly so the numbers that drive the
    // evaluation are visible in source and in the startup log instead of
    // hiding in Quinn's defaults.
    let mut transport = TransportConfig::default();
    transport.datagram_receive_buffer_size(Some(DATAGRAM_RECV_BUFFER_BYTES));
    server_config.transport = Arc::new(transport);
    tracing::info!(
        datagram_recv_buffer_bytes = DATAGRAM_RECV_BUFFER_BYTES,
        "Quinn TransportConfig tuned"
    );

    Ok(server_config)
}
/// Bind the listener. Must be called from inside a tokio runtime context
/// (Quinn relies on `Handle::current()` internally).
///
/// `cfg.server_interface` may be an IPv4 address (`0.0.0.0`), a bare IPv6
/// address (`::`, `::1`) or a bracketed IPv6 address (`[::]`).
///
/// # Errors
///
/// Fails if the server TLS configuration cannot be built, if the interface
/// and port do not form a valid socket address, or if the bind itself fails.
pub fn bind_endpoint(cfg: &QuicConfig) -> anyhow::Result<Endpoint> {
    let server_config = build_server_config(cfg)?;

    // Joining "{interface}:{port}" and parsing the result rejects bare IPv6
    // literals, because `SocketAddr` only accepts IPv6 in the bracketed
    // `[addr]:port` form. Parse the interface as an IP address first and fall
    // back to the joined form so already-bracketed values keep working.
    let addr = match cfg.server_interface.parse::<std::net::IpAddr>() {
        Ok(ip) => SocketAddr::new(ip, cfg.server_port),
        Err(_) => format!("{}:{}", cfg.server_interface, cfg.server_port)
            .parse::<SocketAddr>()
            .with_context(|| {
                format!(
                    "invalid bind address {}:{}",
                    cfg.server_interface, cfg.server_port
                )
            })?,
    };

    Endpoint::server(server_config, addr).context("Endpoint::server bind")
}
/// Accept loop. Owns the outbound-T3 drain task and the connection registry,
/// then clones per-connection state into `handle_incoming` for orchestration.
///
/// The drain task is spawned exactly once for the lifetime of the listener;
/// it routes ECS-issued `OutboundT3` commands to the right connection by
/// looking up `target_device` in the registry that `handle_incoming` populates.
///
/// Tier semantics: T1 datagrams + T2 uni streams come *in* from devices;
/// T3 bi streams are server-initiated for actuator commands and go *out*
/// via `drain_outbound_t3`. Devices never open bi streams to the substrate.
///
/// If `synthetic_t3_rate_hz > 0`, a bench-only task drives toggling Relay
/// commands at that rate through the same outbound channel — used by the
/// cross-tier isolation benchmark.
pub async fn accept_loop(
endpoint: Endpoint,
t1: T1Sender,
t2: T2Sender,
registry: ConnectionRegistry,
outbound_rx: mpsc::Receiver<OutboundT3>,
outbound_tx: mpsc::Sender<OutboundT3>,
synthetic_t3_rate_hz: f64,
) {
tracing::info!(local = ?endpoint.local_addr().ok(), "QUIC accept loop running");
tokio::spawn(drain_outbound_t3(registry.clone(), outbound_rx));
if synthetic_t3_rate_hz > 0.0 {
tracing::info!(rate_hz = synthetic_t3_rate_hz, "synthetic T3 driver enabled");
tokio::spawn(synthetic_t3_driver(
registry.clone(),
outbound_tx.clone(),
synthetic_t3_rate_hz,
));
}
drop(outbound_tx);
while let Some(incoming) = endpoint.accept().await {
let t1 = t1.clone();
let t2 = t2.clone();
let registry = registry.clone();
tokio::spawn(handle_incoming(incoming, t1, t2, registry));
}
tracing::info!("QUIC accept loop exited");
}
/// Bench-only synthetic T3 driver. Round-robins over every registered device,
/// pushing a toggling Relay setpoint through the outbound channel at the
/// configured rate. Exercises the same code path as `automation_system`, so
/// the cross-tier-isolation bench measures the real path.
async fn synthetic_t3_driver(
registry: ConnectionRegistry,
tx: mpsc::Sender<OutboundT3>,
rate_hz: f64,
) {
let period = std::time::Duration::from_nanos((1.0e9 / rate_hz) as u64);
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
let mut next_value = 1.0;
loop {
ticker.tick().await;
// Snapshot device list under read lock; release before doing async work.
let devices: Vec<Uuid> = registry.read().unwrap().keys().copied().collect();
if devices.is_empty() {
continue;
}
for device in devices {
let cmd = OutboundT3 {
target_device: device,
sensor_id: 6,
raw_value: next_value,
sensor_type: SensorType::Relay.as_u8(),
};
if tx.try_send(cmd).is_err() {
counter!("substrate_t3_outbound_dropped_total").increment(1);
}
}
// Toggle for the next round so we exercise both setpoints.
next_value = if next_value > 0.5 { 0.0 } else { 1.0 };
}
}
/// Per-connection orchestrator. Performs the handshake and spawns the T1
/// datagram + T2 uni-stream readers; T3 outbound is handled connection-wide
/// by `drain_outbound_t3`. Waits for the connection to close, then purges
/// the registry and joins the inbound readers.
async fn handle_incoming(
incoming: Incoming,
t1: T1Sender,
t2: T2Sender,
registry: ConnectionRegistry,
) {
let conn = match incoming.await {
Ok(c) => c,
Err(e) => {
tracing::warn!(error = %e, "handshake failed");
return;
}
};
let remote = conn.remote_address();
let stable_id = conn.stable_id();
tracing::info!(?remote, stable_id, "connection established");
let dgram_task = tokio::spawn(read_datagrams(conn.clone(), t1, registry.clone()));
let uni_task = tokio::spawn(read_uni_streams(conn.clone(), t2, registry.clone()));
let _ = conn.closed().await;
// Purge every device UUID that pointed at this connection. Cheap: 7 entries
// for an industrial-profile simulator, occasional disconnect.
registry
.write()
.unwrap()
.retain(|_, c| c.stable_id() != stable_id);
if let Err(e) = dgram_task.await {
tracing::warn!(?remote, error = %e, "T1 datagram task ended unexpectedly");
}
if let Err(e) = uni_task.await {
tracing::warn!(?remote, error = %e, "T2 uni stream task ended unexpectedly");
}
tracing::info!(?remote, "connection closed");
}
/// T1 reader for one connection. Each datagram is decoded as a fixed-size
/// `QuicMessage` and pushed into the lossy T1 channel; a full channel drops
/// the message. Every decoded message also registers its device in the
/// connection registry, so outbound T3 commands can find this connection.
/// Returns when reading from the connection fails.
async fn read_datagrams(conn: Connection, t1: T1Sender, registry: ConnectionRegistry) {
    let remote = conn.remote_address();

    // Per-connection tallies, reported once when the reader ends.
    let mut received: u64 = 0;
    let mut dropped: u64 = 0;
    let mut decode_errors: u64 = 0;

    loop {
        let bytes = match conn.read_datagram().await {
            Ok(bytes) => bytes,
            Err(e) => {
                tracing::debug!(
                    ?remote,
                    received,
                    dropped,
                    decode_errors,
                    error = %e,
                    "T1 datagram reader ended"
                );
                return;
            }
        };

        let msg = match QuicMessage::decode(&bytes[..]) {
            Ok(msg) => msg,
            Err(e) => {
                // A malformed datagram is counted and skipped; the reader
                // keeps going.
                decode_errors += 1;
                counter!("substrate_decode_errors_total", "tier" => "t1").increment(1);
                tracing::warn!(
                    ?remote,
                    len = bytes.len(),
                    error = %e,
                    "T1 datagram decode failed"
                );
                continue;
            }
        };

        received += 1;
        counter!("substrate_received_total", "tier" => "t1").increment(1);
        ensure_registered(&registry, msg.device_id, &conn);

        if t1.send_lossy(msg) {
            continue;
        }
        dropped += 1;
        counter!("substrate_dropped_total", "tier" => "t1").increment(1);
        tracing::trace!(?remote, "T1 channel full, datagram dropped");
    }
}
/// T2 accept loop for unidirectional streams. Every accepted stream is handed
/// to its own `read_one_uni_stream` task, which reads fixed-size
/// `QuicMessage::WIRE_SIZE` records until the stream ends (one stream may
/// carry one event or many). Cross-stream interleaving is allowed; ordering
/// is only guaranteed *within* a stream, matching QUIC's stream semantics.
///
/// The loop ends on the first `accept_uni` error. Per-stream tasks are
/// detached: their handles are not kept.
async fn read_uni_streams(conn: Connection, t2: T2Sender, registry: ConnectionRegistry) {
    let remote = conn.remote_address();
    let mut streams_accepted: u64 = 0;
    loop {
        match conn.accept_uni().await {
            Ok(recv) => {
                streams_accepted += 1;
                tokio::spawn(read_one_uni_stream(
                    remote,
                    recv,
                    t2.clone(),
                    conn.clone(),
                    registry.clone(),
                ));
            }
            Err(e) => {
                tracing::debug!(
                    ?remote,
                    streams_accepted,
                    error = %e,
                    "T2 uni accept loop ended"
                );
                return;
            }
        }
    }
}
/// Worker for a single T2 uni stream. Repeatedly reads exactly
/// `QuicMessage::WIRE_SIZE` bytes, decodes them, registers the sending device
/// against this connection, and forwards the message on the T2 channel,
/// awaiting channel backpressure.
///
/// The worker stops when:
/// * `read_exact` fails (logged at trace level),
/// * a record fails to decode — the stream is stopped with code 0 so one
///   corrupt stream doesn't take down the whole connection, or
/// * the T2 channel is closed (receiver dropped).
async fn read_one_uni_stream(
    remote: SocketAddr,
    mut recv: RecvStream,
    t2: T2Sender,
    conn: Connection,
    registry: ConnectionRegistry,
) {
    let stream_id: StreamId = recv.id();
    let mut buf = [0u8; QuicMessage::WIRE_SIZE];
    // Messages successfully decoded on this stream so far.
    let mut count: u64 = 0;

    loop {
        if let Err(e) = recv.read_exact(&mut buf).await {
            tracing::trace!(
                ?remote,
                ?stream_id,
                count,
                error = %e,
                "T2 uni stream ended"
            );
            return;
        }

        let msg = match QuicMessage::decode(&buf) {
            Ok(decoded) => decoded,
            Err(e) => {
                counter!("substrate_decode_errors_total", "tier" => "t2").increment(1);
                tracing::warn!(
                    ?remote,
                    ?stream_id,
                    count,
                    error = %e,
                    "T2 decode failed; resetting stream"
                );
                let _ = recv.stop(0u32.into());
                return;
            }
        };

        count += 1;
        counter!("substrate_received_total", "tier" => "t2").increment(1);
        ensure_registered(&registry, msg.device_id, &conn);

        if t2.send(msg).await.is_ok() {
            continue;
        }
        // T2 receiver dropped (substrate shutting down).
        tracing::warn!(
            ?remote,
            ?stream_id,
            count,
            "T2 channel closed; abandoning stream"
        );
        return;
    }
}
/// T3 outbound drain — the substrate side of the actuator-command path.
///
/// Receives `OutboundT3` items from the channel, resolves the target device's
/// connection through the registry, and **spawns one tokio task per command**
/// that performs the `open_bi() → write → finish → read_ack` round-trip via
/// `send_outbound_t3`. The drain loop itself never awaits a round-trip, so a
/// single stalled command cannot hold up the commands queued behind it.
///
/// Commands whose device has no registry entry are counted in
/// `substrate_t3_outbound_no_route_total` and dropped.
///
/// Each spawned task records `substrate_latency_us{tier="t3"}` (measured
/// around the `send_outbound_t3` call) and increments
/// `substrate_received_total{tier="t3"}` on success, or
/// `substrate_t3_outbound_errors_total` on failure.
///
/// Sequence numbers are tracked here per `(device, sensor)` pair, starting at
/// 0 and wrapping on overflow, so the wire-level concerns stay out of the ECS.
/// The loop exits once the channel is closed and drained.
async fn drain_outbound_t3(registry: ConnectionRegistry, mut rx: mpsc::Receiver<OutboundT3>) {
    let mut seq_by_target: HashMap<(Uuid, u16), u32> = HashMap::new();

    while let Some(cmd) = rx.recv().await {
        let route = registry.read().unwrap().get(&cmd.target_device).cloned();
        let Some(conn) = route else {
            counter!("substrate_t3_outbound_no_route_total").increment(1);
            tracing::debug!(
                device = %cmd.target_device,
                "outbound T3: no route, dropping"
            );
            continue;
        };

        // Hand out the current sequence number and advance the counter.
        let next_seq = seq_by_target
            .entry((cmd.target_device, cmd.sensor_id))
            .or_insert(0);
        let seq = *next_seq;
        *next_seq = seq.wrapping_add(1);

        let msg = QuicMessage {
            device_id: cmd.target_device,
            sensor_id: cmd.sensor_id,
            raw_value: cmd.raw_value,
            timestamp_us: now_us(),
            sequence_number: seq,
            sensor_type: cmd.sensor_type,
        };

        // One task per command. Concurrent in-flight bi-streams are
        // first-class in QUIC, and this keeps the channel-drain loop hot.
        tokio::spawn(async move {
            let started = Instant::now();
            let ack = match send_outbound_t3(&conn, &msg).await {
                Ok(ack) => ack,
                Err(e) => {
                    counter!("substrate_t3_outbound_errors_total").increment(1);
                    tracing::warn!(
                        device = %msg.device_id,
                        sensor_id = msg.sensor_id,
                        error = %e,
                        "outbound T3 failed"
                    );
                    return;
                }
            };
            let elapsed_us = started.elapsed().as_micros() as f64;
            histogram!("substrate_latency_us", "tier" => "t3").record(elapsed_us);
            counter!("substrate_received_total", "tier" => "t3").increment(1);
            tracing::trace!(
                device = %msg.device_id,
                sensor_id = msg.sensor_id,
                raw = msg.raw_value,
                ack_raw = ack.raw_value,
                elapsed_us,
                "outbound T3 completed"
            );
        });
    }

    tracing::info!("outbound T3 drain task exited");
}
/// One substrate-initiated T3 round-trip on a fresh bidirectional stream:
/// write the encoded command, finish the send half, then read exactly
/// `QuicMessage::WIRE_SIZE` bytes back and decode them as the ack.
///
/// # Errors
/// Returns an error, annotated with the failing step, if opening the stream,
/// writing, finishing, reading the ack, or decoding it fails.
async fn send_outbound_t3(conn: &Connection, cmd: &QuicMessage) -> anyhow::Result<QuicMessage> {
    let (mut send, mut recv) = conn.open_bi().await.context("open_bi for outbound T3")?;

    let payload = cmd.to_bytes();
    send.write_all(&payload)
        .await
        .context("write outbound T3 command")?;
    send.finish().context("finish outbound T3 send half")?;

    let mut ack_buf = [0u8; QuicMessage::WIRE_SIZE];
    recv.read_exact(&mut ack_buf)
        .await
        .context("read outbound T3 ack")?;
    QuicMessage::decode(&ack_buf).context("decode outbound T3 ack")
}
/// Current wall-clock time as microseconds since the Unix epoch.
///
/// Returns 0 if the system clock reports a time before the epoch.
fn now_us() -> u64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_micros() as u64)
}

View File

@@ -0,0 +1,13 @@
use bevy::prelude::States;
/// Lifecycle of the QUIC listener inside the ECS schedule.
///
/// `Starting` is the default; `OnEnter(Starting)` performs the bind and, on
/// success, transitions to `Started`. A `Failed` variant will join when we
/// add proper error surfacing — for now a bind failure panics the app.
#[derive(States, Debug, Clone, Copy, Default, Eq, PartialEq, Hash)]
pub enum ServerState {
/// Initial state (`#[default]`): the listener has not been bound yet.
#[default]
Starting,
/// The bind succeeded. `ingest_system` is gated on this state.
Started,
}

View File

@@ -0,0 +1,99 @@
//! Components attached to per-sensor entities, plus the per-type threshold
//! table used by `simulation_system`'s crossing detection.
//!
//! Each (device, sensor) pair becomes one entity tagged with `Asset` and
//! carrying `DeviceId` + `SensorId` + `SensorTypeTag` + `RawSensorData` +
//! `SmoothedValue`.
use bevy::prelude::*;
use crate::transport::SensorType;
/// Marker — every (device, sensor) pair becomes one entity tagged `Asset`.
/// Carries no data; it is inserted alongside the other sensor components
/// when the entity is spawned.
#[derive(Component, Debug, Default, Clone, Copy)]
pub struct Asset;
/// UUID of the device that owns this sensor entity (taken from the
/// `device_id` field of the message that created the entity).
#[derive(Component, Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DeviceId(pub uuid::Uuid);
/// Sensor identifier within its device (taken from the `sensor_id` field of
/// the message that created the entity). Together with `DeviceId` it forms
/// the key used by the sensor registry.
#[derive(Component, Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SensorId(pub u16);
/// Sensor type — set on entity creation from the first message that names
/// the (device, sensor) pair, then immutable. We don't track type changes:
/// a given (device_id, sensor_id) is one logical sensor with one type for
/// the lifetime of the run.
///
/// The wrapped value is decoded from the message's `sensor_type` byte via
/// `SensorType::from_u8` when the entity is spawned.
#[derive(Component, Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SensorTypeTag(pub SensorType);
/// Latest reading from this (device, sensor). Updated in place by
/// `ingest_system` (and by `automation_system` when it mirrors a Relay
/// setpoint locally); read by simulation/export/diagnostics.
#[derive(Component, Debug, Default, Clone, Copy, PartialEq)]
pub struct RawSensorData {
/// Most recent value, copied verbatim from the message's `raw_value`.
pub raw_value: f64,
/// Timestamp copied from the message's `timestamp_us`
/// (presumably microseconds since the Unix epoch — confirm against the sender).
pub timestamp_us: u64,
/// Sequence number copied from the message's `sequence_number`.
pub sequence_number: u32,
}
/// Number of samples held in the `SmoothedValue` rolling window.
pub const SMOOTHED_WINDOW: usize = 16;
/// Rolling mean over the most recent `SMOOTHED_WINDOW` finite readings,
/// together with the hysteresis flag used for threshold-crossing detection.
/// `simulation_system` feeds it via [`SmoothedValue::push`] and owns the
/// `above_threshold` flag — this is the bit of the ECS that does honest
/// digital-twin transform work, not just write-through of incoming samples.
#[derive(Component, Debug, Clone, Copy)]
pub struct SmoothedValue {
    /// Circular sample buffer; only the first `filled` slots hold real data
    /// until the window has filled once.
    ring: [f64; SMOOTHED_WINDOW],
    /// Index of the slot the next sample will be written to.
    head: usize,
    /// Number of valid samples in `ring`, saturating at `SMOOTHED_WINDOW`.
    filled: u16,
    /// Mean of the valid samples; 0.0 until the first sample is pushed.
    pub mean: f64,
    /// Whether `mean` was above the per-type threshold at the last check.
    pub above_threshold: bool,
}

impl Default for SmoothedValue {
    /// An empty window: no samples, mean 0.0, below threshold.
    fn default() -> Self {
        Self {
            above_threshold: false,
            mean: 0.0,
            filled: 0,
            head: 0,
            ring: [0.0; SMOOTHED_WINDOW],
        }
    }
}

impl SmoothedValue {
    /// Record one sample and refresh `mean`.
    ///
    /// Non-finite values (NaN / ±∞) are ignored — the smoothed state stays
    /// whatever it was. This matters because T3 acks can carry NaN when the
    /// substrate has never seen the target sensor.
    pub fn push(&mut self, v: f64) {
        if v.is_nan() || v.is_infinite() {
            return;
        }

        // Overwrite the oldest slot and advance the write cursor.
        let slot = self.head;
        self.ring[slot] = v;
        self.head = (slot + 1) % SMOOTHED_WINDOW;

        if usize::from(self.filled) < SMOOTHED_WINDOW {
            self.filled += 1;
        }

        // Until the window has filled once, only the leading `n` slots count.
        let n = usize::from(self.filled);
        let total: f64 = self.ring.iter().take(n).sum();
        self.mean = total / n as f64;
    }
}
/// Per-type threshold for `simulation_system`'s crossing detection. Chosen
/// mid-band against the simulator's waveforms so crossings actually fire
/// during a demo; in a real deployment these would be alarm thresholds
/// supplied by config.
///
/// `simulation_system` compares with a strict `mean > threshold`, so the
/// `f64::INFINITY` entry for `Relay` means a Relay entity never reports a
/// crossing.
pub(super) fn threshold_for(t: SensorType) -> f64 {
match t {
SensorType::Generic => 0.0,
SensorType::Temperature => 22.0, // °C — simulator oscillates 15..25
SensorType::Humidity => 55.0, // % — 30..70
SensorType::Pressure => 1014.0, // hPa — 1008..1018
SensorType::Voltage => 230.5, // V — 229..231
SensorType::Current => 10.5, // A — 8..12
SensorType::Presence => 1.0, // s — Trigger threshold
SensorType::Relay => f64::INFINITY, // Actuator state, no threshold
}
}

View File

@@ -0,0 +1,53 @@
//! ECS world: the five paper-named systems plus the components and resources
//! they operate on.
//!
//! ```text
//! components.rs ── per-sensor components + per-type threshold table
//! resources.rs ── SensorRegistry, DiagnosticsState, ExportSampleState
//! systems.rs    ── ingest / simulation / automation / export / diagnostics
//! tests.rs ── unit tests (#[cfg(test)] only)
//! ```
//!
//! Each (device, sensor) pair becomes one entity with `Asset` + `DeviceId` +
//! `SensorId` + `SensorTypeTag` + `RawSensorData` + `SmoothedValue`.
//! `ingest_system` upserts on every incoming `QuicMessage`; the registry maps
//! `(Uuid, u16) → Entity` for O(1) lookup.
mod components;
mod resources;
mod systems;
#[cfg(test)]
mod tests;
use bevy::prelude::*;
use bevy::state::condition::in_state;
use crate::transport::state::ServerState;
pub use components::{
Asset, DeviceId, RawSensorData, SMOOTHED_WINDOW, SensorId, SensorTypeTag, SmoothedValue,
};
pub use resources::SensorRegistry;
/// Bevy plugin that installs the world's resources and systems.
///
/// Schedule layout:
/// * `PreUpdate`  — `ingest_system`, only while the server state is `Started`.
/// * `Update`     — `simulation_system` then `automation_system` (chained).
/// * `PostUpdate` — `export_system` then `diagnostics_system` (chained).
///
/// Only `ingest_system` is gated on `ServerState`; the other systems run
/// every tick.
pub struct WorldPlugin;
impl Plugin for WorldPlugin {
fn build(&self, app: &mut App) {
// Resources are created with their `Default` values.
app.init_resource::<SensorRegistry>()
.init_resource::<resources::DiagnosticsState>()
.init_resource::<resources::ExportSampleState>()
.add_systems(
PreUpdate,
systems::ingest_system.run_if(in_state(ServerState::Started)),
)
.add_systems(
Update,
(systems::simulation_system, systems::automation_system).chain(),
)
.add_systems(
PostUpdate,
(systems::export_system, systems::diagnostics_system).chain(),
);
}
}

View File

@@ -0,0 +1,48 @@
//! Bevy `Resource`s consumed by the world's systems.
use std::collections::HashMap;
use std::time::Instant;
use bevy::prelude::{Entity, Resource};
/// O(1) lookup `(device_id, sensor_id) → Entity`. Populated lazily as
/// readings arrive (the ingest path inserts an entry the first time a pair is
/// seen); queried by export/diagnostics.
#[derive(Resource, Default)]
pub struct SensorRegistry {
// Key is `(device UUID, sensor id)`; value is the entity spawned for it.
pub(crate) map: HashMap<(uuid::Uuid, u16), Entity>,
}
impl SensorRegistry {
/// Number of registered `(device, sensor)` pairs.
pub fn entity_count(&self) -> usize {
self.map.len()
}
}
/// Rolling counter of ticks since the last `diagnostics` log line was emitted.
#[derive(Resource)]
pub(super) struct DiagnosticsState {
/// When the last diagnostics line was logged.
pub(super) last_log: Instant,
/// Ticks counted since `last_log`; reset to 0 on each log line.
pub(super) ticks_since_log: u64,
}
impl Default for DiagnosticsState {
/// Starts the measurement interval at construction time with zero ticks.
fn default() -> Self {
Self {
last_log: Instant::now(),
ticks_since_log: 0,
}
}
}
/// Rate-limiter for `export_system` — runs at the ECS tick rate but only
/// emits gauges once per second.
#[derive(Resource)]
pub(super) struct ExportSampleState {
/// When gauges were last emitted.
pub(super) last_sample: Instant,
}
impl Default for ExportSampleState {
/// Starts the interval at construction time, so the first emission happens
/// about one second after the resource is created.
fn default() -> Self {
Self { last_sample: Instant::now() }
}
}

View File

@@ -0,0 +1,318 @@
//! The five paper-named ECS systems and their private helpers.
//!
//! Scheduler placement (configured in [`super::WorldPlugin`]):
//!
//! | Schedule | Systems |
//! |-----------|--------------------------------------|
//! | PreUpdate | ingest |
//! | Update    | simulation → automation              |
//! | PostUpdate| export → diagnostics |
use std::collections::HashMap;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use bevy::prelude::*;
use metrics::{counter, gauge, histogram};
use tokio::sync::mpsc::error::TrySendError;
use crate::transport::ecs::{BridgeReceivers, BridgeSenders};
use crate::transport::{OutboundT3, QuicMessage, SensorType};
use super::components::{
Asset, DeviceId, RawSensorData, SensorId, SensorTypeTag, SmoothedValue, threshold_for,
};
use super::resources::{DiagnosticsState, ExportSampleState, SensorRegistry};
/// T1 batch limit per tick. Anything beyond this stays in the channel and
/// either drains next tick or gets dropped on full (T1's contract is lossy).
const T1_INGEST_BATCH: usize = 1024;
/// T2 batch limit per tick. Anything beyond this stays in the channel and is
/// picked up on a later tick.
const T2_INGEST_BATCH: usize = 512;
/// Move pending messages from the two inbound tier channels (T1 datagrams,
/// T2 uni streams) into ECS state. Both tiers are drained in bounded batches:
/// at most `T1_INGEST_BATCH` T1 messages and `T2_INGEST_BATCH` T2 messages
/// per tick; whatever is left stays queued for a later tick.
///
/// For every message the ingest latency (`now - msg.timestamp_us`, saturating
/// at 0) is recorded in `substrate_latency_us{tier}` before the reading is
/// upserted. `now` is sampled once per tick and shared by both tiers.
///
/// T3 is *outbound* (substrate → device, actuator commands) and lives in
/// the tokio runtime — see `transport::server::drain_outbound_t3`.
pub(super) fn ingest_system(
    bridge: Res<BridgeReceivers>,
    mut registry: ResMut<SensorRegistry>,
    mut commands: Commands,
    mut q: Query<&mut RawSensorData>,
) {
    let now = now_us();

    // T1 — datagrams.
    let mut t1 = bridge.t1.lock().unwrap();
    for _ in 0..T1_INGEST_BATCH {
        let Ok(msg) = t1.try_recv() else {
            break;
        };
        let latency_us = now.saturating_sub(msg.timestamp_us) as f64;
        histogram!("substrate_latency_us", "tier" => "t1").record(latency_us);
        upsert_reading(&mut registry, &mut commands, &mut q, msg);
    }
    // Release the T1 receiver before taking the T2 one.
    drop(t1);

    // T2 — uni streams.
    let mut t2 = bridge.t2.lock().unwrap();
    for _ in 0..T2_INGEST_BATCH {
        let Ok(msg) = t2.try_recv() else {
            break;
        };
        let latency_us = now.saturating_sub(msg.timestamp_us) as f64;
        histogram!("substrate_latency_us", "tier" => "t2").record(latency_us);
        upsert_reading(&mut registry, &mut commands, &mut q, msg);
    }
}
/// Write one reading into the ECS, creating the sensor entity on first sight.
///
/// * Known `(device, sensor)` whose entity is already queryable: overwrite
///   its `RawSensorData` in place.
/// * Known pair whose entity is not queryable yet: queue an insert through
///   `Commands`.
/// * Unknown pair: spawn a new entity with the full sensor component set and
///   record it in the registry.
fn upsert_reading(
    registry: &mut SensorRegistry,
    commands: &mut Commands,
    q: &mut Query<&mut RawSensorData>,
    msg: QuicMessage,
) {
    let key = (msg.device_id, msg.sensor_id);
    let data = RawSensorData {
        raw_value: msg.raw_value,
        timestamp_us: msg.timestamp_us,
        sequence_number: msg.sequence_number,
    };

    let known = registry.map.get(&key).copied();
    match known {
        Some(entity) => match q.get_mut(entity) {
            // Common case: existing entity, mutate in place.
            Ok(mut current) => *current = data,
            // Edge case: entity was registered earlier in *this* tick via
            // `commands.spawn`, so the components aren't in the archetype
            // yet (`Commands` is deferred). Queue another insert; last write
            // wins when Commands flushes.
            Err(_) => {
                commands.entity(entity).insert(data);
            }
        },
        None => {
            let spawned = commands
                .spawn((
                    Asset,
                    DeviceId(msg.device_id),
                    SensorId(msg.sensor_id),
                    SensorTypeTag(SensorType::from_u8(msg.sensor_type)),
                    SmoothedValue::default(),
                    data,
                ))
                .id();
            registry.map.insert(key, spawned);
        }
    }
}
/// Sensor id of the Relay actuator. Fixed at `6` in the industrial profile —
/// see `simulator/src/profile.rs::build_slots`.
const RELAY_SENSOR_ID: u16 = 6;

/// Closed-loop automation. Every Presence entity whose `RawSensorData`
/// changed since this system last ran produces one relay setpoint:
///
/// * raw value `< 1.0` ⇒ setpoint `1.0`,
/// * otherwise ⇒ setpoint `0.0`.
///
/// The decision uses the raw reading, not the smoothed mean. For each
/// setpoint the system
///
/// 1. enqueues a T3 actuator command for the originating device
///    (substrate → simulator); a full channel is counted in
///    `substrate_t3_outbound_dropped_total`, and
/// 2. mirrors the setpoint into the local Relay entity so the operator
///    dashboard reflects it immediately (Grafana panels read the local ECS
///    state). The mirror is written whether or not the enqueue succeeded.
pub(super) fn automation_system(
    senders: Res<BridgeSenders>,
    mut registry: ResMut<SensorRegistry>,
    mut commands: Commands,
    mut p: ParamSet<(
        Query<(&DeviceId, &SensorTypeTag, &RawSensorData), Changed<RawSensorData>>,
        Query<&mut RawSensorData>,
    )>,
) {
    // Collect first: the two queries in the `ParamSet` cannot be borrowed at
    // the same time.
    let triggers: Vec<_> = p
        .p0()
        .iter()
        .filter(|(_, tag, _)| tag.0 == SensorType::Presence)
        .map(|(dev_id, _, data)| {
            // Presence > 1.0 s ⇒ no occupancy detected ⇒ motor may run (relay 0).
            // Presence < 1.0 s ⇒ occupancy detected ⇒ stop motor (relay 1).
            let relay_state = if data.raw_value < 1.0 { 1.0 } else { 0.0 };
            (dev_id.0, relay_state)
        })
        .collect();

    let mut q = p.p1();
    for (device_id, relay_state) in triggers {
        // 1) Dispatch the real actuator command to the device over T3.
        let cmd = OutboundT3 {
            target_device: device_id,
            sensor_id: RELAY_SENSOR_ID,
            raw_value: relay_state,
            sensor_type: SensorType::Relay.as_u8(),
        };
        if let Err(send_err) = senders.t3_out.try_send(cmd) {
            match send_err {
                TrySendError::Full(_) => {
                    counter!("substrate_t3_outbound_dropped_total").increment(1);
                    tracing::warn!(device = %device_id, "outbound T3 channel full; setpoint dropped");
                }
                TrySendError::Closed(_) => {
                    // Drain task is gone — substrate shutting down. Quiet log.
                    tracing::debug!("outbound T3 channel closed");
                }
            }
        }

        // 2) Mirror the setpoint into the local Relay entity so the dashboard
        //    sees automation activity without waiting for the device ack.
        let mirror = QuicMessage {
            device_id,
            sensor_id: RELAY_SENSOR_ID,
            raw_value: relay_state,
            timestamp_us: now_us(),
            sequence_number: 0,
            sensor_type: SensorType::Relay.as_u8(),
        };
        upsert_reading(&mut registry, &mut commands, &mut q, mirror);
    }
}
/// Per-sensor digital-twin transform. Feeds each entity's latest
/// `RawSensorData` value into its sliding-window mean (`SmoothedValue`) and
/// increments `substrate_threshold_crossings_total{type, direction}` whenever
/// that mean moves to the other side of the per-type threshold
/// (`direction` is `"up"` or `"down"`).
///
/// The `Changed<RawSensorData>` filter limits the scan to entities whose
/// reading changed since the last run, so the cost scales with ingress rate,
/// not fleet size.
pub(super) fn simulation_system(
    mut q: Query<(&SensorTypeTag, &RawSensorData, &mut SmoothedValue), Changed<RawSensorData>>,
) {
    for (st, raw, mut smoothed) in q.iter_mut() {
        smoothed.push(raw.raw_value);

        let now_above = smoothed.mean > threshold_for(st.0);
        if now_above == smoothed.above_threshold {
            // Same side of the threshold as before: nothing to report.
            continue;
        }
        smoothed.above_threshold = now_above;

        let dir = match now_above {
            true => "up",
            false => "down",
        };
        counter!(
            "substrate_threshold_crossings_total",
            "type" => st.0.label_str(),
            "direction" => dir
        )
        .increment(1);
    }
}
/// Publish ECS-side gauges to the metrics exporter. The system runs every
/// tick but returns early unless at least one second has passed since the
/// previous emission, which keeps its cost negligible. This is the system
/// the paper's §Architecture diagram calls `ExportSystem`.
///
/// Emitted gauges: entity count, per-tier channel depth and capacity, process
/// RSS (when available), and per-sensor-type aggregates (`count`, plus
/// `mean`/`min`/`max` when at least one finite reading exists).
pub(super) fn export_system(
    senders: Res<BridgeSenders>,
    registry: Res<SensorRegistry>,
    sensors_q: Query<(&SensorTypeTag, &RawSensorData)>,
    mut state: ResMut<ExportSampleState>,
) {
    let now = Instant::now();
    let since_last = now.duration_since(state.last_sample);
    if since_last < Duration::from_secs(1) {
        return;
    }
    state.last_sample = now;

    // ---- runtime telemetry ----
    gauge!("substrate_entities").set(registry.entity_count() as f64);
    gauge!("substrate_channel_depth", "tier" => "t1").set(senders.t1.depth() as f64);
    gauge!("substrate_channel_depth", "tier" => "t2").set(senders.t2.depth() as f64);
    gauge!("substrate_channel_depth", "tier" => "t3").set(senders.t3_out.depth() as f64);
    gauge!("substrate_channel_capacity", "tier" => "t1").set(senders.t1.capacity() as f64);
    gauge!("substrate_channel_capacity", "tier" => "t2").set(senders.t2.capacity() as f64);
    gauge!("substrate_channel_capacity", "tier" => "t3").set(senders.t3_out.capacity() as f64);
    if let Some(stats) = memory_stats::memory_stats() {
        gauge!("substrate_rss_bytes").set(stats.physical_mem as f64);
    }

    // ---- sensor data aggregates (per type) ----
    let mut by_type: HashMap<&'static str, Aggregate> = HashMap::new();
    for (st, data) in sensors_q.iter() {
        let agg = by_type
            .entry(st.0.label_str())
            .or_insert_with(Aggregate::new);
        agg.push(data.raw_value);
    }

    for (label, agg) in by_type {
        gauge!("sensor_aggregate", "type" => label, "stat" => "count").set(agg.count as f64);
        if agg.count == 0 {
            // No finite reading for this type: min/max/mean are meaningless.
            continue;
        }
        gauge!("sensor_aggregate", "type" => label, "stat" => "mean").set(agg.mean());
        gauge!("sensor_aggregate", "type" => label, "stat" => "min").set(agg.min);
        gauge!("sensor_aggregate", "type" => label, "stat" => "max").set(agg.max);
    }
}
/// Tick-rate diagnostics. Counts every tick and, once at least one second has
/// elapsed since the previous report, publishes the measured tick rate as the
/// `substrate_tick_hz` gauge, logs it together with the registered entity
/// count, and starts a new measurement interval.
pub(super) fn diagnostics_system(
    mut state: ResMut<DiagnosticsState>,
    registry: Res<SensorRegistry>,
) {
    state.ticks_since_log += 1;

    let now = Instant::now();
    let elapsed = now.duration_since(state.last_log);
    if elapsed < Duration::from_secs(1) {
        return;
    }

    let tick_hz = state.ticks_since_log as f64 / elapsed.as_secs_f64();
    gauge!("substrate_tick_hz").set(tick_hz);
    tracing::info!(
        tick_hz = format_args!("{:.1}", tick_hz),
        entities = registry.entity_count(),
        "diagnostics"
    );

    state.last_log = now;
    state.ticks_since_log = 0;
}
/// Current wall-clock time as microseconds since the Unix epoch.
///
/// Returns 0 if the system clock reports a time before the epoch.
fn now_us() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_micros() as u64,
        Err(_) => 0,
    }
}
/// Running count / sum / min / max over the finite readings of one sensor
/// type, used by `export_system` to build its per-type aggregates.
/// Non-finite samples (NaN, ±∞) are skipped entirely, so they never reach
/// `count`, `sum`, `min`, or `max`.
#[derive(Debug, Clone, Copy)]
struct Aggregate {
    /// Number of finite samples accepted so far.
    count: u64,
    /// Sum of the accepted samples.
    sum: f64,
    /// Smallest accepted sample; `+∞` while empty.
    min: f64,
    /// Largest accepted sample; `-∞` while empty.
    max: f64,
}

impl Aggregate {
    /// An empty accumulator.
    fn new() -> Self {
        Self {
            max: f64::NEG_INFINITY,
            min: f64::INFINITY,
            sum: 0.0,
            count: 0,
        }
    }

    /// Fold one sample in; non-finite values are ignored.
    fn push(&mut self, v: f64) {
        if v.is_finite() {
            self.count += 1;
            self.sum += v;
            if self.min > v {
                self.min = v;
            }
            if self.max < v {
                self.max = v;
            }
        }
    }

    /// Arithmetic mean of the accepted samples, or NaN when there are none.
    fn mean(&self) -> f64 {
        match self.count {
            0 => f64::NAN,
            n => self.sum / n as f64,
        }
    }
}

View File

@@ -0,0 +1,288 @@
//! Unit tests for the world's components and systems.
//!
//! Lives as a child module so it can poke at `pub(super)` items (the
//! internal resources, `threshold_for`, etc.) without enlarging the
//! public API.
use std::sync::Mutex;
use bevy::prelude::*;
use bevy::state::app::StatesPlugin;
use tokio::sync::mpsc;
use uuid::Uuid;
use crate::transport::ecs::{BridgeReceivers, BridgeSenders};
use crate::transport::state::ServerState;
use crate::transport::{OutboundT3, QuicMessage, SensorType, T1Sender, T2Sender, T3OutboundSender};
use super::WorldPlugin;
use super::components::{RawSensorData, SMOOTHED_WINDOW, SmoothedValue, threshold_for};
use super::resources::SensorRegistry;
/// Build a Bevy app with just enough plugins/resources to run the world
/// systems against test-owned channels. No QUIC, no tokio runtime.
///
/// Returns the app plus the T1/T2 send halves and the outbound-T3 receive
/// half — the latter so tests can observe `automation_system` dispatching.
///
/// Every channel has capacity 64, so a test must not queue more than 64
/// messages on one channel without ticking the app (or draining) in between.
/// The app has already been ticked once and is in `ServerState::Started`
/// when it is returned.
fn make_test_app() -> (
App,
mpsc::Sender<QuicMessage>,
mpsc::Sender<QuicMessage>,
mpsc::Receiver<OutboundT3>,
) {
let (t1_tx, t1_rx) = mpsc::channel::<QuicMessage>(64);
let (t2_tx, t2_rx) = mpsc::channel::<QuicMessage>(64);
let (t3_out_tx, t3_out_rx) = mpsc::channel::<OutboundT3>(64);
let bridge = BridgeReceivers {
t1: Mutex::new(t1_rx),
t2: Mutex::new(t2_rx),
};
// export_system samples channel depth/capacity from the senders; it
// requires the resource even when the test pushes via the raw senders
// directly (which is what the rest of the test does).
let senders = BridgeSenders {
t1: T1Sender::new(t1_tx.clone()),
t2: T2Sender::new(t2_tx.clone()),
t3_out: T3OutboundSender::new(t3_out_tx),
};
let mut app = App::new();
// The bridge resources are inserted before `WorldPlugin` is added.
app.add_plugins(MinimalPlugins)
.add_plugins(StatesPlugin)
.init_state::<ServerState>()
.insert_resource(bridge)
.insert_resource(senders)
.add_plugins(WorldPlugin);
// Force the state machine into Started so the run_if guard passes.
app.world_mut()
.resource_mut::<NextState<ServerState>>()
.set(ServerState::Started);
// Process the state transition before tests push messages.
app.update();
(app, t1_tx, t2_tx, t3_out_rx)
}
// ---- ingest_system: entity lifecycle ----
/// A first T1 message for an unseen `(device, sensor)` pair must register
/// exactly one entity and store the message's fields in its `RawSensorData`.
#[test]
fn ingest_t1_creates_entity_and_writes_raw_data() {
    let (mut app, t1_tx, _t2_tx, _t3_out_rx) = make_test_app();
    let device = Uuid::from_u128(0xa1a2_a3a4_a5a6_a7a8_a9aa_abac_adae_afb0);
    // Deliberately not `3.14`: that literal trips Clippy's deny-by-default
    // `approx_constant` lint. 21.5 is also exactly representable as an f64.
    let raw_value = 21.5;
    let msg = QuicMessage {
        device_id: device,
        sensor_id: 5,
        raw_value,
        timestamp_us: 1_700_000_000_000_001,
        sequence_number: 1,
        sensor_type: SensorType::Temperature.as_u8(),
    };
    t1_tx.try_send(msg).expect("channel cap");

    // Tick 1: ingest drains the channel and spawns via Commands.
    app.update();
    // Tick 2: Commands have flushed into the archetype.
    app.update();

    let registry = app.world().resource::<SensorRegistry>();
    assert_eq!(registry.map.len(), 1);
    let entity = *registry
        .map
        .get(&(device, 5))
        .expect("entity not registered");
    let data = app
        .world()
        .get::<RawSensorData>(entity)
        .expect("RawSensorData missing");
    assert_eq!(data.raw_value, raw_value);
    assert_eq!(data.sequence_number, 1);
    assert_eq!(data.timestamp_us, 1_700_000_000_000_001);
}
// A second reading for an already-known (device, sensor) pair must reuse the
// existing entity and overwrite its `RawSensorData`, not spawn a new one.
#[test]
fn ingest_t1_repeated_messages_update_in_place() {
let (mut app, t1_tx, _t2_tx, _t3_out_rx) = make_test_app();
let device = Uuid::new_v4();
// First reading.
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 0,
raw_value: 1.0,
timestamp_us: 1,
sequence_number: 1,
sensor_type: SensorType::Generic.as_u8(),
})
.unwrap();
// Two ticks so the spawned entity is fully materialised before the
// second reading arrives.
app.update();
app.update();
// Second reading on the same (device, sensor).
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 0,
raw_value: 2.0,
timestamp_us: 2,
sequence_number: 2,
sensor_type: SensorType::Generic.as_u8(),
})
.unwrap();
// One tick is enough here: the entity already exists, so the update is
// written in place.
app.update();
let registry = app.world().resource::<SensorRegistry>();
assert_eq!(registry.map.len(), 1, "should reuse the same entity");
let entity = *registry.map.get(&(device, 0)).unwrap();
let data = app.world().get::<RawSensorData>(entity).unwrap();
assert_eq!(data.raw_value, 2.0);
assert_eq!(data.sequence_number, 2);
}
#[test]
fn automation_dispatches_relay_stop_when_presence_drops() {
// `automation_system` decides on the *raw* Presence reading, not on the
// smoothed mean: any changed reading below 1.0 enqueues a Relay command
// with raw_value 1.0 ("stop"). This test pushes a full window of
// sub-threshold readings, ticking after each one, and confirms that at
// least one such OutboundT3 command for the device reached the outbound
// channel.
let (mut app, t1_tx, _t2_tx, mut t3_out_rx) = make_test_app();
let device = Uuid::new_v4();
for seq in 0..SMOOTHED_WINDOW as u32 {
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 5,
raw_value: 0.5, // below the 1.0 s threshold
timestamp_us: u64::from(seq),
sequence_number: seq,
sensor_type: SensorType::Presence.as_u8(),
})
.unwrap();
app.update();
app.update();
}
// Drain whatever automation dispatched. We expect at least one Relay=stop
// command targeting the device.
let mut saw_stop = false;
while let Ok(cmd) = t3_out_rx.try_recv() {
if cmd.target_device == device
&& cmd.sensor_type == SensorType::Relay.as_u8()
&& cmd.raw_value > 0.5
{
saw_stop = true;
}
}
assert!(
saw_stop,
"automation_system should have enqueued an outbound Relay=stop \
command for {device} after sustained sub-threshold Presence readings"
);
}
// ---- SmoothedValue unit tests ----
// With a single sample the mean is that sample; `push` never touches the
// `above_threshold` flag, so it keeps its default of `false`.
#[test]
fn smoothed_value_first_push_sets_mean() {
let mut s = SmoothedValue::default();
s.push(10.0);
assert_eq!(s.mean, 10.0);
assert!(!s.above_threshold);
}
// While the window is only partially filled, the mean covers just the
// samples pushed so far: (1 + 2 + 3 + 4) / 4 = 2.5.
#[test]
fn smoothed_value_averages_filled_window() {
let mut s = SmoothedValue::default();
for v in [1.0, 2.0, 3.0, 4.0] {
s.push(v);
}
assert!((s.mean - 2.5).abs() < 1e-9);
}
// After a full window of 0.0 is followed by a full window of 10.0, every old
// sample must have been overwritten, so the mean is exactly the new value.
#[test]
fn smoothed_value_rolls_after_window_fills() {
let mut s = SmoothedValue::default();
for _ in 0..SMOOTHED_WINDOW {
s.push(0.0);
}
assert!((s.mean - 0.0).abs() < 1e-9);
for _ in 0..SMOOTHED_WINDOW {
s.push(10.0);
}
assert!((s.mean - 10.0).abs() < 1e-9, "ring should fully roll over");
}
// NaN and ±∞ are rejected by `push`, so the mean stays at its previous value.
#[test]
fn smoothed_value_ignores_nonfinite() {
let mut s = SmoothedValue::default();
s.push(5.0);
let before = s.mean;
s.push(f64::NAN);
s.push(f64::INFINITY);
s.push(f64::NEG_INFINITY);
assert_eq!(s.mean, before, "non-finite values should not perturb the mean");
}
// ---- simulation_system: end-to-end threshold-crossing transition ----
// End-to-end check through ingest → simulation: a window of readings below
// the Temperature threshold leaves `above_threshold` false, and a following
// window of readings above it flips the flag to true.
#[test]
fn simulation_smoothes_and_detects_threshold_crossing() {
let (mut app, t1_tx, _t2_tx, _t3_out_rx) = make_test_app();
let device = Uuid::new_v4();
let threshold = threshold_for(SensorType::Temperature); // 22.0 °C
// Below-threshold readings: smoothed mean stays under, no crossing.
for seq in 0..SMOOTHED_WINDOW as u32 {
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 0,
raw_value: 18.0,
timestamp_us: u64::from(seq),
sequence_number: seq,
sensor_type: SensorType::Temperature.as_u8(),
})
.unwrap();
app.update();
app.update();
}
let registry = app.world().resource::<SensorRegistry>();
let entity = *registry.map.get(&(device, 0)).unwrap();
let smoothed = app
.world()
.get::<SmoothedValue>(entity)
.expect("SmoothedValue should be on every sensor entity");
assert!(smoothed.mean < threshold);
assert!(!smoothed.above_threshold, "should not have crossed up yet");
// Above-threshold readings: enough samples to drag the mean above
// the threshold (window = 16; pushing 30°C for 16 ticks lands mean ≈ 30).
// One tick per message suffices because the entity already exists.
for seq in (SMOOTHED_WINDOW as u32)..(SMOOTHED_WINDOW as u32 * 2) {
t1_tx
.try_send(QuicMessage {
device_id: device,
sensor_id: 0,
raw_value: 30.0,
timestamp_us: u64::from(seq),
sequence_number: seq,
sensor_type: SensorType::Temperature.as_u8(),
})
.unwrap();
app.update();
}
let smoothed = app.world().get::<SmoothedValue>(entity).unwrap();
assert!(smoothed.mean > threshold);
assert!(
smoothed.above_threshold,
"smoothed mean should have crossed up through {threshold}"
);
}