Files
quic_ecs_dt/scripts/bench-scaling.sh
2026-05-13 15:03:23 -04:00

248 lines
9.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# scripts/bench-scaling.sh — M6-lite: sweep T1 rate at fixed entity count,
# record tick_hz / P99 latency / drops / RSS into a CSV the paper can plot.
#
# Two modes:
#
# 1. Scaling sweep (default). Just T1 traffic. Tells you the substrate's
# throughput ceiling on this host and where the lossy-tier kicks in.
# Output: data/local/scaling.csv
#
# 2. Cross-tier isolation. Set T3_RATE_HZ=<N> to enable the substrate's
# synthetic T3 driver (server-initiated Relay commands to every
# connected device at that rate) in parallel with the T1 sweep. The CSV
# gains substrate-side T3 latency columns. If T3 P99 stays flat as T1
# climbs orders of magnitude, the paper's composition thesis is supported.
# Output: data/local/cross_tier.csv
#
# Holds:
# - tick_rate_hz $TICK_RATE_HZ (default 1000; set 0 for busy-loop)
# - device count $DEVICES (default 100, single-sensor profile)
# - window $WINDOW_S (default 20s steady-state per rate)
# - warmup $WARMUP_S (default 3s, excluded from each measurement window)
# - T3 baseline $T3_RATE_HZ (default 0 = disabled)
# - build profile $BUILD (release | debug; default release)
#
# Sweeps:
# T1 rate over the positional arguments, or these defaults:
# 100 500 1000 5000 10000 25000 50000
#
# Examples:
# # Pure T1 scaling sweep.
# ./scripts/bench-scaling.sh
#
# # Cross-tier isolation: hold T3 at 100 Hz, sweep T1.
# T3_RATE_HZ=100 ./scripts/bench-scaling.sh
#
# # Custom sweep, longer windows.
# DEVICES=1000 WINDOW_S=30 ./scripts/bench-scaling.sh 1000 5000 20000
set -euo pipefail

# Locate the repository root (the parent of this script's directory) and run
# from there so every relative path below resolves against it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
cd "${ROOT}"

# --- knobs ---
# Each knob keeps a non-empty value from the environment, else its default.
: "${DEVICES:=100}"
: "${TICK_RATE_HZ:=1000}"
: "${WARMUP_S:=3}"
: "${WINDOW_S:=20}"
: "${T3_RATE_HZ:=0}"
: "${BUILD:=release}"

# T1 rates to sweep: the positional arguments, or the built-in ladder.
if (( $# > 0 )); then
  RATES=("$@")
else
  RATES=(100 500 1000 5000 10000 25000 50000)
fi

# Pick default output path based on mode so the two CSVs don't clobber.
# awk does the comparison so a fractional T3 rate still counts as "enabled".
CROSS_TIER="$(awk -v r="${T3_RATE_HZ}" 'BEGIN { if (r + 0 > 0) print "1"; else print "0" }')"
case "${CROSS_TIER}" in
  1) OUT_CSV="${OUT_CSV:-data/local/cross_tier.csv}" ;;
  *) OUT_CSV="${OUT_CSV:-data/local/scaling.csv}" ;;
esac
# --- pretty logging ---
# ANSI styling is enabled only when stdout is a terminal; otherwise every
# style variable is empty so redirected output stays free of escape codes.
if [[ ! -t 1 ]]; then
  BOLD='' DIM='' GREEN='' RED='' RESET=''
else
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  GREEN=$'\033[32m'
  RED=$'\033[31m'
  RESET=$'\033[0m'
fi

# step MSG — bold section banner.
step() {
  printf '%s\n' "${BOLD}» ${1}${RESET}"
}

# ok MSG — green success line.
ok() {
  printf '%s\n' "${GREEN} ✓ ${1}${RESET}"
}

# fail MSG — red failure line (the caller decides whether to exit).
fail() {
  printf '%s\n' "${RED} ✗ ${1}${RESET}"
}
# --- prereqs ---
# Every external tool the script shells out to must be on PATH.
for cmd in cargo curl lsof awk; do
  command -v "$cmd" >/dev/null || { fail "missing: $cmd"; exit 1; }
done

# Refuse to start while something already holds a port the substrate needs.
# The check is "did lsof list anything", not "did a line say LISTEN": UDP
# sockets carry no LISTEN state, so grepping for LISTEN never saw a UDP
# listener. -sTCP:LISTEN still limits the TCP matches to listening sockets.
# lsof's exit status is not used because it is non-zero whenever ANY one of
# the -i search terms matched nothing, even if the other one did match.
for port in 9000 9100; do
  if [[ -n "$(lsof -nP -iUDP:"$port" -iTCP:"$port" -sTCP:LISTEN 2>/dev/null)" ]]; then
    fail "port $port in use — kill the running substrate first"
    exit 1
  fi
done

# Generate the TLS material on first run.
[[ -f certs/server.crt ]] || make certs >/dev/null
# --- build ---
# Compile both binaries for the requested profile, then record where cargo
# put them. Any BUILD value other than "release" means a debug build.
step "Building ($BUILD)"
case "$BUILD" in
  release)
    cargo build --release -p substrate -p simulator >/dev/null
    profile_dir="release"
    ;;
  *)
    cargo build -p substrate -p simulator >/dev/null
    profile_dir="debug"
    ;;
esac
SUBSTRATE="$ROOT/target/$profile_dir/substrate"
SIMULATOR="$ROOT/target/$profile_dir/simulator"
# --- start substrate with high tick rate ---
LOG_DIR="/tmp/quic_ecs_dt_bench"
mkdir -p "$LOG_DIR"
SUB_LOG="$LOG_DIR/substrate.log"
: > "$SUB_LOG"

# cleanup — terminate the simulator and substrate (whichever are running) and
# reap them. Every step is best-effort so cleanup never alters the exit path.
cleanup() {
  [[ -n "${SIM_PID:-}" ]] && kill -TERM "$SIM_PID" 2>/dev/null || true
  [[ -n "${SUBSTRATE_PID:-}" ]] && kill -TERM "$SUBSTRATE_PID" 2>/dev/null || true
  wait 2>/dev/null || true
}
# The traps are installed BEFORE the substrate is launched so that a failed
# start-up (or an early Ctrl-C) cannot leave the background process behind.
# INT/TERM turn into a real exit — a handler that merely returns would let
# the script keep running after its children were killed — and the EXIT trap
# then performs the cleanup exactly once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

step "Starting substrate (tick_rate_hz=$TICK_RATE_HZ, synthetic_t3=$T3_RATE_HZ Hz, log: $SUB_LOG)"
APP_SIMULATION__TICK_RATE_HZ="$TICK_RATE_HZ" \
APP_NETWORK__SYNTHETIC_T3_RATE_HZ="$T3_RATE_HZ" \
RUST_LOG=warn "$SUBSTRATE" >"$SUB_LOG" 2>&1 &
SUBSTRATE_PID=$!

# Wait for /metrics: up to 40 probes, 0.25 s apart (about 10 s in total).
substrate_ready=0
for (( attempt = 1; attempt <= 40; attempt++ )); do
  if curl -sf http://localhost:9100/metrics >/dev/null 2>&1; then
    substrate_ready=1
    break
  fi
  # No point polling a process that has already exited.
  if ! kill -0 "$SUBSTRATE_PID" 2>/dev/null; then
    break
  fi
  sleep 0.25
done
if (( substrate_ready == 1 )); then
  ok "substrate /metrics ready"
else
  fail "substrate didn't start"
  tail -n 20 "$SUB_LOG"
  exit 1
fi
# --- helpers to scrape a single value out of /metrics text ---

# snapshot_to FILE — write the current /metrics text to FILE (truncating it).
snapshot_to() {
  curl -s http://localhost:9100/metrics > "$1"
}

# get_value FILE PATTERN — print the last field of the first line in FILE
# that matches PATTERN followed by a single space; print nothing on no match.
#   $1: snapshot file
#   $2: full metric name as an extended regex, anchored at line start by this
#       function (callers escape the label braces as \{ ... \})
# The pattern reaches awk through the environment, not `-v`: `-v` applies
# string escape processing to its value, so a `\{` may lose its backslash
# (with a warning on some awks) before it is ever used as a regex.
get_value() {
  METRIC_PATTERN="$2" awk '
    BEGIN { pat = "^" ENVIRON["METRIC_PATTERN"] " " }
    $0 ~ pat { print $NF; exit }
  ' "$1"
}
# --- sweep ---
# Start a fresh CSV: the column list below is joined with commas to form the
# header row, replacing any previous file at the same path.
mkdir -p "$(dirname "$OUT_CSV")"
csv_columns=(
  rate_hz t3_rate_hz devices tick_rate_hz window_s
  t1_received t1_dropped t1_p50_us t1_p99_us t1_p999_us
  t3_received t3_no_route t3_p50_us t3_p99_us t3_p999_us
  tick_hz rss_mb channel_depth_max
)
( IFS=,; printf '%s\n' "${csv_columns[*]}" ) > "$OUT_CSV"

# Announce the sweep and print the bold console table header; the cross-tier
# layout adds the T3 columns.
case "$CROSS_TIER" in
  1)
    step "Sweeping T1 + holding T3 at ${T3_RATE_HZ} Hz (warmup ${WARMUP_S}s, window ${WINDOW_S}s, devices=$DEVICES)"
    printf '%s%-8s %-9s %-9s %-10s %-10s %-8s %-9s %-10s %-10s %-8s %-7s\n%s' \
      "$BOLD" \
      "rate" "t1_recv" "t1_drop" "t1_p50" "t1_p99" "t3_recv" "t3_p50" "t3_p99" "t3_p999" "tick_hz" "rss_mb" \
      "$RESET"
    ;;
  *)
    step "Sweeping T1 rate (warmup ${WARMUP_S}s, window ${WINDOW_S}s, devices=$DEVICES)"
    printf '%s%-8s %-9s %-9s %-10s %-10s %-10s %-8s %-7s\n%s' \
      "$BOLD" \
      "rate" "received" "dropped" "p50_us" "p99_us" "p999_us" "tick_hz" "rss_mb" \
      "$RESET"
    ;;
esac

# Snapshot file paths
BEFORE="$LOG_DIR/before.txt"
AFTER="$LOG_DIR/after.txt"
# Peak-tracker for channel depth: poll /metrics at 4 Hz during the window.
#
# peak_depth LABEL
#   $1: tier label ("t1", "t2" or "t3") selecting
#       substrate_channel_depth{tier="LABEL"}
# Globals:  WINDOW_S (read) — window length in whole seconds
# Outputs:  the largest integer depth observed, on stdout (0 if none seen)
#
# Takes WINDOW_S * 4 samples with a 0.25 s sleep after each one; the time
# spent in curl/awk comes on top, so the wall-clock duration is somewhat
# longer than WINDOW_S.
peak_depth() {
  local label="$1"
  local -i samples=$(( WINDOW_S * 4 ))
  local -i max=0 i
  local val
  for (( i = 0; i < samples; i++ )); do
    # Literal prefix match (no regex escaping needed); awk converts the value
    # to an integer so "12.0" and exponent forms such as "1.5e+03" both work.
    # A failed scrape simply yields an empty sample.
    val=$(curl -s http://localhost:9100/metrics 2>/dev/null \
      | awk -v key="substrate_channel_depth{tier=\"${label}\"}" \
          'index($0, key) == 1 { printf "%d\n", $NF; exit }') || true
    # Only plain non-negative integers reach the arithmetic comparison.
    if [[ "$val" =~ ^[0-9]+$ ]] && (( val > max )); then
      max=$val
    fi
    sleep 0.25
  done
  echo "$max"
}
# counter_delta AFTER BEFORE — integer difference of two counter readings.
# An empty reading (metric absent from the snapshot) counts as 0 in awk.
counter_delta() {
  awk -v a="$1" -v b="$2" 'BEGIN { printf "%d", a-b }'
}

# One measurement per T1 rate: start the simulator, let it warm up, bracket
# the window with two /metrics snapshots, stop the simulator, then report the
# counter deltas plus the end-of-window gauges on the console and in the CSV.
for rate in "${RATES[@]}"; do
  # Launch simulator: T1 sweep only. In cross-tier mode the substrate's
  # synthetic_t3 driver (enabled via env at startup) generates the T3
  # traffic; the simulator just keeps the connection alive and pushes T1.
  sim_args=(--profile single --sensor-type generic --rate-hz "$rate" --count 0 --devices "$DEVICES")
  RUST_LOG=warn "$SIMULATOR" "${sim_args[@]}" >"$LOG_DIR/sim_${rate}.log" 2>&1 &
  SIM_PID=$!

  # Warmup, then snapshot counters at the start of the *measurement* window.
  sleep "$WARMUP_S"
  snapshot_to "$BEFORE"
  t1_rx_start=$(get_value "$BEFORE" 'substrate_received_total\{tier="t1"\}')
  t1_drop_start=$(get_value "$BEFORE" 'substrate_dropped_total\{tier="t1"\}')
  t3_rx_start=$(get_value "$BEFORE" 'substrate_received_total\{tier="t3"\}')
  t3_noroute_start=$(get_value "$BEFORE" 'substrate_t3_outbound_no_route_total')

  # peak_depth spans the measurement window while sampling the T1 channel.
  depth_max=$(peak_depth t1)
  snapshot_to "$AFTER"

  # The simulator is stopped only after the closing snapshot is on disk.
  kill -TERM "$SIM_PID" 2>/dev/null || true
  wait "$SIM_PID" 2>/dev/null || true
  SIM_PID=""

  # End-of-window readings: counters, latency quantiles, tick rate and RSS.
  t1_rx_end=$(get_value "$AFTER" 'substrate_received_total\{tier="t1"\}')
  t1_drop_end=$(get_value "$AFTER" 'substrate_dropped_total\{tier="t1"\}')
  p50=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.5"\}')
  p99=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.99"\}')
  p999=$(get_value "$AFTER" 'substrate_latency_us\{tier="t1",quantile="0.999"\}')
  t3_rx_end=$(get_value "$AFTER" 'substrate_received_total\{tier="t3"\}')
  t3_noroute_end=$(get_value "$AFTER" 'substrate_t3_outbound_no_route_total')
  t3_p50=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.5"\}')
  t3_p99=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.99"\}')
  t3_p999=$(get_value "$AFTER" 'substrate_latency_us\{tier="t3",quantile="0.999"\}')
  tick_hz=$(get_value "$AFTER" 'substrate_tick_hz')
  rss=$(get_value "$AFTER" 'substrate_rss_bytes')

  # Compute deltas + format. Use awk for floating math.
  received=$(counter_delta "$t1_rx_end" "$t1_rx_start")
  dropped=$(counter_delta "$t1_drop_end" "$t1_drop_start")
  t3_received=$(counter_delta "$t3_rx_end" "$t3_rx_start")
  t3_no_route=$(counter_delta "$t3_noroute_end" "$t3_noroute_start")
  rss_mb=$(awk -v r="$rss" 'BEGIN { printf "%.1f", r/1048576 }')
  tick_hz_fmt=$(awk -v t="$tick_hz" 'BEGIN { printf "%.1f", t }')

  # Console row; a missing quantile is shown as 0.
  case "$CROSS_TIER" in
    1)
      printf '%-8s %-9s %-9s %-10.0f %-10.0f %-8s %-9.0f %-10.0f %-10.0f %-8s %-7s\n' \
        "$rate" "$received" "$dropped" \
        "${p50:-0}" "${p99:-0}" \
        "$t3_received" "${t3_p50:-0}" "${t3_p99:-0}" "${t3_p999:-0}" \
        "$tick_hz_fmt" "$rss_mb"
      ;;
    *)
      printf '%-8s %-9s %-9s %-10.0f %-10.0f %-10.0f %-8s %-7s\n' \
        "$rate" "$received" "$dropped" "${p50:-0}" "${p99:-0}" "${p999:-0}" \
        "$tick_hz_fmt" "$rss_mb"
      ;;
  esac

  # CSV row: same column order as the header written before the sweep.
  csv_row=(
    "$rate" "$T3_RATE_HZ" "$DEVICES" "$TICK_RATE_HZ" "$WINDOW_S"
    "$received" "$dropped" "${p50:-0}" "${p99:-0}" "${p999:-0}"
    "$t3_received" "$t3_no_route" "${t3_p50:-0}" "${t3_p99:-0}" "${t3_p999:-0}"
    "$tick_hz_fmt" "$rss_mb" "$depth_max"
  )
  ( IFS=,; printf '%s\n' "${csv_row[*]}" ) >> "$OUT_CSV"

  # Tiny breather between rate points so the substrate's summary window
  # doesn't carry over.
  sleep 1
done

# Closing summary: where the results and the substrate log ended up.
printf '\n%sCSV written to:%s %s\n%sSubstrate log:%s %s\n' \
  "$DIM" "$RESET" "$OUT_CSV" \
  "$DIM" "$RESET" "$SUB_LOG"