Files
tiennm99 55c873965c feat(observability): phase 06 — timing telemetry + soak analyzer + burst tester
Code prerequisites for the Phase 06 cold-start soak gate. The 24-72h soak
itself is operator-run; this commit ships the instrumentation + analysis
tools needed to make the PROCEED-or-PIVOT decision.

Telemetry
- src/util/timing.js: startTiming(cmd) returns {mark, end} that emits a
  structured cmd_timing log. takeColdFlag() returns {cold, isolateAgeMs}
  using a module-scoped boolean — first request in an isolate is cold,
  subsequent are warm. This replaces the originally-planned
  isolate_age_ms < 200ms classifier (broken because Mongo cold-connect
  itself is ~1500ms; cold requests would always bucket as warm —
  code-reviewer #11).
- src/util/request-context.js: setLastCold/getLastCold shared state
  bridges fetch-level cold detection into the dispatcher middleware
  without a circular import.
- src/index.js: takeColdFlag at the top of fetch() emits a request log
  and primes the request context for the dispatcher.
- src/modules/dispatcher.js: bot.use() middleware times every command.
  Chosen over per-handler wrapping to preserve the existing identity
  assertion in tests (handler === reg.allCommands.get(name).cmd.handler)
  — single instrumentation point, no contract change.

Soak tools (operator-run)
- scripts/analyze-soak.js: parses CF Logs export (NDJSON or CSV), filters
  cmd_timing events, computes p50/p95/p99 per (cmd, cold/warm). Counts
  dual-write secondary failures, mongo connection errors, CPU-time
  exceeded events. Writes markdown report.
- scripts/synthetic-burst.js: fires N parallel synthetic Telegram updates
  at the deployed Worker URL with cache-busting tokens. Used for the
  pre-deploy connection-cap stress test (debugger #2 — 20 parallel cold
  requests, abort if Atlas peak > 60% of 500-conn cap).
- package.json: analyze:soak + burst:synthetic scripts wired.

Tests
- tests/util/timing.test.js: 8 tests — timing semantics, cold flag flip.
- tests/scripts/analyze-soak.test.js: 22 tests — percentile math, NDJSON
  + CSV parse, aggregation, markdown formatting.

Tests: 667 → 697 (+30). Lint clean.

Operator runbook for Phase 06 (NOT executed by this commit):
1. Verify telemetry live via wrangler tail.
2. Run synthetic burst test: npm run burst:synthetic -- --url <prod> --secret <token>
3. Configure Atlas + CF Observability email alerts.
4. 24h soak (extend to 72h on stop-conditions per phase plan).
5. Daily npm run verify:mongo.
6. npm run analyze:soak -- --input <cf-logs.json> → soak-decision.md.
7. PROCEED to Phase 07 if cold-start P95 ≤ 2.5 × BASELINE_COLD_PING_MS;
   else execute phase-07-alt-pivot.md (Upstash standby).
2026-04-26 09:22:04 +07:00

149 lines
5.6 KiB
JavaScript

#!/usr/bin/env node
/**
* @file synthetic-burst — fire N parallel requests at the deployed Worker URL
* to exercise the M0 Atlas connection cap before a live deploy.
*
* Each request is a synthetic Telegram webhook update POST that grammY will
* route to the specified command handler. All requests hit the Worker
* simultaneously (Promise.all) to maximise cold-isolate spawning.
*
* Usage:
* node scripts/synthetic-burst.js \
* --url https://miti99bot.workers.dev \
* --secret <X-Telegram-Bot-Api-Secret-Token> \
* [--n 20] \
* [--cmd /wordle]
*
* No unit tests for this script — it is network-touching by design and only
* runs against a live deployed Worker. Tested manually pre-deploy.
*
* Abort guideline (debugger #2): if Atlas connection peak > 300/500 (60% cap),
* do NOT proceed with live deploy. Check Atlas UI during the 60s after the burst.
*/
// ── CLI arg parsing ──────────────────────────────────────────────────────────
/**
 * Parse the CLI flags accepted by this script.
 *
 * Recognised flags are `--url`, `--secret`, `--n` and `--cmd`; each one
 * consumes the token that follows it. Any other token is ignored.
 *
 * @param {string[]} argv - full argument vector (e.g. `process.argv`);
 *   scanning starts at index 2.
 * @returns {{ n: number, cmd: string, url?: string, secret?: string }}
 *   parsed options; `n` defaults to 20 and `cmd` to "/wordle".
 * @throws {RangeError} when the value given to `--n` is missing or is not a
 *   positive integer.
 */
function parseArgs(argv) {
  const args = { n: 20, cmd: "/wordle" };
  for (let i = 2; i < argv.length; i++) {
    const flag = argv[i];
    if (flag === "--url") args.url = argv[++i];
    else if (flag === "--secret") args.secret = argv[++i];
    else if (flag === "--cmd") args.cmd = argv[++i];
    else if (flag === "--n") {
      const raw = argv[++i];
      const count = Number.parseInt(raw, 10);
      // parseInt yields NaN for a missing or non-numeric value; zero and
      // negative counts cannot drive a burst either, so fail loudly here.
      if (!Number.isInteger(count) || count < 1) {
        throw new RangeError(`--n must be a positive integer, got: ${raw}`);
      }
      args.n = count;
    }
  }
  return args;
}
// ── Synthetic Telegram update payload ───────────────────────────────────────
/**
 * Assemble a small Telegram Update object describing a private-chat message
 * whose whole text is a single bot command.
 *
 * @param {string} cmd - command text, e.g. "/wordle"
 * @param {number} index - offset added to the base update_id and message_id
 *   so each payload in a burst carries distinct ids
 * @returns {object} the Update payload
 */
function buildUpdate(cmd, index) {
  const sender = { id: 1, is_bot: false, first_name: "Burst" };
  const chat = { id: 1, type: "private" };
  const nowSeconds = Math.floor(Date.now() / 1000);
  // A single entity spanning the entire text tags it as a bot command.
  const commandEntity = { type: "bot_command", offset: 0, length: cmd.length };

  const message = {
    message_id: 200000 + index,
    from: sender,
    chat,
    date: nowSeconds,
    text: cmd,
    entities: [commandEntity],
  };
  return { update_id: 100000 + index, message };
}
// ── Single request ───────────────────────────────────────────────────────────
/**
 * Deliver one synthetic update to the Worker's `/webhook` route and time it.
 *
 * Anything thrown while building or sending the request is reported as a
 * result with `status: 0` and the error's message instead of being rethrown.
 *
 * @param {string} url - Worker base URL (one trailing slash is tolerated)
 * @param {string} secret - value sent as X-Telegram-Bot-Api-Secret-Token
 * @param {object} update - Telegram Update payload, sent as the JSON body
 * @param {number} index - request index, echoed back in the result
 * @returns {Promise<{ index: number, status: number, ms: number, error?: string }>}
 */
async function sendUpdate(url, secret, update, index) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  // Trim a single trailing slash so the path is not doubled up.
  const endpoint = `${url.replace(/\/$/, "")}/webhook`;

  try {
    const init = {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Telegram-Bot-Api-Secret-Token": secret,
      },
      body: JSON.stringify(update),
    };
    const response = await fetch(endpoint, init);
    return { index, status: response.status, ms: elapsed() };
  } catch (err) {
    return { index, status: 0, ms: elapsed(), error: err.message };
  }
}
// ── Main ─────────────────────────────────────────────────────────────────────
/**
 * Nearest-rank percentile of an ascending-sorted sample.
 *
 * @param {number[]} sorted - values sorted ascending
 * @param {number} pct - whole-number percentile in 1..100 (e.g. 95)
 * @returns {number | undefined} the selected value, or undefined when the
 *   sample is empty
 */
function nearestRankPercentile(sorted, pct) {
  if (sorted.length === 0) return undefined;
  // Integer arithmetic (length * pct, then / 100) avoids the rounding error a
  // fractional multiplier such as 0.95 can introduce before the ceil.
  const rank = Math.max(1, Math.ceil((sorted.length * pct) / 100));
  return sorted[rank - 1];
}

/**
 * Script entry point: parse CLI flags, fire `n` synthetic updates in
 * parallel, then print a per-request log and a latency summary.
 *
 * Exits the process with code 1 when `--url` or `--secret` is missing, or
 * when the request count is not a positive integer.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv);
  if (!args.url || !args.secret) {
    console.error(
      "Usage: node scripts/synthetic-burst.js --url <url> --secret <token> [--n 20] [--cmd /wordle]",
    );
    process.exit(1);
  }
  const { url, secret, n, cmd } = args;
  // Guard locally as well: a NaN or non-positive count would produce an empty
  // burst and a meaningless summary.
  if (!Number.isInteger(n) || n < 1) {
    console.error(`--n must be a positive integer, got: ${n}`);
    process.exit(1);
  }

  // Apply the same trailing-slash trim sendUpdate uses, so the banner shows
  // the endpoint that is actually requested.
  const shownEndpoint = `${url.replace(/\/$/, "")}/webhook`;
  console.log(`Burst: ${n} parallel requests → ${shownEndpoint} cmd=${cmd}`);
  console.log("Starting at", new Date().toISOString());

  // Create every request before awaiting any of them so they run concurrently.
  const requests = Array.from({ length: n }, (_, i) =>
    sendUpdate(url, secret, buildUpdate(cmd, i), i),
  );
  const results = await Promise.all(requests);

  // ── Summary ──────────────────────────────────────────────────────────────
  let ok = 0;
  let fail = 0;
  let totalMs = 0;
  const statusCounts = {};
  for (const r of results) {
    // status 0 is the marker sendUpdate uses when the fetch itself threw.
    const statusKey = r.status === 0 ? "network-error" : String(r.status);
    statusCounts[statusKey] = (statusCounts[statusKey] ?? 0) + 1;
    totalMs += r.ms;
    if (r.status >= 200 && r.status < 300) ok++;
    else fail++;
    // Log individual result.
    const tag = r.error ? `ERROR(${r.error})` : `HTTP ${r.status}`;
    console.log(` [${r.index}] ${tag} ${r.ms}ms`);
  }

  const avgMs = Math.round(totalMs / results.length);
  const sortedMs = results.map((r) => r.ms).sort((a, b) => a - b);
  // Nearest-rank selection: with 20 samples p95 is the 19th value, not the max.
  const p50 = nearestRankPercentile(sortedMs, 50);
  const p95 = nearestRankPercentile(sortedMs, 95);

  console.log("\n── Summary ───────────────────────────────────────────");
  console.log(` Requests: ${n} | OK: ${ok} | Failed: ${fail}`);
  console.log(` Status counts: ${JSON.stringify(statusCounts)}`);
  console.log(` Latency — avg: ${avgMs}ms p50: ${p50}ms p95: ${p95}ms`);
  console.log("\nNext: check Atlas UI connection counter within 60s.");
  console.log("Abort if peak connections > 300/500 (60% of M0 cap).");
}
// Entry point: run the burst; any rejection from main() is logged and turned
// into a non-zero exit code.
function exitOnFatal(err) {
  console.error(err);
  process.exit(1);
}
main().catch(exitOnFatal);