Files
tiennm99 55c873965c feat(observability): phase 06 — timing telemetry + soak analyzer + burst tester
Code prerequisites for the Phase 06 cold-start soak gate. The 24-72h soak
itself is operator-run; this commit ships the instrumentation + analysis
tools needed to make the PROCEED-or-PIVOT decision.

Telemetry
- src/util/timing.js: startTiming(cmd) returns {mark, end} that emits a
  structured cmd_timing log. takeColdFlag() returns {cold, isolateAgeMs}
  using a module-scoped boolean — first request in an isolate is cold,
  subsequent are warm. This replaces the originally-planned
  isolate_age_ms < 200ms classifier (broken because Mongo cold-connect
  itself is ~1500ms; cold requests would always bucket as warm —
  code-reviewer #11).
- src/util/request-context.js: setLastCold/getLastCold shared state
  bridges fetch-level cold detection into the dispatcher middleware
  without a circular import.
- src/index.js: takeColdFlag at the top of fetch() emits a request log
  and primes the request context for the dispatcher.
- src/modules/dispatcher.js: bot.use() middleware times every command.
  Chosen over per-handler wrapping to preserve the existing identity
  assertion in tests (handler === reg.allCommands.get(name).cmd.handler)
  — single instrumentation point, no contract change.

Soak tools (operator-run)
- scripts/analyze-soak.js: parses CF Logs export (NDJSON or CSV), filters
  cmd_timing events, computes p50/p95/p99 per (cmd, cold/warm). Counts
  dual-write secondary failures, mongo connection errors, CPU-time
  exceeded events. Writes markdown report.
- scripts/synthetic-burst.js: fires N parallel synthetic Telegram updates
  at the deployed Worker URL with cache-busting tokens. Used for the
  pre-deploy connection-cap stress test (debugger #2 — 20 parallel cold
  requests, abort if Atlas peak > 60% of 500-conn cap).
- package.json: analyze:soak + burst:synthetic scripts wired.

Tests
- tests/util/timing.test.js: 8 tests — timing semantics, cold flag flip.
- tests/scripts/analyze-soak.test.js: 22 tests — percentile math, NDJSON
  + CSV parse, aggregation, markdown formatting.

Tests: 667 → 697 (+30). Lint clean.

Operator runbook for Phase 06 (NOT executed by this commit):
1. Verify telemetry live via wrangler tail.
2. Run synthetic burst test: npm run burst:synthetic -- --url <prod>
3. Configure Atlas + CF Observability email alerts.
4. 24h soak (extend to 72h on stop-conditions per phase plan).
5. Daily npm run verify:mongo.
6. npm run analyze:soak -- --input <cf-logs.json> → soak-decision.md.
7. PROCEED to Phase 07 if cold-start p95 ≤ 2.5 × BASELINE_COLD_PING_MS;
   else execute phase-07-alt-pivot.md (Upstash standby).
2026-04-26 09:22:04 +07:00

197 lines
7.1 KiB
JavaScript

import { describe, expect, it } from "vitest";
import {
aggregateLines,
formatReport,
parseLogLine,
percentile,
} from "../../scripts/analyze-soak.js";
// ── percentile ───────────────────────────────────────────────────────────────
describe("percentile", () => {
  // Ascending integers 1..n — predictable input for percentile checks.
  const oneTo = (n) => Array.from({ length: n }, (_, i) => i + 1);

  it("returns 0 for empty array", () => {
    expect(percentile([], 50)).toBe(0);
  });

  it("returns the only element for a single-element array", () => {
    // Any percentile of a singleton collapses to that element.
    for (const p of [50, 95]) {
      expect(percentile([42], p)).toBe(42);
    }
  });

  it("p50 of [1,2,3,4,5] = 3", () => {
    expect(percentile(oneTo(5), 50)).toBe(3);
  });

  it("p100 returns last element", () => {
    expect(percentile([10, 20, 30], 100)).toBe(30);
  });

  it("p0 returns first element", () => {
    expect(percentile([10, 20, 30], 0)).toBe(10);
  });

  it("p95 of 100-element uniform distribution is near top", () => {
    // 1..100 uniform — nearest-rank p95 lands on the value 95.
    expect(percentile(oneTo(100), 95)).toBe(95);
  });
});
// ── parseLogLine ─────────────────────────────────────────────────────────────
describe("parseLogLine", () => {
  it("parses a plain NDJSON line", () => {
    const line = '{"event":"cmd_timing","cmd":"/wordle","total":120,"cold":true,"marks":[]}';
    expect(parseLogLine(line)).toEqual({
      event: "cmd_timing",
      cmd: "/wordle",
      total: 120,
      cold: true,
      marks: [],
    });
  });

  it("returns null for empty line", () => {
    expect(parseLogLine("")).toBeNull();
    expect(parseLogLine(" ")).toBeNull();
  });

  it("returns null for non-JSON plain text", () => {
    expect(parseLogLine("some random log line")).toBeNull();
  });

  it("extracts JSON embedded in a CSV row", () => {
    // Only the non-double-escaped CSV form is asserted here: a CF CSV export
    // that double-escapes the quoted JSON column is not guaranteed to parse,
    // so asserting on it would make the test flaky by design. (The previously
    // computed-but-unasserted `result` for that variant was dead code.)
    const simpleCsv =
      '2024-01-01,{"event":"cmd_timing","cmd":"/loldle","total":45,"cold":false,"marks":[]},extra';
    const result = parseLogLine(simpleCsv);
    expect(result).not.toBeNull();
    expect(result.event).toBe("cmd_timing");
    expect(result.cmd).toBe("/loldle");
    expect(result.total).toBe(45);
  });

  it("returns null for malformed JSON", () => {
    expect(parseLogLine("{bad json}")).toBeNull();
  });
});
// ── aggregateLines ────────────────────────────────────────────────────────────
describe("aggregateLines", () => {
  // Serialize one cmd_timing telemetry event the way the Worker logs it.
  const timingEvent = (cmd, total, cold) =>
    JSON.stringify({ event: "cmd_timing", cmd, total, cold, marks: [] });

  it("buckets cold and warm samples separately", () => {
    const coldSamples = [1500, 1400].map((t) => timingEvent("/wordle", t, true));
    const warmSamples = [50, 60, 55].map((t) => timingEvent("/wordle", t, false));
    const { buckets } = aggregateLines([...coldSamples, ...warmSamples], null);
    expect(buckets.get("/wordle|cold").n).toBe(2);
    expect(buckets.get("/wordle|warm").n).toBe(3);
  });

  it("filters to requested commands only", () => {
    const input = [timingEvent("/wordle", 100, false), timingEvent("/loldle", 200, false)];
    const { buckets } = aggregateLines(input, ["/wordle"]);
    expect(buckets.has("/wordle|warm")).toBe(true);
    expect(buckets.has("/loldle|warm")).toBe(false);
  });

  it("counts dual-write secondary failures", () => {
    // Both the structured and the plain-text failure formats must be counted.
    const { errors } = aggregateLines(
      [
        '{"msg":"[dual-kv] secondary write failed","phase":"dual-kv"}',
        "dual-write:secondary:failed — key wordle:state:1",
        timingEvent("/wordle", 80, false),
      ],
      null,
    );
    expect(errors.dualWriteFail).toBe(2);
  });

  it("counts mongo connection errors", () => {
    const { errors } = aggregateLines(
      ["MongoNetworkError: connection refused", "MongoError: timeout", timingEvent("/wordle", 80, false)],
      null,
    );
    expect(errors.mongoError).toBe(2);
  });

  it("counts CPU time exceeded errors", () => {
    const { errors } = aggregateLines(
      ["Worker exceeded CPU time limit", "cpu time exceeded for isolate"],
      null,
    );
    expect(errors.cpuExceeded).toBe(2);
  });

  it("skips non-cmd_timing JSON events", () => {
    const requestEvent = JSON.stringify({ event: "request", cold: true, path: "/webhook" });
    const { buckets } = aggregateLines([requestEvent, timingEvent("/wordle", 90, false)], null);
    expect(buckets.size).toBe(1); // only the cmd_timing entry
  });

  it("returns empty buckets and zero errors for empty input", () => {
    const { buckets, errors } = aggregateLines([], null);
    expect(buckets.size).toBe(0);
    expect(errors.dualWriteFail).toBe(0);
    expect(errors.mongoError).toBe(0);
    expect(errors.cpuExceeded).toBe(0);
  });
});
// ── formatReport ─────────────────────────────────────────────────────────────
describe("formatReport", () => {
  it("produces a markdown table with correct headers", () => {
    const lines = [
      JSON.stringify({ event: "cmd_timing", cmd: "/wordle", total: 1500, cold: true, marks: [] }),
      JSON.stringify({ event: "cmd_timing", cmd: "/wordle", total: 50, cold: false, marks: [] }),
    ];
    const { buckets, errors } = aggregateLines(lines, null);
    const report = formatReport(buckets, errors);
    expect(report).toContain("| cmd | cold/warm | n | p50 | p95 | p99 |");
    expect(report).toContain("/wordle");
    expect(report).toContain("cold");
    expect(report).toContain("warm");
  });

  it("includes error summary section", () => {
    const { buckets, errors } = aggregateLines([], null);
    errors.dualWriteFail = 3;
    const report = formatReport(buckets, errors);
    expect(report).toContain("## Error Summary");
    expect(report).toContain("Dual-write secondary failures: 3");
    expect(report).toContain("Mongo connection errors: 0");
    expect(report).toContain("CPU time exceeded: 0");
  });

  it("computes correct p50/p95/p99 for a known dataset", () => {
    // 10 warm samples: 10,20,...,100
    const lines = Array.from({ length: 10 }, (_, i) =>
      JSON.stringify({
        event: "cmd_timing",
        cmd: "/ping",
        total: (i + 1) * 10,
        cold: false,
        marks: [],
      }),
    );
    const { buckets, errors } = aggregateLines(lines, null);
    const report = formatReport(buckets, errors);
    // Nearest-rank percentiles of [10,20,...,100]:
    //   p50 → index ceil(0.50*10)-1 = 4 → 50
    //   p95 → index ceil(0.95*10)-1 = 9 → 100
    //   p99 → index ceil(0.99*10)-1 = 9 → 100
    expect(report).toContain("| /ping | warm | 10 |");
    // Previously only the sample count was asserted despite the test's name —
    // also pin the actual percentile columns of the /ping table row.
    const row = report.split("\n").find((l) => l.includes("/ping"));
    expect(row).toBeDefined();
    const [, cmd, bucket, n, p50, p95, p99] = row.split("|").map((c) => c.trim());
    expect(cmd).toBe("/ping");
    expect(bucket).toBe("warm");
    expect(n).toBe("10");
    expect(Number(p50)).toBe(50);
    expect(Number(p95)).toBe(100);
    expect(Number(p99)).toBe(100);
  });
});