test(vcr): classify cache verdicts, detect live calls, surface cost leaks

Convert the per-test VCR verdict line from a single 'NOOP / HIT / MISS / PARTIAL' tag into a classified outcome that distinguishes the cases that silently bill the live API on every CI run from the ones that don't:

  HIT                  pure replay
  PARTIAL              mixed replay + new recordings
  MISS:RECORDED        new cassette saved to Redis (cached next run)
  MISS:OVERFLOW        cassette > MAX_EPISODES_PER_CASSETTE; persister refused to save; re-bills every run
  MISS:NOT_PERSISTED   test failed; save_cassette skipped; re-bills
  NOOP                 VCR-marked but no HTTP traffic (mocked elsewhere)
  UNMARKED:LIVE_CALL   test bypassed VCR AND opened a TCP connection to a known LLM provider host -> wasted spend
  UNMARKED:NO_TRAFFIC  test bypassed VCR but didn't call out

The UNMARKED:LIVE_CALL signal is what converts 'this test probably hits live' into 'this test connected to api.openai.com'. We install a socket.connect / socket.create_connection wrapper for the duration of each non-VCR-marked test and record any outbound TCP connection to a known LLM provider hostname. The probe sits below the httpx layer, so vcrpy and respx (which both patch above the socket) are unaffected.

Replace the file-level _RESPX_CONFLICTING_FILES blacklists in the llm_translation and local_testing conftests with per-item respx detection in apply_vcr_auto_marker_to_items. A test now skips VCR when it actually carries @pytest.mark.respx or has respx_mock in its fixture chain - not just because some other test in the same file imports MockRouter. Items skipped by skip_files are split into respx_conflict_module (real conflict, the module wires up respx) vs file_opt_out (dead skip-list entry whose module never touches respx) so the session summary makes pruning obvious.

Stabilize the AWS SigV4 fingerprint: the Authorization header on Bedrock requests rotates its Credential date and Signature on every call, which previously pushed every Bedrock test past the 50-episode overflow threshold. Extract the access-key id only ('aws-sigv4:AKIA...') so two requests with the same identity match.

Always emit verdict logging when VCR is active (set LITELLM_VCR_VERBOSE=0 to opt back into the legacy quiet mode). Add a session-end classification summary that lists overflow tests, unmarked live-call tests, and the skip-reason breakdown.

Wire the live-call probe + summary hook into every test directory that already uses the Redis-backed VCR cache (audio_tests, guardrails_tests, image_gen_tests, litellm_utils_tests, llm_responses_api_testing, llm_translation, local_testing, logging_callback_tests, ocr_tests, pass_through_unit_tests, router_unit_tests, search_tests, unified_google_tests).

Add tests/llm_translation/test_vcr_classification.py covering the verdict classifier, skip-reason tagging, AWS SigV4 fingerprint stability, live-host classification, and session summary rendering.

Co-authored-by: Mateo Wang <mateo-berri@users.noreply.github.com>
2026-08-02 12:21:10 +00:00 · 2026-05-13 00:31:47 +00:00
parent 0deffd3618
commit b637d9f64a
15 changed files with 1135 additions and 39 deletions
@@ -10,20 +10,22 @@ import hashlib
 import json
 import os
 import re
+import socket
 import sys
+from collections import defaultdict
 from typing import Iterable

 import pytest

 from tests._vcr_redis_persister import (
+    MAX_EPISODES_PER_CASSETTE,
+    VCR_VERBOSE_ENV,
    cassette_cache_capacity_snapshot,
    cassette_cache_health,
    filter_non_2xx_response,
-    format_vcr_verdict,
    make_redis_persister,
    mark_test_outcome_for_cassette,
    patch_vcrpy_aiohttp_record_path,
-    vcr_verbose_enabled,
 )

 CASSETTE_CACHE_HIGH_WATER_FRACTION = 0.85
@@ -231,6 +233,29 @@ def _iter_header_values(headers, name: str):
            yield value


+# Captures the access-key id from a SigV4 ``Authorization`` header, i.e. the
+# first ``/``-delimited field after ``Credential=``.
+_AWS_SIGV4_CREDENTIAL_RE = re.compile(
+    r"AWS4-HMAC-SHA256\s+Credential=([^/\s,]+)/", re.IGNORECASE
+)
+
+
+def _stable_key_value(header_name: str, raw: str) -> str:
+    """Reduce a credential header value to the part that is stable per identity.
+
+    Only ``Authorization`` headers (name compared case-insensitively) that
+    carry an AWS SigV4 credential are rewritten: they collapse to
+    ``aws-sigv4:<access-key-id>`` so the per-request date, scope and
+    signature do not leak into the fingerprint. Every other header value,
+    including non-SigV4 ``Authorization`` values such as Bearer tokens, is
+    returned unchanged.
+    """
+    if header_name.lower() == "authorization":
+        sigv4 = _AWS_SIGV4_CREDENTIAL_RE.search(raw)
+        if sigv4 is not None:
+            return f"aws-sigv4:{sigv4.group(1)}"
+    return raw
+
+
 def _compute_key_fingerprint(request) -> str:
    headers = getattr(request, "headers", None)
    parts: list[str] = []
@@ -242,7 +267,8 @@ def _compute_key_fingerprint(request) -> str:
            text = text.strip()
            if not text:
                continue
-            parts.append(f"{header_name}={text}")
+            stable = _stable_key_value(header_name, text)
+            parts.append(f"{header_name}={stable}")
    if not parts:
        return "no-key"
    digest = hashlib.sha256("\n".join(parts).encode("utf-8")).hexdigest()
@@ -470,6 +496,114 @@ def register_persister_if_enabled(vcr) -> None:
        _atexit_banner_registered = True


+# Name of the plain attribute that ``apply_vcr_auto_marker_to_items`` sets on
+# a collected item (via ``setattr``) to record why it was not auto-marked.
+# ``record_vcr_outcome`` reads it back from ``request.node``.
+VCR_SKIP_REASON_USER_ATTR = "vcr_skip_reason"
+
+# Marker reasons recorded per-item / per-test for the session summary.
+# respx_conflict: the item has the ``respx`` marker or the ``respx_mock`` fixture.
+SKIP_REASON_RESPX = "respx_conflict"
+# respx_conflict_module: file is in ``skip_files`` and its source mentions respx.
+SKIP_REASON_RESPX_MODULE = "respx_conflict_module"
+# incompatible: node id ends with one of ``skip_nodeid_suffixes``.
+SKIP_REASON_INCOMPATIBLE = "incompatible"
+# file_opt_out: file is in ``skip_files`` but its source never mentions respx.
+# Also the fallback ``record_vcr_outcome`` uses when no reason was tagged.
+SKIP_REASON_FILE_OPT_OUT = "file_opt_out"
+# disabled: ``vcr_disabled()`` was true at collection time.
+SKIP_REASON_DISABLED = "disabled"
+# already_marked: the item carried a ``vcr`` marker before auto-marking ran.
+SKIP_REASON_PRE_MARKED = "already_marked"
+
+# Hostnames we consider an "expensive live call" if a non-VCR-marked test
+# happens to hit them. Localhost/redis/databases are explicitly excluded.
+# Matching is ``str.endswith`` on the lower-cased host, so the leading dot
+# means only subdomains match (``api.openai.com``), not the bare apex.
+# NOTE(review): ``.googleapis.com`` already matches everything the two more
+# specific googleapis entries above it match.
+_LIVE_CALL_HOST_SUFFIXES = (
+    ".openai.com",
+    ".anthropic.com",
+    ".vertexai.googleapis.com",
+    ".aiplatform.googleapis.com",
+    ".googleapis.com",
+    ".bedrock-runtime.amazonaws.com",
+    ".x.ai",
+    ".cohere.ai",
+    ".cohere.com",
+    ".voyageai.com",
+    ".perplexity.ai",
+    ".mistral.ai",
+    ".groq.com",
+    ".huggingface.co",
+    ".azure.com",
+    ".tavily.com",
+    ".serper.dev",
+    ".searchapi.io",
+    ".firecrawl.dev",
+    ".exa.ai",
+)
+# Prefixes (``str.startswith`` on the lower-cased host) that are never
+# reported, checked before the suffix list.
+# NOTE(review): only 172.16.-172.20. are listed; the RFC 1918 block
+# 172.16.0.0/12 extends to 172.31. In practice a bare IP never ends with a
+# provider suffix, so it is rejected by the suffix check regardless.
+_LIVE_CALL_LOCAL_PREFIXES = (
+    "127.",
+    "localhost",
+    "::1",
+    "0.0.0.0",
+    "10.",
+    "172.16.",
+    "172.17.",
+    "172.18.",
+    "172.19.",
+    "172.20.",
+    "192.168.",
+)
+
+
+def _module_uses_respx(item) -> bool:
+    """Report whether the source file of ``item.module`` references respx usage.
+
+    This is a plain substring scan of the module's source text (comments
+    and strings included), not an import or AST check. It returns ``True``
+    when any of these appear: ``respx_mock``, ``@pytest.mark.respx``,
+    ``@respx.mock``, ``respx.mock`` or ``with respx``. A bare
+    ``from respx import MockRouter`` matches none of them.
+
+    Returns ``False`` when the item has no module, the module has no
+    ``__file__``, the path is not a regular file, or reading it raises
+    ``OSError``.
+    """
+    module_file = getattr(getattr(item, "module", None), "__file__", None)
+    if not module_file or not os.path.isfile(module_file):
+        return False
+    try:
+        with open(module_file, encoding="utf-8") as handle:
+            source_text = handle.read()
+    except OSError:
+        return False
+    needles = (
+        "respx_mock",
+        "@pytest.mark.respx",
+        "@respx.mock",
+        "respx.mock",
+        "with respx",
+    )
+    return any(needle in source_text for needle in needles)
+
+
+def _item_uses_respx(item) -> bool:
+    """Report whether this particular test item activates respx.
+
+    True when the item's closest ``respx`` marker exists, or when
+    ``respx_mock`` is listed in ``item.fixturenames`` (a missing or empty
+    ``fixturenames`` counts as no fixtures).
+    """
+    has_marker = item.get_closest_marker("respx") is not None
+    return has_marker or "respx_mock" in (getattr(item, "fixturenames", None) or ())
+
+
+# Memo of ``_module_uses_respx`` results keyed by ``str(item.path)``, so each
+# module's source is scanned once rather than once per collected item.
+_RESPX_MODULE_CACHE: dict[str, bool] = {}
+
+
+def _module_path_uses_respx(item) -> bool:
+    """Cached front-end for ``_module_uses_respx``.
+
+    Items without a usable ``path`` yield ``False`` and are not cached.
+    """
+    cache_key = str(getattr(item, "path", "") or "")
+    if not cache_key:
+        return False
+    known = _RESPX_MODULE_CACHE.get(cache_key)
+    if known is None:
+        known = _RESPX_MODULE_CACHE[cache_key] = _module_uses_respx(item)
+    return known
+
+
 def apply_vcr_auto_marker_to_items(
    items,
    *,
@@ -478,26 +612,232 @@ def apply_vcr_auto_marker_to_items(
 ) -> None:
    """Auto-apply ``pytest.mark.vcr`` to collected items.

-    ``skip_files`` are basenames to leave un-marked (e.g. respx-using
-    files, since respx and vcrpy both patch the httpx transport).
-    ``skip_nodeid_suffixes`` are node-id suffixes for individual tests
-    that depend on live cross-call provider state.
+    Skip semantics (in priority order):
+
+    1. ``vcr_disabled()`` — global env-var off-switch (``LITELLM_VCR_DISABLE=1``
+       or no ``CASSETTE_REDIS_URL``).
+    2. Item already carries ``@pytest.mark.vcr`` — leave it alone.
+    3. Item triggers respx (per-item marker / fixture) — vcrpy and respx
+       both patch the httpx transport so applying both makes one silently
+       no-op. We tag the item ``vcr_skip_reason=respx_conflict``.
+    4. File basename is in ``skip_files`` — the caller's per-file opt-out.
+       Tagged ``respx_conflict_module`` when the module's source references
+       respx, otherwise ``file_opt_out`` (a skip-list entry with no respx
+       usage, i.e. a candidate for pruning).
+    5. Node id ends with one of ``skip_nodeid_suffixes`` — used for tests
+       that observe live cross-call provider state (e.g. prompt-cache
+       warmup) which deterministic replay can't model. Tagged
+       ``incompatible``.
+
+    Each skipped item gets a ``vcr_skip_reason`` attribute so the
+    session-end summary can show why it isn't cached.
    """
    if vcr_disabled():
+        for item in items:
+            setattr(item, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_DISABLED)
        return
    skip_files = frozenset(skip_files)
    skip_nodeid_suffixes = tuple(skip_nodeid_suffixes)
    for item in items:
+        if item.get_closest_marker("vcr") is not None:
+            setattr(item, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_PRE_MARKED)
+            continue
+        if _item_uses_respx(item):
+            setattr(item, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_RESPX)
+            continue
        filename = os.path.basename(str(item.path))
        if filename in skip_files:
+            # Trust the caller's opt-out, but split by reason: if the
+            # module actually uses respx, label the conflict precisely so
+            # the summary surfaces dead respx imports vs. real conflicts.
+            if _module_path_uses_respx(item):
+                setattr(item, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_RESPX_MODULE)
+            else:
+                setattr(item, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_FILE_OPT_OUT)
            continue
        if any(item.nodeid.endswith(suffix) for suffix in skip_nodeid_suffixes):
-            continue
-        if item.get_closest_marker("vcr") is not None:
+            setattr(item, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_INCOMPATIBLE)
            continue
        item.add_marker(pytest.mark.vcr)


+# ---------------------------------------------------------------------------
+# Per-test stats accumulator + verdict classification.
+#
+# The session-end summary needs richer signal than the line-level verdict:
+# - which tests overflowed ``MAX_EPISODES_PER_CASSETTE`` (cassette refused
+#   to save → live calls every CI run);
+# - which tests fired live HTTP at a real LLM endpoint while VCR was not
+#   active for them (genuine wasted spend, not just "test mocked elsewhere");
+# - skip-reason buckets so we can tell respx-conflict from
+#   incompatible-by-design from "module imports respx but never uses it".
+# ---------------------------------------------------------------------------
+
+# Verdict tags used in the per-test logline AND in the session summary
+# breakdown.
+# HIT: cassette replayed at least one response and recorded nothing new.
+VERDICT_HIT = "VCR HIT"
+# MISS:RECORDED: nothing replayed, cassette is dirty (new recordings).
+VERDICT_MISS_RECORDED = "VCR MISS:RECORDED"
+# MISS:OVERFLOW: cassette length exceeds ``MAX_EPISODES_PER_CASSETTE``.
+VERDICT_MISS_OVERFLOW = "VCR MISS:OVERFLOW"
+# MISS:NOT_PERSISTED: substituted for MISS:RECORDED by ``record_vcr_outcome``
+# when the test did not pass.
+VERDICT_MISS_NOT_PERSISTED = "VCR MISS:NOT_PERSISTED"
+# PARTIAL: some responses replayed and the cassette is also dirty.
+VERDICT_PARTIAL = "VCR PARTIAL"
+# NOOP: VCR-marked test, nothing replayed and nothing recorded.
+VERDICT_NOOP_NO_TRAFFIC = "VCR NOOP"
+# UNMARKED:*: no cassette for the test; LIVE_CALL when the socket probe
+# captured at least one provider host, NO_TRAFFIC otherwise.
+VERDICT_UNMARKED_LIVE_CALL = "VCR UNMARKED:LIVE_CALL"
+VERDICT_UNMARKED_NO_TRAFFIC = "VCR UNMARKED:NO_TRAFFIC"
+# NOTE(review): not assigned by any code visible in this change.
+VERDICT_DISABLED = "VCR DISABLED"
+
+# Per-session stats. Cleared by ``_reset_session_stats`` for unit tests.
+# The ``defaultdict`` entries let ``record_vcr_outcome`` increment / append
+# without initialising keys first.
+_session_stats = {
+    "verdict_counts": defaultdict(int),
+    "overflow_tests": [],  # list of nodeids
+    "unmarked_live_call_tests": [],  # list of (nodeid, hosts)
+    "skip_reason_counts": defaultdict(int),
+    # reason -> nodeids; ``record_vcr_outcome`` keeps at most 5 per reason
+    "skip_reason_examples": defaultdict(list),
+}
+
+
+def _reset_session_stats() -> None:
+    """Empty every bucket of ``_session_stats`` in place.
+
+    The containers themselves are kept (only cleared), so references held
+    elsewhere keep pointing at the live buckets.
+    """
+    for bucket in _session_stats.values():
+        bucket.clear()
+
+
+def session_stats_snapshot() -> dict:
+    """Return a detached copy of the per-session VCR stats.
+
+    Counters become plain ``dict`` objects and every list is copied, so
+    mutating the result does not affect the live accumulator.
+    """
+    stats = _session_stats
+    examples_copy = {}
+    for reason, nodeids in stats["skip_reason_examples"].items():
+        examples_copy[reason] = list(nodeids)
+    return {
+        "verdict_counts": dict(stats["verdict_counts"]),
+        "overflow_tests": list(stats["overflow_tests"]),
+        "unmarked_live_call_tests": list(stats["unmarked_live_call_tests"]),
+        "skip_reason_counts": dict(stats["skip_reason_counts"]),
+        "skip_reason_examples": examples_copy,
+    }
+
+
+def _classify_marked_test(cassette) -> str:
+    """Translate a VCR-marked test's cassette state into a verdict tag.
+
+    Reads ``cassette.play_count`` (missing or falsy counts as 0),
+    ``cassette.dirty`` (missing counts as False) and ``len(cassette)``
+    when the object defines ``__len__``.
+    """
+    played = getattr(cassette, "play_count", 0) or 0
+    dirty = getattr(cassette, "dirty", False)
+    total = len(cassette) if hasattr(cassette, "__len__") else 0
+
+    # Overflow wins over every other state: it mirrors the
+    # ``> MAX_EPISODES_PER_CASSETTE`` guard in ``_RedisPersister.save_cassette``,
+    # which refuses to save such cassettes, so the test re-records every run.
+    if total > MAX_EPISODES_PER_CASSETTE:
+        return VERDICT_MISS_OVERFLOW
+
+    nothing_played = played == 0
+    if not dirty:
+        if nothing_played:
+            return VERDICT_NOOP_NO_TRAFFIC
+        if played > 0:
+            return VERDICT_HIT
+    elif nothing_played:
+        return VERDICT_MISS_RECORDED
+    return VERDICT_PARTIAL
+
+
+def _format_verdict_line(verdict: str, cassette, extra: str = "") -> str:
+    """Build the one-line verdict text for a test.
+
+    The line always starts with ``[<verdict>]``. When a cassette is given,
+    ``played=<n> entries=<m>`` follows; a non-empty ``extra`` is appended
+    last. Fields are separated by single spaces.
+    """
+    parts = [f"[{verdict}]"]
+    if cassette is not None:
+        played = getattr(cassette, "play_count", 0) or 0
+        total = len(cassette) if hasattr(cassette, "__len__") else 0
+        parts.append(f"played={played}")
+        parts.append(f"entries={total}")
+    if extra:
+        parts.append(extra)
+    return " ".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Live-call detection for tests that bypass VCR.
+#
+# When a test isn't VCR-marked (respx_conflict, incompatible, or just
+# plain unmarked), we wrap its socket calls inside the autouse
+# ``_vcr_outcome_gate`` fixture so we can flag any outbound TCP connection
+# to a known LLM provider. This converts "likely live call" into
+# "confirmed: this test connected to host X".
+# ---------------------------------------------------------------------------
+
+_LIVE_CALL_PROBE_INSTALLED = False
+_LIVE_CALL_BUFFER_KEY = "vcr_live_call_hosts"
+
+
+def _is_live_call_host(host: str) -> bool:
+    """Report whether ``host`` should be counted as a live provider call.
+
+    The host is lower-cased, rejected if it starts with any entry of
+    ``_LIVE_CALL_LOCAL_PREFIXES``, and otherwise accepted only if it ends
+    with an entry of ``_LIVE_CALL_HOST_SUFFIXES``. Empty hosts are rejected.
+    """
+    if not host:
+        return False
+    lowered = host.lower()
+    # str.startswith / str.endswith accept a tuple of candidates.
+    if lowered.startswith(_LIVE_CALL_LOCAL_PREFIXES):
+        return False
+    return lowered.endswith(_LIVE_CALL_HOST_SUFFIXES)
+
+
+class _LiveCallProbe:
+    """Context manager that records outbound connections to provider hosts.
+
+    While active it replaces ``socket.create_connection`` and
+    ``socket.socket.connect`` with thin wrappers. Each wrapper inspects the
+    target address, remembers the host in ``self.hosts`` (once per host)
+    when ``_is_live_call_host`` accepts it, and then delegates to the
+    original callable. Nothing is ever blocked or raised by the probe
+    itself: it is an observability aid, not a gate.
+
+    Patching happens at the socket level rather than in the HTTP client so
+    that it does not compete with vcrpy/respx, which patch above it.
+
+    NOTE(review): matching is by the string passed to connect; callers that
+    resolve DNS first and connect to a bare IP will not be recorded.
+    """
+
+    def __init__(self) -> None:
+        self.hosts: list[str] = []
+        self._orig_create_connection = None
+        self._orig_socket_connect = None
+
+    def _note_address(self, address) -> None:
+        """Best-effort: remember ``address``'s host if it is a provider host."""
+        try:
+            host = address[0] if isinstance(address, tuple) else None
+            if host and _is_live_call_host(host) and host not in self.hosts:
+                self.hosts.append(host)
+        except Exception:
+            # Bookkeeping must never break the connection being attempted.
+            pass
+
+    def __enter__(self):
+        self._orig_create_connection = socket.create_connection
+        self._orig_socket_connect = socket.socket.connect
+        probe = self
+
+        def _wrapped_create_connection(address, *args, **kwargs):
+            probe._note_address(address)
+            return probe._orig_create_connection(address, *args, **kwargs)
+
+        def _wrapped_socket_connect(sock_self, address):
+            probe._note_address(address)
+            return probe._orig_socket_connect(sock_self, address)
+
+        socket.create_connection = _wrapped_create_connection
+        socket.socket.connect = _wrapped_socket_connect
+        return self
+
+    def __exit__(self, *exc):
+        saved_create = self._orig_create_connection
+        saved_connect = self._orig_socket_connect
+        # Only restore what __enter__ actually captured.
+        if saved_create is not None:
+            socket.create_connection = saved_create
+        if saved_connect is not None:
+            socket.socket.connect = saved_connect
+        return False
+
+
+def vcr_outcome_logging_enabled() -> bool:
+    """Decide whether per-test verdict lines should be emitted.
+
+    Logging is on by default whenever VCR is active, because the verdict
+    line is the signal that shows whether a paid test ran against a real
+    provider. It is off when ``vcr_disabled()`` is true, or when the
+    ``LITELLM_VCR_VERBOSE`` environment variable is exactly ``"0"`` (the
+    legacy quiet mode).
+    """
+    return not vcr_disabled() and os.environ.get(VCR_VERBOSE_ENV) != "0"
+
+
 def record_vcr_outcome(request, vcr) -> None:
    """Call from the post-yield section of an autouse fixture per test."""
    cassette = vcr
@@ -507,10 +847,71 @@ def record_vcr_outcome(request, vcr) -> None:
    if cassette_path:
        mark_test_outcome_for_cassette(cassette_path, test_passed)

-    if not vcr_verbose_enabled():
+    nodeid = request.node.nodeid
+
+    if cassette is not None:
+        verdict = _classify_marked_test(cassette)
+        # Track overflow tests even when verbose logging is off — the
+        # session summary shows them either way.
+        if verdict == VERDICT_MISS_OVERFLOW:
+            _session_stats["overflow_tests"].append(nodeid)
+        if not test_passed and verdict == VERDICT_MISS_RECORDED:
+            verdict = VERDICT_MISS_NOT_PERSISTED
+        _session_stats["verdict_counts"][verdict] += 1
+        if vcr_outcome_logging_enabled():
+            line = _format_verdict_line(verdict, cassette)
+            request.node.user_properties.append(("vcr_verdict", line))
        return
-    verdict = format_vcr_verdict(cassette)
-    request.node.user_properties.append(("vcr_verdict", verdict))
+
+    # Cassette is None ⇒ test wasn't VCR-marked. Honor the skip reason
+    # we tagged at collection time, and pull live-call hosts captured by
+    # the socket probe (if any).
+    skip_reason = getattr(
+        request.node, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_FILE_OPT_OUT
+    )
+    _session_stats["skip_reason_counts"][skip_reason] += 1
+
+    hosts = getattr(request.node, _LIVE_CALL_BUFFER_KEY, []) or []
+    if hosts:
+        verdict = VERDICT_UNMARKED_LIVE_CALL
+        _session_stats["unmarked_live_call_tests"].append((nodeid, list(hosts)))
+        extra = f"reason={skip_reason} hosts={','.join(hosts)}"
+    else:
+        verdict = VERDICT_UNMARKED_NO_TRAFFIC
+        extra = f"reason={skip_reason}"
+
+    _session_stats["verdict_counts"][verdict] += 1
+
+    examples = _session_stats["skip_reason_examples"][skip_reason]
+    if len(examples) < 5:
+        examples.append(nodeid)
+
+    if vcr_outcome_logging_enabled():
+        request.node.user_properties.append(
+            ("vcr_verdict", _format_verdict_line(verdict, None, extra))
+        )
+
+
+def install_live_call_probe(request, vcr) -> "_LiveCallProbe | None":
+    """Activate the live-call socket probe for non-VCR-marked tests.
+
+    Call this from inside the per-test autouse ``_vcr_outcome_gate``
+    fixture *before* the ``yield``.
+
+    Args:
+        request: the pytest ``request`` of the calling fixture; used to
+            register teardown and to reach ``request.node``.
+        vcr: the test's cassette, or ``None`` when the test isn't
+            VCR-marked.
+
+    Returns:
+        The active ``_LiveCallProbe``, or ``None`` when no probe was
+        installed (the test is VCR-marked, or VCR is disabled).
+
+    Tests that *are* VCR-marked don't get the probe — vcrpy itself
+    intercepts above the socket layer, so any "outbound" socket would be
+    a recording cycle, not real spend.
+    """
+    if vcr is not None or vcr_disabled():
+        return None
+    probe = _LiveCallProbe()
+    probe.__enter__()
+    # Register the undo step right after patching, so the socket functions
+    # are restored even if a later statement in this function raises.
+    request.addfinalizer(lambda: probe.__exit__(None, None, None))
+    # Share the probe's own list (not a copy): hosts appended during the
+    # test are visible to ``record_vcr_outcome`` through ``request.node``.
+    setattr(request.node, _LIVE_CALL_BUFFER_KEY, probe.hosts)
+    return probe


 def _format_capacity_line(snapshot: dict) -> str:
@@ -525,6 +926,99 @@ def _format_capacity_line(snapshot: dict) -> str:
    )


+def emit_vcr_classification_summary(terminalreporter) -> None:
+    """Write the session-end VCR classification report to ``terminalreporter``.
+
+    Nothing is written when VCR is disabled, when running inside an xdist
+    worker (``PYTEST_XDIST_WORKER`` set), or when no verdicts were recorded.
+    Otherwise the report consists of, in order:
+
+    * **Verdict counts** — one line per verdict tag with a non-zero count.
+      OVERFLOW and UNMARKED:LIVE_CALL are the cost-leak signals.
+    * **Cassette overflow** — tests whose cassette exceeded
+      ``MAX_EPISODES_PER_CASSETTE``; the persister refuses to save these,
+      so they go live on every run. Only shown when non-empty.
+    * **Unmarked tests with live API calls** — tests that were not
+      VCR-marked and connected to a known provider host, with the hosts
+      seen. Only shown when non-empty.
+    * **Skip-reason breakdown** — count per skip reason, most frequent
+      first, each followed by its recorded example node ids. Only shown
+      when non-empty.
+    """
+    if vcr_disabled() or os.environ.get("PYTEST_XDIST_WORKER"):
+        return
+
+    stats = session_stats_snapshot()
+    verdict_counts = stats["verdict_counts"]
+    if not verdict_counts:
+        return
+
+    display_order = (
+        VERDICT_HIT,
+        VERDICT_PARTIAL,
+        VERDICT_MISS_RECORDED,
+        VERDICT_MISS_OVERFLOW,
+        VERDICT_MISS_NOT_PERSISTED,
+        VERDICT_NOOP_NO_TRAFFIC,
+        VERDICT_UNMARKED_NO_TRAFFIC,
+        VERDICT_UNMARKED_LIVE_CALL,
+    )
+    terminalreporter.write_sep("=", "VCR CACHE CLASSIFICATION SUMMARY", bold=True)
+    for tag in display_order:
+        tally = verdict_counts.get(tag, 0)
+        if tally:
+            terminalreporter.write_line(f"  [{tag}] {tally}")
+
+    overflowed = stats["overflow_tests"]
+    if overflowed:
+        overflow_title = (
+            f"CASSETTE OVERFLOW (>{MAX_EPISODES_PER_CASSETTE} episodes, save refused)"
+        )
+        terminalreporter.write_sep("-", overflow_title, red=True, bold=True)
+        terminalreporter.write_line(
+            "  These tests will hit the live provider on every CI run "
+            "because the persister won't save cassettes that grew past "
+            "the limit. Stabilize the request body (file handle consumed, "
+            "SigV4 timestamp, UUID, or boundary leak)."
+        )
+        for test_id in overflowed:
+            terminalreporter.write_line(f"    - {test_id}")
+
+    live_hits = stats["unmarked_live_call_tests"]
+    if live_hits:
+        terminalreporter.write_sep(
+            "-", "UNMARKED TESTS WITH LIVE API CALLS", red=True, bold=True
+        )
+        terminalreporter.write_line(
+            "  These tests connected to a real LLM provider host while "
+            "they were NOT VCR-marked. Either add @pytest.mark.vcr "
+            "explicitly, mock with respx, or move them off the "
+            "respx_conflict / incompatible skip list."
+        )
+        for test_id, seen_hosts in live_hits:
+            terminalreporter.write_line(f"    - {test_id}  →  {','.join(seen_hosts)}")
+
+    reason_counts = stats["skip_reason_counts"]
+    if reason_counts:
+        examples_by_reason = stats["skip_reason_examples"]
+        terminalreporter.write_sep("-", "SKIP-REASON BREAKDOWN", bold=True)
+        # Most frequent reason first; ties keep their insertion order.
+        ranked = sorted(reason_counts.items(), key=lambda kv: -kv[1])
+        for reason, tally in ranked:
+            terminalreporter.write_line(f"  {reason}: {tally}")
+            for example in examples_by_reason.get(reason, []):
+                terminalreporter.write_line(f"    - {example}")
+    terminalreporter.write_sep("=", bold=True)
+
+
 def emit_cassette_cache_session_banner(terminalreporter) -> None:
    """Call from ``pytest_terminal_summary``. No-op on xdist workers."""
    if vcr_disabled():
@@ -600,7 +1094,7 @@ class VerboseReporterState:
            return
        if os.environ.get("PYTEST_XDIST_WORKER"):
            return
-        if not vcr_verbose_enabled():
+        if not vcr_outcome_logging_enabled():
            return
        reporter = self.resolve_terminal_reporter()
        if reporter is None:
@@ -8,6 +8,9 @@ sys.path.insert(0, os.path.abspath("../.."))
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -34,6 +37,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -48,3 +52,8 @@ def pytest_runtest_logreport(report):

 def pytest_collection_modifyitems(config, items):
    apply_vcr_auto_marker_to_items(items)
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -19,6 +19,9 @@ import litellm
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -45,6 +48,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -151,3 +155,8 @@ def pytest_collection_modifyitems(config, items):

    # Reorder the items list
    items[:] = custom_logger_tests + other_tests
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -12,6 +12,9 @@ import litellm  # noqa: E402,F401
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -48,6 +51,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -62,3 +66,8 @@ def pytest_runtest_logreport(report):

 def pytest_collection_modifyitems(config, items):
    apply_vcr_auto_marker_to_items(items)
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -15,6 +15,9 @@ import litellm  # noqa: E402,F401
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -76,6 +79,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -107,3 +111,8 @@ def pytest_collection_modifyitems(config, items):

    # Reorder the items list
    items[:] = custom_logger_tests + other_tests
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -16,6 +16,9 @@ import litellm  # noqa: E402
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -42,6 +45,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -107,3 +111,8 @@ def pytest_collection_modifyitems(config, items):
    other_tests.sort(key=lambda x: x.name)

    items[:] = custom_logger_tests + other_tests
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -21,27 +21,20 @@ import litellm  # noqa: E402
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
 )

-# vcrpy and respx both patch the httpx transport — applying both makes one
-# silently win, so respx-using files opt out of the auto-marker.
-_RESPX_CONFLICTING_FILES = frozenset(
-    {
-        "test_gpt4o_audio.py",
-        "test_nvidia_nim.py",
-        "test_openai.py",
-        "test_openai_o1.py",
-        "test_prompt_caching.py",
-        "test_text_completion_unit_tests.py",
-        "test_xai.py",
-    }
-)
-_VCR_AUTO_MARKER_SKIP_FILES = _RESPX_CONFLICTING_FILES | frozenset(
-    {"test_vcr_redis_persister.py"}
-)
+# Per-item respx detection (``apply_vcr_auto_marker_to_items``) handles
+# the vast majority of respx-vs-vcrpy conflicts automatically. The only
+# entry below is the persister's own unit-test file, which exercises
+# ``save_cassette`` / ``load_cassette`` against fakeredis and must not
+# itself run under a live cassette context.
+_VCR_AUTO_MARKER_SKIP_FILES = frozenset({"test_vcr_redis_persister.py"})

 # Tests that observe live cross-call provider state (e.g. prompt-cache
 # warm-up between two consecutive calls); replay can't reproduce that state.
@@ -73,6 +66,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -85,6 +79,11 @@ def pytest_runtest_logreport(report):
    _verbose_state.maybe_emit_verdict(report)


+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
+
+
 # ---------------------------------------------------------------------------
 # Capture TRUE defaults at conftest import time (before test modules pollute).
 # ---------------------------------------------------------------------------
@@ -0,0 +1,497 @@
+"""Unit tests for the VCR classification + observability layer.
+
+Covers:
+- per-item respx detection (module scan, marker, fixture)
+- skip-reason tagging in ``apply_vcr_auto_marker_to_items``
+- verdict classification (HIT / MISS:RECORDED / MISS:OVERFLOW / MISS:NOT_PERSISTED /
+  PARTIAL / NOOP / UNMARKED:LIVE_CALL / UNMARKED:NO_TRAFFIC)
+- AWS SigV4 fingerprint stability
+- session-end summary rendering
+- live-call host classification
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from types import SimpleNamespace
+from typing import Optional
+
+import pytest
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+
+from tests._vcr_conftest_common import (  # noqa: E402
+    SKIP_REASON_FILE_OPT_OUT,
+    SKIP_REASON_INCOMPATIBLE,
+    SKIP_REASON_PRE_MARKED,
+    SKIP_REASON_RESPX,
+    SKIP_REASON_RESPX_MODULE,
+    VCR_SKIP_REASON_USER_ATTR,
+    VERDICT_HIT,
+    VERDICT_MISS_NOT_PERSISTED,
+    VERDICT_MISS_OVERFLOW,
+    VERDICT_MISS_RECORDED,
+    VERDICT_NOOP_NO_TRAFFIC,
+    VERDICT_PARTIAL,
+    VERDICT_UNMARKED_LIVE_CALL,
+    VERDICT_UNMARKED_NO_TRAFFIC,
+    _RESPX_MODULE_CACHE,
+    _classify_marked_test,
+    _compute_key_fingerprint,
+    _is_live_call_host,
+    _reset_session_stats,
+    _stable_key_value,
+    apply_vcr_auto_marker_to_items,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
+    record_vcr_outcome,
+    session_stats_snapshot,
+)
+
+
+# ---------------------------------------------------------------------------
+# Test doubles
+# ---------------------------------------------------------------------------
+
+
+class _StubItem:
+    """Pytest item double sufficient for the auto-marker logic."""
+
+    def __init__(
+        self,
+        nodeid: str,
+        path: str,
+        *,
+        markers: Optional[list[str]] = None,
+        fixturenames: Optional[list[str]] = None,
+        module=None,
+    ) -> None:
+        self.nodeid = nodeid
+        self.path = path
+        self._markers = list(markers or [])
+        self.fixturenames = list(fixturenames or [])
+        self.module = module
+        self.user_properties: list = []
+
+    def get_closest_marker(self, name: str):
+        return name if name in self._markers else None
+
+    def add_marker(self, marker):
+        # ``pytest.mark.vcr`` is a MarkDecorator; rely on its ``name``.
+        name = getattr(marker, "name", str(marker))
+        self._markers.append(name)
+
+
+@pytest.fixture
+def vcr_enabled(monkeypatch):
+    monkeypatch.setenv("CASSETTE_REDIS_URL", "redis://stub")
+    monkeypatch.delenv("LITELLM_VCR_DISABLE", raising=False)
+    monkeypatch.delenv("PYTEST_XDIST_WORKER", raising=False)
+
+
+@pytest.fixture(autouse=True)
+def _reset_module_caches():
+    _reset_session_stats()
+    _RESPX_MODULE_CACHE.clear()
+    yield
+    _reset_session_stats()
+    _RESPX_MODULE_CACHE.clear()
+
+
+# ---------------------------------------------------------------------------
+# AWS SigV4 fingerprint stability — the Bedrock cassette overflow root cause
+# ---------------------------------------------------------------------------
+
+
+def test_should_extract_only_aws_access_key_from_sigv4_authorization():
+    """Two Bedrock requests with the same access key but different
+    timestamps and signatures must produce the same fingerprint, otherwise
+    every CI run pushes a new episode into the cassette."""
+    auth_today = (
+        "AWS4-HMAC-SHA256 Credential=AKIAEXAMPLE12345/20260512/us-east-1/"
+        "bedrock/aws4_request, SignedHeaders=host;x-amz-date, "
+        "Signature=AAAAAAAA"
+    )
+    auth_tomorrow = (
+        "AWS4-HMAC-SHA256 Credential=AKIAEXAMPLE12345/20260513/us-east-1/"
+        "bedrock/aws4_request, SignedHeaders=host;x-amz-date, "
+        "Signature=BBBBBBBB"
+    )
+    today = _stable_key_value("Authorization", auth_today)
+    tomorrow = _stable_key_value("Authorization", auth_tomorrow)
+    assert today == tomorrow == "aws-sigv4:AKIAEXAMPLE12345"
+
+
+def test_should_keep_bearer_authorization_unchanged():
+    """OpenAI ``Bearer <key>`` headers are stable as-is — keep them."""
+    out = _stable_key_value("Authorization", "Bearer sk-1234")
+    assert out == "Bearer sk-1234"
+
+
+def test_should_produce_stable_fingerprint_across_sigv4_signatures():
+    """``_compute_key_fingerprint`` should not change when only the SigV4
+    signature/timestamp rotates."""
+    req_a = SimpleNamespace(
+        headers={
+            "authorization": (
+                "AWS4-HMAC-SHA256 Credential=AKIA1/20260101/us-east-1/"
+                "bedrock/aws4_request, SignedHeaders=host, Signature=AAA"
+            )
+        }
+    )
+    req_b = SimpleNamespace(
+        headers={
+            "authorization": (
+                "AWS4-HMAC-SHA256 Credential=AKIA1/20260512/us-east-1/"
+                "bedrock/aws4_request, SignedHeaders=host;x-amz-date, "
+                "Signature=ZZZ"
+            )
+        }
+    )
+    assert _compute_key_fingerprint(req_a) == _compute_key_fingerprint(req_b)
+
+
+def test_should_distinguish_different_aws_access_keys():
+    """Two different access keys must produce different fingerprints so
+    cassettes recorded under one identity never serve another."""
+    req_a = SimpleNamespace(
+        headers={
+            "authorization": "AWS4-HMAC-SHA256 Credential=AKIAONE/x/y/z/aws4_request, Signature=A"
+        }
+    )
+    req_b = SimpleNamespace(
+        headers={
+            "authorization": "AWS4-HMAC-SHA256 Credential=AKIATWO/x/y/z/aws4_request, Signature=A"
+        }
+    )
+    assert _compute_key_fingerprint(req_a) != _compute_key_fingerprint(req_b)
+
+
+# ---------------------------------------------------------------------------
+# Live-call host classification
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "host,expected",
+    [
+        ("api.openai.com", True),
+        ("api.anthropic.com", True),
+        ("bedrock-runtime.us-east-1.amazonaws.com", True),  # real regional Bedrock endpoint shape
+        ("api.us-east-1.bedrock-runtime.amazonaws.com", True),
+        ("foo.bar.openai.com", True),
+        ("127.0.0.1", False),
+        ("localhost", False),
+        ("10.0.0.1", False),
+        ("172.16.0.1", False),
+        ("redis.example.com", False),
+        ("", False),
+    ],
+)
+def test_should_classify_live_call_hosts(host, expected):
+    assert _is_live_call_host(host) is expected
+
+
+# ---------------------------------------------------------------------------
+# Verdict classification
+# ---------------------------------------------------------------------------
+
+
+def _cassette(played: int, dirty: bool, total: int):
+    class _Sized:
+        def __init__(self, n):
+            self.n = n
+            self.play_count = played
+            self.dirty = dirty
+
+        def __len__(self):
+            return self.n
+
+    return _Sized(total)
+
+
+def test_should_classify_pure_replay_as_hit():
+    assert (
+        _classify_marked_test(_cassette(played=3, dirty=False, total=3)) == VERDICT_HIT
+    )
+
+
+def test_should_classify_no_traffic_as_noop():
+    assert (
+        _classify_marked_test(_cassette(played=0, dirty=False, total=0))
+        == VERDICT_NOOP_NO_TRAFFIC
+    )
+
+
+def test_should_classify_pure_record_as_miss_recorded():
+    assert (
+        _classify_marked_test(_cassette(played=0, dirty=True, total=1))
+        == VERDICT_MISS_RECORDED
+    )
+
+
+def test_should_classify_mixed_replay_and_record_as_partial():
+    assert (
+        _classify_marked_test(_cassette(played=2, dirty=True, total=4))
+        == VERDICT_PARTIAL
+    )
+
+
+def test_should_classify_overflow_as_miss_overflow_regardless_of_play_state():
+    """Cassettes that exceed ``MAX_EPISODES_PER_CASSETTE`` (50) are
+    refused for save — they will hit live every CI run, so the verdict
+    must override HIT/PARTIAL classification."""
+    assert (
+        _classify_marked_test(_cassette(played=0, dirty=True, total=51))
+        == VERDICT_MISS_OVERFLOW
+    )
+    assert (
+        _classify_marked_test(_cassette(played=10, dirty=True, total=52))
+        == VERDICT_MISS_OVERFLOW
+    )
+
+
+# ---------------------------------------------------------------------------
+# apply_vcr_auto_marker_to_items: skip-reason tagging
+# ---------------------------------------------------------------------------
+
+
+def _make_module_with_source(tmp_path, src: str, name: str):
+    p = tmp_path / f"{name}.py"
+    p.write_text(src)
+    mod = SimpleNamespace(__file__=str(p))
+    return mod, str(p)
+
+
+def test_should_apply_vcr_marker_to_clean_test(vcr_enabled, tmp_path):
+    mod, p = _make_module_with_source(tmp_path, "def test_x(): pass\n", "clean")
+    item = _StubItem("clean.py::test_x", p, module=mod)
+    apply_vcr_auto_marker_to_items([item])
+    assert item.get_closest_marker("vcr") == "vcr"
+
+
+def test_should_skip_per_item_when_respx_marker_present(vcr_enabled, tmp_path):
+    mod, p = _make_module_with_source(tmp_path, "def test_x(): pass\n", "respx_marker")
+    item = _StubItem("respx_marker.py::test_x", p, markers=["respx"], module=mod)
+    apply_vcr_auto_marker_to_items([item])
+    assert item.get_closest_marker("vcr") is None
+    assert getattr(item, VCR_SKIP_REASON_USER_ATTR) == SKIP_REASON_RESPX
+
+
+def test_should_skip_per_item_when_respx_mock_fixture_present(vcr_enabled, tmp_path):
+    mod, p = _make_module_with_source(tmp_path, "def test_x(): pass\n", "respx_fixture")
+    item = _StubItem(
+        "respx_fixture.py::test_x", p, fixturenames=["respx_mock"], module=mod
+    )
+    apply_vcr_auto_marker_to_items([item])
+    assert item.get_closest_marker("vcr") is None
+    assert getattr(item, VCR_SKIP_REASON_USER_ATTR) == SKIP_REASON_RESPX
+
+
+def test_should_tag_pre_marked_items_so_summary_can_show_them(vcr_enabled, tmp_path):
+    mod, p = _make_module_with_source(tmp_path, "def test_x(): pass\n", "premarked")
+    item = _StubItem("premarked.py::test_x", p, markers=["vcr"], module=mod)
+    apply_vcr_auto_marker_to_items([item])
+    assert getattr(item, VCR_SKIP_REASON_USER_ATTR) == SKIP_REASON_PRE_MARKED
+
+
+def test_should_tag_skip_files_with_respx_module_when_module_actually_uses_respx(
+    vcr_enabled, tmp_path
+):
+    """A file in ``skip_files`` whose module *does* call respx should be
+    labeled as a real conflict (respx_conflict_module), not a dead opt-out."""
+    mod, p = _make_module_with_source(
+        tmp_path,
+        "import respx\n@pytest.mark.respx\ndef test_x(): pass\n",
+        "real_respx",
+    )
+    item = _StubItem("real_respx.py::test_x", p, module=mod)
+    apply_vcr_auto_marker_to_items([item], skip_files={"real_respx.py"})
+    assert getattr(item, VCR_SKIP_REASON_USER_ATTR) == SKIP_REASON_RESPX_MODULE
+
+
+def test_should_tag_skip_files_with_file_opt_out_when_module_does_not_use_respx(
+    vcr_enabled, tmp_path
+):
+    """A file in ``skip_files`` whose module never wires up respx is a
+    dead skip-list entry — surface it so we can prune."""
+    mod, p = _make_module_with_source(
+        tmp_path,
+        "from respx import MockRouter  # dead import\ndef test_x(): pass\n",
+        "dead_skip",
+    )
+    item = _StubItem("dead_skip.py::test_x", p, module=mod)
+    apply_vcr_auto_marker_to_items([item], skip_files={"dead_skip.py"})
+    assert getattr(item, VCR_SKIP_REASON_USER_ATTR) == SKIP_REASON_FILE_OPT_OUT
+
+
+def test_should_tag_nodeid_suffix_skips_as_incompatible(vcr_enabled, tmp_path):
+    mod, p = _make_module_with_source(tmp_path, "def test_x(): pass\n", "incompat")
+    item = _StubItem("incompat.py::test_prompt_caching", p, module=mod)
+    apply_vcr_auto_marker_to_items(
+        [item], skip_nodeid_suffixes=("::test_prompt_caching",)
+    )
+    assert getattr(item, VCR_SKIP_REASON_USER_ATTR) == SKIP_REASON_INCOMPATIBLE
+
+
+# ---------------------------------------------------------------------------
+# Session-end summary
+# ---------------------------------------------------------------------------
+
+
+class _FakeReporter:
+    def __init__(self):
+        self.lines: list[str] = []
+
+    def write_sep(self, sep, title="", **kwargs):
+        self.lines.append(f"=== {title}" if title else "===")
+
+    def write_line(self, line):
+        self.lines.append(line)
+
+    @property
+    def output(self):
+        return "\n".join(self.lines)
+
+
+def test_should_render_overflow_section_when_any_test_overflowed(vcr_enabled):
+    """The OVERFLOW section is the cost-leak signal: if it's empty, no
+    cassettes are silently being refused; if it's not empty, those tests
+    re-bill on every run."""
+    request = SimpleNamespace(
+        node=SimpleNamespace(
+            nodeid="t::overflow",
+            user_properties=[],
+            rep_call=SimpleNamespace(passed=True),
+        )
+    )
+    cassette = _cassette(played=0, dirty=True, total=51)
+    cassette._path = None  # avoid mark_test_outcome side-effects
+    record_vcr_outcome(request, cassette)
+
+    reporter = _FakeReporter()
+    emit_vcr_classification_summary(reporter)
+    assert "VCR CACHE CLASSIFICATION SUMMARY" in reporter.output
+    assert "VCR MISS:OVERFLOW" in reporter.output
+    assert "CASSETTE OVERFLOW" in reporter.output
+    assert "t::overflow" in reporter.output
+
+
+def test_should_render_unmarked_live_call_section_with_hosts(vcr_enabled):
+    request_node = SimpleNamespace(
+        nodeid="t::leak",
+        user_properties=[],
+        rep_call=SimpleNamespace(passed=True),
+    )
+    setattr(request_node, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_RESPX)
+    setattr(request_node, "vcr_live_call_hosts", ["api.openai.com"])
+    request = SimpleNamespace(node=request_node)
+
+    record_vcr_outcome(request, None)
+
+    snap = session_stats_snapshot()
+    assert snap["unmarked_live_call_tests"] == [("t::leak", ["api.openai.com"])]
+    assert snap["verdict_counts"][VERDICT_UNMARKED_LIVE_CALL] == 1
+
+    reporter = _FakeReporter()
+    emit_vcr_classification_summary(reporter)
+    assert "UNMARKED TESTS WITH LIVE API CALLS" in reporter.output
+    assert "api.openai.com" in reporter.output
+    assert "t::leak" in reporter.output
+
+
+def test_should_record_unmarked_no_traffic_when_test_skipped_vcr_but_did_not_call_out(
+    vcr_enabled,
+):
+    request_node = SimpleNamespace(
+        nodeid="t::clean_skip",
+        user_properties=[],
+        rep_call=SimpleNamespace(passed=True),
+    )
+    setattr(request_node, VCR_SKIP_REASON_USER_ATTR, SKIP_REASON_INCOMPATIBLE)
+    request = SimpleNamespace(node=request_node)
+
+    record_vcr_outcome(request, None)
+
+    snap = session_stats_snapshot()
+    assert snap["verdict_counts"][VERDICT_UNMARKED_NO_TRAFFIC] == 1
+    assert snap["skip_reason_counts"][SKIP_REASON_INCOMPATIBLE] == 1
+
+
+def test_should_demote_miss_recorded_to_not_persisted_when_test_failed(vcr_enabled):
+    """If a test failed, ``save_cassette`` skips persisting — that means
+    the next CI run will hit live again. The verdict must reflect that."""
+    request = SimpleNamespace(
+        node=SimpleNamespace(
+            nodeid="t::failed",
+            user_properties=[],
+            rep_call=SimpleNamespace(passed=False),
+        )
+    )
+    cassette = _cassette(played=0, dirty=True, total=1)
+    cassette._path = None
+    record_vcr_outcome(request, cassette)
+
+    snap = session_stats_snapshot()
+    assert snap["verdict_counts"].get(VERDICT_MISS_NOT_PERSISTED) == 1
+
+
+def test_should_emit_no_summary_when_no_tests_observed(vcr_enabled):
+    reporter = _FakeReporter()
+    emit_vcr_classification_summary(reporter)
+    assert reporter.output == ""
+
+
+# ---------------------------------------------------------------------------
+# Live-call probe
+# ---------------------------------------------------------------------------
+
+
+def test_should_skip_live_probe_when_vcr_active(vcr_enabled):
+    """When the test *is* VCR-marked (cassette truthy), we don't install
+    the probe — vcrpy intercepts above the socket layer, so any
+    'connection' would be vcrpy's own bookkeeping and not real spend."""
+    request = SimpleNamespace(node=SimpleNamespace(), addfinalizer=lambda fn: None)
+    fake_cassette = SimpleNamespace(play_count=0, dirty=False)
+    probe = install_live_call_probe(request, fake_cassette)
+    assert probe is None
+
+
+def test_live_call_probe_records_known_llm_hosts(vcr_enabled, monkeypatch):
+    """The probe should record outbound TCP connections to known LLM
+    provider hosts (and ignore localhost / RFC1918 / unknown hosts)."""
+    finalizers = []
+
+    class _Node:
+        pass
+
+    request = SimpleNamespace(
+        node=_Node(), addfinalizer=lambda fn: finalizers.append(fn)
+    )
+    probe = install_live_call_probe(request, None)
+    assert probe is not None
+
+    import socket
+
+    # Call the patched function directly. The probe records the host at
+    # the *call site* before delegating; the delegated original
+    # ``socket.create_connection`` gets a 1 ms timeout, so it is expected
+    # to fail without completing a connection, and we swallow that error.
+    try:
+        socket.create_connection(("api.openai.com", 443), timeout=0.001)
+    except Exception:
+        pass
+    try:
+        socket.create_connection(("127.0.0.1", 6379), timeout=0.001)
+    except Exception:
+        pass
+
+    # Restore via finalizers before asserting so the rest of the test
+    # session is unaffected.
+    for fn in finalizers:
+        fn()
+
+    hosts = getattr(request.node, "vcr_live_call_hosts", [])
+    assert "api.openai.com" in hosts
+    assert "127.0.0.1" not in hosts
@@ -25,20 +25,21 @@ import litellm
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
 )

-# vcrpy and respx both patch the httpx transport — applying both makes one
-# silently win, so respx-using files opt out of the auto-marker.
-_RESPX_CONFLICTING_FILES = frozenset(
-    {
-        "test_router.py",
-        "test_amazing_vertex_completion.py",
-        "test_azure_openai.py",
-    }
-)
+# Per-item respx detection (``apply_vcr_auto_marker_to_items``) auto-skips
+# tests whose ``@pytest.mark.respx`` marker or ``respx_mock`` fixture
+# would conflict with vcrpy's transport patch. We no longer maintain a
+# file-level ``_RESPX_CONFLICTING_FILES`` list here — the previous
+# entries (``test_router.py``) had only a stale ``from respx import
+# MockRouter`` import with no actual respx wiring, so file-level
+# blacklisting was masking valid cache opportunities.

 # Files where VCR replay breaks the test:
 # - ``test_assistants.py``: polls fresh per-session run IDs that no cassette
@@ -76,6 +77,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -88,6 +90,11 @@ def pytest_runtest_logreport(report):
    _verbose_state.maybe_emit_verdict(report)


+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
+
+
 # ---------------------------------------------------------------------------
 # Capture TRUE defaults at conftest import time.  This runs before any test
 # module's top-level code (e.g. `litellm.num_retries = 3`) executes, so
@@ -215,7 +222,7 @@ def setup_and_teardown():
 def pytest_collection_modifyitems(config, items):
    apply_vcr_auto_marker_to_items(
        items,
-        skip_files=_RESPX_CONFLICTING_FILES | _VCR_INCOMPATIBLE_FILES,
+        skip_files=_VCR_INCOMPATIBLE_FILES,
        skip_nodeid_suffixes=_VCR_INCOMPATIBLE_NODEID_SUFFIXES,
    )

@@ -22,6 +22,9 @@ import litellm
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -69,6 +72,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -220,3 +224,8 @@ def pytest_collection_modifyitems(config, items):

    # Reorder the items list
    items[:] = custom_logger_tests + other_tests
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -15,6 +15,9 @@ sys.path.insert(0, os.path.abspath("../.."))
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -41,6 +44,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -55,3 +59,8 @@ def pytest_runtest_logreport(report):

 def pytest_collection_modifyitems(config, items):
    apply_vcr_auto_marker_to_items(items)
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -8,6 +8,9 @@ sys.path.insert(0, os.path.abspath("../.."))
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -34,6 +37,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -48,3 +52,8 @@ def pytest_runtest_logreport(report):

 def pytest_collection_modifyitems(config, items):
    apply_vcr_auto_marker_to_items(items)
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -15,6 +15,9 @@ import litellm  # noqa: E402,F401
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -87,6 +90,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -114,3 +118,8 @@ def pytest_collection_modifyitems(config, items):

    # Reorder the items list
    items[:] = custom_logger_tests + other_tests
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -16,6 +16,9 @@ sys.path.insert(0, os.path.abspath("../.."))
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -42,6 +45,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -56,3 +60,8 @@ def pytest_runtest_logreport(report):

 def pytest_collection_modifyitems(config, items):
    apply_vcr_auto_marker_to_items(items)
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)
@@ -15,6 +15,9 @@ import litellm  # noqa: E402,F401
 from tests._vcr_conftest_common import (  # noqa: E402
    VerboseReporterState,
    apply_vcr_auto_marker_to_items,
+    emit_cassette_cache_session_banner,
+    emit_vcr_classification_summary,
+    install_live_call_probe,
    record_vcr_outcome,
    register_persister_if_enabled,
    vcr_config_dict,
@@ -74,6 +77,7 @@ def pytest_runtest_makereport(item, call):

@pytest.fixture(autouse=True)
 def _vcr_outcome_gate(request, vcr):
+    install_live_call_probe(request, vcr)
    yield
    record_vcr_outcome(request, vcr)

@@ -101,3 +105,8 @@ def pytest_collection_modifyitems(config, items):

    # Reorder the items list
    items[:] = custom_logger_tests + other_tests
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    emit_cassette_cache_session_banner(terminalreporter)
+    emit_vcr_classification_summary(terminalreporter)