Files
litellm/tests/logging_callback_tests/conftest.py
T
Cursor Agent b637d9f64a test(vcr): classify cache verdicts, detect live calls, surface cost leaks
Convert the per-test VCR verdict line from a single 'NOOP / HIT / MISS /
PARTIAL' tag into a classified outcome that distinguishes the cases that
silently bill the live API on every CI run from the ones that don't:

  HIT                         pure replay
  PARTIAL                     mixed replay + new recordings
  MISS:RECORDED               new cassette saved to Redis (cached next run)
  MISS:OVERFLOW               cassette > MAX_EPISODES_PER_CASSETTE; persister
                              refused to save; re-bills every run
  MISS:NOT_PERSISTED          test failed; save_cassette skipped; re-bills
  NOOP                        VCR-marked but no HTTP traffic (mocked elsewhere)
  UNMARKED:LIVE_CALL          test bypassed VCR AND opened a TCP connection
                              to a known LLM provider host -> wasted spend
  UNMARKED:NO_TRAFFIC         test bypassed VCR but didn't call out

The UNMARKED:LIVE_CALL signal is what converts 'this test probably hits
live' into 'this test connected to api.openai.com'. We install a
socket.connect / socket.create_connection wrapper for the duration of
each non-VCR-marked test and record any outbound TCP to a known LLM
provider hostname. The probe sits below the httpx layer so vcrpy and
respx (which both patch above the socket) are unaffected.

Replace the file-level _RESPX_CONFLICTING_FILES blacklists in the
llm_translation and local_testing conftests with per-item respx
detection in apply_vcr_auto_marker_to_items. A test now skips VCR when
it actually carries @pytest.mark.respx or has respx_mock in its fixture
chain - not just because some other test in the same file imports
MockRouter. Items skipped by skip_files are split into respx_conflict
(real conflict, the module wires up respx) vs file_opt_out (dead skip-
list entry whose module never touches respx) so the session summary
makes pruning obvious.

Stabilize the AWS SigV4 fingerprint: the Authorization header on
Bedrock requests rotates its Credential date and Signature on every
call, which previously pushed every Bedrock test past the 50-episode
overflow threshold. Extract the access-key id only
('aws-sigv4:AKIA...') so two requests with the same identity match.

Always emit verdict logging when VCR is active (set
LITELLM_VCR_VERBOSE=0 to opt back into the legacy quiet mode). Add a
session-end classification summary that lists overflow tests, unmarked
live-call tests, and the skip-reason breakdown.

Wire the live-call probe + summary hook into every test directory that
already uses the Redis-backed VCR cache (audio_tests, guardrails_tests,
image_gen_tests, litellm_utils_tests, llm_responses_api_testing,
llm_translation, local_testing, logging_callback_tests, ocr_tests,
pass_through_unit_tests, router_unit_tests, search_tests,
unified_google_tests).

Add tests/llm_translation/test_vcr_classification.py covering the
verdict classifier, skip-reason tagging, AWS SigV4 fingerprint stability,
live-host classification, and session summary rendering.

Co-authored-by: Mateo Wang <mateo-berri@users.noreply.github.com>
2026-05-13 00:31:47 +00:00

232 lines
6.8 KiB
Python

# conftest.py
#
# xdist-compatible test isolation for logging callback tests.
#
# Key design: capture litellm's true default values at conftest import time
# (BEFORE test modules are imported) so we can reset to clean defaults before
# each test. This is necessary because some test modules set module-level
# globals like `litellm.num_retries = 3` which pollute state for all tests
# in the same xdist worker.
import importlib
import os
import sys
import pytest
sys.path.insert(
0, os.path.abspath("../..")
)  # Adds the directory two levels above the current working directory to the system path
import litellm
from tests._vcr_conftest_common import ( # noqa: E402
VerboseReporterState,
apply_vcr_auto_marker_to_items,
emit_cassette_cache_session_banner,
emit_vcr_classification_summary,
install_live_call_probe,
record_vcr_outcome,
register_persister_if_enabled,
vcr_config_dict,
)
# vcrpy and respx both patch the httpx transport. When both are applied, one
# of them silently wins, so files that use respx are excluded from the VCR
# auto-marker (passed as part of ``skip_files`` during collection).
_RESPX_CONFLICTING_FILES = frozenset(
    {
        "test_assemble_streaming_responses.py",
        "test_langfuse_unit_tests.py",
    }
)

# Files where VCR replay breaks the test:
# - ``test_amazing_s3_logs.py``: vcrpy's boto3 stub intercepts a real S3
#   PUT/LIST round-trip the test asserts on, so the per-run id is never found.
_VCR_INCOMPATIBLE_FILES = frozenset({"test_amazing_s3_logs.py"})

# Node-id suffixes of individual tests to exclude from the auto-marker.
# Currently empty: no per-test opt-outs are defined in this directory.
_VCR_INCOMPATIBLE_NODEID_SUFFIXES: tuple[str, ...] = ()
# Module-level reporter state shared by the ``pytest_configure`` and
# ``pytest_runtest_logreport`` hooks defined in this file.
_verbose_state = VerboseReporterState()
@pytest.fixture(scope="module")
def vcr_config():
    """Module-scoped fixture exposing the shared VCR settings.

    Returns whatever ``vcr_config_dict()`` from the common VCR conftest helpers
    produces; the fixture adds nothing of its own.
    """
    shared_settings = vcr_config_dict()
    return shared_settings
def pytest_recording_configure(config, vcr):
    """Hook that hands the VCR instance to ``register_persister_if_enabled``.

    ``config`` is accepted to satisfy the hook signature but is not used here.
    Whether a persister is actually registered is decided inside the helper.
    """
    register_persister_if_enabled(vcr)
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach each phase report to its test item as ``item.rep_<when>``.

    Runs as a hook wrapper: it lets the other implementations build the report,
    then stores the result on the item under a name derived from ``rep.when``
    so later code holding the item can inspect the phase outcome.
    ``call`` is part of the hook signature and is not used here.
    """
    hook_result = yield
    report = hook_result.get_result()
    setattr(item, f"rep_{report.when}", report)
@pytest.fixture(autouse=True)
def _vcr_outcome_gate(request, vcr):
    """Autouse fixture bracketing every test with VCR bookkeeping.

    Before the test body runs, ``install_live_call_probe`` is invoked; after it
    finishes, ``record_vcr_outcome`` is invoked. Both receive the same
    ``request`` and ``vcr`` objects.
    """
    # Setup phase: install the probe before the test executes.
    install_live_call_probe(request, vcr)

    yield

    # Teardown phase: record what happened during the test.
    record_vcr_outcome(request, vcr)
def pytest_configure(config):
    """Pass the pytest ``config`` to the shared verbose-reporter state.

    Delegates to ``_verbose_state.remember_pluginmanager``.
    """
    _verbose_state.remember_pluginmanager(config)
def pytest_runtest_logreport(report):
    """Forward every test report to the verbose reporter.

    ``_verbose_state.maybe_emit_verdict`` decides whether anything is printed.
    """
    _verbose_state.maybe_emit_verdict(report)
# litellm attributes handled as lists: their defaults are copied when captured
# and again on every reset, so a test mutating one in place cannot alter the
# stored default.
_LIST_ATTRS = (
    "callbacks",
    "success_callback",
    "failure_callback",
    "_async_success_callback",
    "_async_failure_callback",
    "service_callback",
    "pre_call_rules",
    "post_call_rules",
)

# litellm attributes whose captured default is re-assigned as-is (no copy).
_SCALAR_ATTRS = (
    "set_verbose",
    "cache",
    "num_retries",
    "num_retries_per_request",
    "turn_off_message_logging",
    "redact_messages_in_exceptions",
    "redact_user_api_key_info",
    "s3_callback_params",
    "s3_audit_callback_params",
    "datadog_params",
    "vector_store_registry",
)
# ---- Snapshot litellm's defaults while this conftest is being imported ----
# Test modules have not been imported yet at this point, so none of their
# module-level mutations are reflected in the values recorded here.
_DEFAULTS: dict = {}

for _attr in _LIST_ATTRS:
    if not hasattr(litellm, _attr):
        continue
    _val = getattr(litellm, _attr)
    if isinstance(_val, list):
        # Store a copy so later in-place mutation of the live list cannot
        # change the recorded default.
        _DEFAULTS[_attr] = _val.copy()
    else:
        _DEFAULTS[_attr] = _val

for _attr in _SCALAR_ATTRS:
    if not hasattr(litellm, _attr):
        continue
    _DEFAULTS[_attr] = getattr(litellm, _attr)
def _reset_litellm_state(ll_logging, ll_audit_logs):
    """Flush litellm caches and restore tracked attributes from ``_DEFAULTS``.

    Shared by the setup and teardown halves of ``isolate_litellm_state`` so the
    two phases always perform exactly the same reset.

    Args:
        ll_logging: the ``litellm.litellm_core_utils.litellm_logging`` module.
        ll_audit_logs: the ``litellm.proxy.management_helpers.audit_logs``
            module.
    """
    # The client cache attribute is not assumed to exist, hence the guard.
    if hasattr(litellm, "in_memory_llm_clients_cache"):
        litellm.in_memory_llm_clients_cache.flush_cache()

    # Clear cached logger instances (LangsmithLogger, SlackAlerting, etc.)
    ll_logging._in_memory_loggers.clear()
    ll_audit_logs._audit_log_callback_cache.clear()

    # List-valued attributes get a fresh copy each time so that a test which
    # appends to e.g. ``litellm.callbacks`` never mutates the stored default.
    for attr in _LIST_ATTRS:
        if attr in _DEFAULTS:
            default = _DEFAULTS[attr]
            setattr(
                litellm, attr, default.copy() if isinstance(default, list) else default
            )

    # Remaining attributes are re-assigned to the captured value as-is.
    for attr in _SCALAR_ATTRS:
        if attr in _DEFAULTS:
            setattr(litellm, attr, _DEFAULTS[attr])


@pytest.fixture(scope="function", autouse=True)
def isolate_litellm_state():
    """
    Per-function isolation fixture.

    Resets litellm state to the true defaults captured at conftest import time,
    then restores after the test. This prevents module-level mutations (e.g.
    `litellm.num_retries = 3` at the top of test_langfuse_e2e_test.py) from
    leaking across tests within the same xdist worker.
    """
    # Imported here rather than at module top, matching the original layout.
    from litellm.litellm_core_utils import litellm_logging as ll_logging
    from litellm.proxy.management_helpers import audit_logs as ll_audit_logs

    # Setup: undo any module-level mutations made by test file imports.
    _reset_litellm_state(ll_logging, ll_audit_logs)

    yield

    # Teardown: reset back to defaults again (belt-and-suspenders).
    _reset_litellm_state(ll_logging, ll_audit_logs)
@pytest.fixture(scope="module", autouse=True)
def setup_and_teardown():
    """
    Module-scoped, autouse setup step.

    Puts the directory two levels above the working directory at the front of
    ``sys.path`` and, when the ``PYTEST_XDIST_WORKER`` environment variable is
    not set (single-process run), reloads ``litellm``, attempts to reload
    ``litellm.proxy.proxy_server`` if it is already loaded, and flushes the
    in-memory client cache. Under xdist the reload is skipped to avoid
    cross-worker interference. Nothing runs after the ``yield``.
    """
    sys.path.insert(0, os.path.abspath("../.."))

    import litellm

    # No worker id in the environment means we are not running under xdist.
    if "PYTEST_XDIST_WORKER" not in os.environ:
        importlib.reload(litellm)

        # Best-effort: a failing proxy_server reload is reported, not raised.
        try:
            proxy_server_loaded = hasattr(litellm, "proxy") and hasattr(
                litellm.proxy, "proxy_server"
            )
            if proxy_server_loaded:
                import litellm.proxy.proxy_server

                importlib.reload(litellm.proxy.proxy_server)
        except Exception as err:
            print(f"Error reloading litellm.proxy.proxy_server: {err}")

        if hasattr(litellm, "in_memory_llm_clients_cache"):
            litellm.in_memory_llm_clients_cache.flush_cache()

    yield
def pytest_collection_modifyitems(config, items):
    """Apply the VCR auto-marker, then reorder the collected tests in place.

    Files listed in ``_RESPX_CONFLICTING_FILES`` or ``_VCR_INCOMPATIBLE_FILES``
    and node ids ending in ``_VCR_INCOMPATIBLE_NODEID_SUFFIXES`` are passed to
    ``apply_vcr_auto_marker_to_items`` as exclusions.

    Afterwards ``items`` is rewritten so that tests whose parent name contains
    ``"custom_logger"`` come first and all other tests follow; each group is
    ordered by test name. ``config`` is not used.
    """
    excluded_files = _RESPX_CONFLICTING_FILES | _VCR_INCOMPATIBLE_FILES
    apply_vcr_auto_marker_to_items(
        items,
        skip_files=excluded_files,
        skip_nodeid_suffixes=_VCR_INCOMPATIBLE_NODEID_SUFFIXES,
    )

    def _ordering(test_item):
        # False sorts before True, so "custom_logger" tests form the leading
        # group; within a group the test name decides the order.
        not_custom_logger = "custom_logger" not in test_item.parent.name
        return (not_custom_logger, test_item.name)

    # sorted() is stable, so tests with equal keys keep their collected order.
    items[:] = sorted(items, key=_ordering)
def pytest_terminal_summary(terminalreporter, exitstatus, config):
    """Emit the VCR-related sections of the end-of-session terminal report.

    Calls ``emit_cassette_cache_session_banner`` followed by
    ``emit_vcr_classification_summary``, each with the terminal reporter.
    ``exitstatus`` and ``config`` are not used.
    """
    for emit_section in (
        emit_cassette_cache_session_banner,
        emit_vcr_classification_summary,
    ):
        emit_section(terminalreporter)