Files
litellm/tests/test_litellm/proxy/test_proxy_logging_hook_detection.py
T
Yassin Kortam 2eab9ee2c0 perf: reduce per-request and per-chunk overhead across Anthropic streaming hot paths (#28289)
* perf: reduce per-request and per-chunk overhead across Anthropic streaming hot paths

- Introduce pure-text fast-path in `_build_complete_streaming_response` that collapses O(N) `content_block_delta` events into a single equivalent SSE event before conversion, eliminating per-output-token Pydantic `ModelResponseStream` construction; non-text streams (tool_use, thinking, citations) fall back to the unchanged legacy path
- Skip agentic streaming wrapper entirely when no callback overrides `async_should_run_agentic_loop`; the wrapper buffered every chunk and rebuilt the SSE response only to call hooks that all return `(False, {})` — a pure no-op for the default config
- Serialize request body once (`json.dumps`) for both the pre-call log input and the wire, instead of twice; avoids a full O(payload) scan per request, significant for long-context Claude Code histories
- Add fast path in `async_streaming_data_generator` that bypasses the per-chunk `async_post_call_streaming_hook` coroutine await, response-string materialization, and cost-injection call when no callback/guardrail/cost-injection is active (the default config)
- Resolve `_DD_STREAMING_TRACE_ENABLED` once at import time; eliminate per-chunk `NullSpan` context manager allocation when Datadog tracing is disabled (the default)
- Memoize `get_type_hints(AnthropicMessagesRequestOptionalParams)` with `@lru_cache(maxsize=1)` — resolves once per process instead of once per `/v1/messages` request (~80µs each)
- Hoist `cost_injection_active` out of the per-chunk loop in `chunk_processor`; eliminates repeated `getattr` + endpoint-type checks on every streamed byte chunk
- Extract `_build_passthrough_logging_result` from `_route_streaming_logging_to_handler` as a standalone static method to facilitate future off-loop dispatch
- Convert `async_sse_data_generator` from an `async for: yield` trampoline to a direct return of the underlying generator, removing one async-generator layer per streamed chunk
- Skip redundant `strip_empty_text_blocks_from_anthropic_messages` scan in `anthropic_messages_handler` when the async wrapper already sanitized (signalled via `_litellm_messages_presanitized` sentinel, popped before reaching provider params)
- Gate debug log `f-string` evaluation behind `isEnabledFor(DEBUG)` in both the streaming generator and the transformation layer to avoid serializing entire message payloads on every request at non-debug log levels
- Add benchmark script (`scripts/benchmark_anthropic_messages_perf.py`) with a local mock Anthropic SSE provider for reproducible TTFT and TPM measurement across commits/branches
- Add parity tests asserting fast-path and legacy-path produce byte-identical logged/billed payloads, plus unit tests for agentic hook detection, pre-serialized body reuse, and memoized key resolution

* perf: address greptile review for anthropic streaming hot path

- Bail to legacy in `_collapse_pure_text_chunks` when content_block_delta
  events from different block indexes are observed without an intervening
  flush. Anthropic sends blocks strictly sequentially, but defensive bail
  prevents silent text-merging if the protocol ever interleaves.
- Replace leaf-class `__dict__` check for `async_post_call_streaming_hook`
  in `_callback_capabilities` with a function-identity comparison that
  walks the MRO. A vendor base class can carry the override and the
  registered class can add nothing else; before this PR the hook was
  unconditionally invoked, so an inherited-override miss would silently
  drop the hook on the streaming path.
- Add unit tests for both behaviors.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(mypy): narrow model_name to str in cost-injection branch

The hoisted cost_injection_active flag in chunk_processor encodes the
`bool(model_name)` requirement but mypy can't track that invariant
through the local, so the per-chunk `_process_chunk_with_cost_injection(
chunk, model_name)` calls flagged Optional[str] vs str. Pin a typed
non-None local inside the cost-injection branch so mypy narrows
correctly without changing runtime behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Yassin Kortam <yassinkortam@g.ucla.edu>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 12:15:59 -07:00

151 lines
5.6 KiB
Python

import pytest
import litellm
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy.utils import ProxyLogging
def test_has_post_call_response_headers_callbacks_ignores_empty_callbacks(
    monkeypatch,
):
    """An empty ``litellm.callbacks`` list must report no response-headers hook."""
    monkeypatch.setattr(litellm, "callbacks", [])

    headers_hook_present = ProxyLogging.has_post_call_response_headers_callbacks()

    assert headers_hook_present is False
def test_has_post_call_response_headers_callbacks_requires_override(
    monkeypatch,
):
    """Only a callback that *overrides* the response-headers hook flips the flag.

    A plain ``CustomLogger`` just inherits the no-op hook, so the capability
    flag is expected to stay False (letting the proxy skip the headers loop
    entirely). A subclass that defines its own
    ``async_post_call_response_headers_hook`` is expected to turn it True.
    """
    # Phase 1: an unmodified logger must not count as a headers callback.
    inherited_only = [CustomLogger()]
    monkeypatch.setattr(litellm, "callbacks", inherited_only)
    assert ProxyLogging.has_post_call_response_headers_callbacks() is False

    class _HeaderInjectingLogger(CustomLogger):
        async def async_post_call_response_headers_hook(self, **kwargs):
            return {"x-custom": "1"}

    # Phase 2: a logger carrying its own hook body must be detected.
    with_override = [_HeaderInjectingLogger()]
    monkeypatch.setattr(litellm, "callbacks", with_override)
    assert ProxyLogging.has_post_call_response_headers_callbacks() is True
def test_has_streaming_callbacks_uses_custom_logger_detection(monkeypatch):
    """Check ``has_streaming_callbacks`` for empty, default, and overriding lists.

    Expected results: False with no callbacks, False with a plain
    ``CustomLogger``, True once a subclass defines
    ``async_post_call_streaming_hook``.
    """

    def _install(callbacks):
        # Single place that swaps the global callback list for this test.
        monkeypatch.setattr(litellm, "callbacks", callbacks)

    _install([])
    assert ProxyLogging.has_streaming_callbacks() is False

    _install([CustomLogger()])
    assert ProxyLogging.has_streaming_callbacks() is False

    class _ChunkHookLogger(CustomLogger):
        async def async_post_call_streaming_hook(self, **kwargs):
            return kwargs.get("response")

    _install([_ChunkHookLogger()])
    assert ProxyLogging.has_streaming_callbacks() is True
def test_has_streaming_callbacks_detects_guardrails(monkeypatch):
    """A registered ``CustomGuardrail`` must make streaming callbacks report True."""
    guardrail = CustomGuardrail()
    monkeypatch.setattr(litellm, "callbacks", [guardrail])

    streaming_detected = ProxyLogging.has_streaming_callbacks()

    assert streaming_detected is True
@pytest.mark.asyncio
async def test_post_call_response_headers_hook_returns_early_without_callbacks(
    monkeypatch,
):
    """With no callbacks registered, the headers hook must yield an empty dict."""
    monkeypatch.setattr(litellm, "callbacks", [])

    # A bare dict stands in for the cache object; the test passes only
    # placeholder values (None / empty dicts) into the hook.
    logging_under_test = ProxyLogging(user_api_key_cache={})  # type: ignore[arg-type]

    extra_headers = await logging_under_test.post_call_response_headers_hook(
        data={},
        user_api_key_dict=None,  # type: ignore[arg-type]
        response=None,
        request_headers={},
    )

    assert extra_headers == {}
def test_callback_capabilities_skips_default_custom_logger(monkeypatch):
    """
    A subclass that overrides nothing must not register any capability.

    Internal proxy hooks (e.g. _PROXY_MaxBudgetLimiter, ManagedFiles) inherit
    the default ``async_post_call_streaming_iterator_hook`` body, so the
    capability scanner must NOT report them as iterator overrides. Wrapping the
    chunk stream through every no-op layer was responsible for ~10x streaming
    overhead on default deployments.
    """

    class _PassThroughHook(CustomLogger):
        pass

    monkeypatch.setattr(litellm, "callbacks", [_PassThroughHook()])
    snapshot = ProxyLogging._callback_capabilities()

    # Every hook here is the inherited base no-op, so each capability below
    # must stay off for the proxy to short-circuit the corresponding loop.
    assert snapshot.has_post_call_response_headers is False
    assert snapshot.iterator_overrides == ()
    for flag_name in (
        "has_iterator_override",
        "has_streaming_chunk_override",
        "has_guardrail",
    ):
        assert getattr(snapshot, flag_name) is False
def test_callback_capabilities_captures_iterator_override(monkeypatch):
    """A callback overriding the streaming iterator hook must be recorded.

    The capability snapshot is expected to flag the override and list exactly
    one ``(callback, kind)`` entry whose callback is the registered instance
    and whose kind is ``"override"``.
    """

    class _IteratorHookLogger(CustomLogger):
        async def async_post_call_streaming_iterator_hook(  # type: ignore[override]
            self, user_api_key_dict, response, request_data
        ):
            # Pass every chunk through untouched; only the override's
            # presence matters to this test.
            async for chunk in response:
                yield chunk

    registered = _IteratorHookLogger()
    monkeypatch.setattr(litellm, "callbacks", [registered])

    snapshot = ProxyLogging._callback_capabilities()

    assert snapshot.has_iterator_override is True
    assert len(snapshot.iterator_overrides) == 1

    captured_callback, captured_kind = snapshot.iterator_overrides[0]
    assert captured_callback is registered
    assert captured_kind == "override"
def test_callback_capabilities_detects_inherited_streaming_chunk_override(monkeypatch):
    """
    An ``async_post_call_streaming_hook`` override on a parent class must count.

    A vendor base class can carry the override while the registered class adds
    nothing else. Before this PR the hook was unconditionally invoked, so a
    leaf-class ``__dict__`` miss here would silently drop the inherited hook.
    """
    # Start from an empty capability cache so the snapshot below is computed
    # for the callbacks registered in this test.
    ProxyLogging._callback_capabilities_cache.clear()

    class _VendorBase(CustomLogger):
        async def async_post_call_streaming_hook(self, *args, **kwargs):  # type: ignore[override]
            return kwargs.get("response")

    class _EmptyLeaf(_VendorBase):
        pass

    leaf_instance = _EmptyLeaf()
    monkeypatch.setattr(litellm, "callbacks", [leaf_instance])

    snapshot = ProxyLogging._callback_capabilities()

    assert snapshot.has_streaming_chunk_override is True
def test_callback_capabilities_cache_invalidates_on_list_change(monkeypatch):
    """Changing the callback list must yield a fresh capability snapshot.

    The cache key includes (length, id-of-each-callback), so going from an
    empty list to a list holding a pre-call override has to be reflected in
    the next ``_callback_capabilities()`` result.
    """
    # Baseline: nothing registered, nothing resolved.
    monkeypatch.setattr(litellm, "callbacks", [])
    empty_snapshot = ProxyLogging._callback_capabilities()
    assert empty_snapshot.resolved_callbacks == ()

    class _PreCallLogger(CustomLogger):
        async def async_pre_call_hook(self, *args, **kwargs):
            return kwargs.get("data")

    # Register one callback and expect the new snapshot to pick it up.
    pre_call_logger = _PreCallLogger()
    monkeypatch.setattr(litellm, "callbacks", [pre_call_logger])
    refreshed_snapshot = ProxyLogging._callback_capabilities()

    assert refreshed_snapshot.has_pre_call_override is True
    assert pre_call_logger in refreshed_snapshot.resolved_callbacks