Files
litellm/tests/test_litellm/proxy/test_proxy_logging_hook_detection.py
T
Yassin Kortam a6494e6fe3 perf: eliminate per-request callback scanning on proxy hot path (#27858)
- Introduce `_CallbackCapabilities` dataclass and `ProxyLogging._callback_capabilities()` static method that inspects `litellm.callbacks` once and caches capability flags keyed on (list length, member ids); the cache invalidates automatically when the callback list mutates, without adding per-request iteration overhead
- Replace O(n) `litellm.callbacks` walks in `async_pre_call_hook`, `during_call_hook`, `async_post_call_streaming_iterator_hook`, `async_post_call_streaming_hook`, and `post_call_response_headers_hook` with fast-path exits when no relevant callbacks are registered
- Add `needs_iterator_wrap()` and `needs_per_chunk_streaming_hook()` instance methods to decouple iterator-level wrapping from per-chunk hook execution; avoids `get_response_string` materialization per chunk when no guardrail or chunk-hook callback is active
- Introduce `_fast_serialize_simple_model_response_stream()` using `orjson` for common single-choice text streaming chunks, bypassing the full Pydantic serializer; falls back to `model_dump_json` for tool calls, logprobs, usage, and provider-specific fields
- Add early-return in `_restamp_streaming_chunk_model` when downstream model already matches the requested model, avoiding unnecessary string comparisons on every chunk
- Fix stale zero-cost cache bug in `_is_model_cost_zero`: move the per-router `_zero_cost_cache` dict onto the `Router` instance and clear it in `_invalidate_model_group_info_cache` so in-place pricing updates via `upsert_deployment` immediately resume budget enforcement
- Add `scripts/benchmark_chat_completions_perf.py`: standalone async benchmarking tool with a mock OpenAI provider, LiteLLM proxy process management, non-streaming RPS, streaming TTFT, and full-stream latency measurements with repeat/median run support
- Add comprehensive unit tests covering capability detection, cache invalidation, fast-path correctness, zero-cost cache regression, and the no-callback streaming fast path

Co-authored-by: Yassin Kortam <yassinkortam@g.ucla.edu>
2026-05-14 09:28:31 -07:00

129 lines
4.7 KiB
Python

import pytest
import litellm
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy.utils import ProxyLogging
def test_has_post_call_response_headers_callbacks_ignores_empty_callbacks(
    monkeypatch,
):
    """An empty ``litellm.callbacks`` list must report no response-headers callbacks."""
    no_callbacks = []
    monkeypatch.setattr(litellm, "callbacks", no_callbacks)

    detected = ProxyLogging.has_post_call_response_headers_callbacks()

    assert detected is False
def test_has_post_call_response_headers_callbacks_requires_override(
    monkeypatch,
):
    """Only a callback that *overrides* the response-headers hook flips the flag.

    A vanilla ``CustomLogger`` inherits the no-op response-headers hook, so
    the capability flag must stay False and the proxy can skip the headers
    loop entirely. A subclass that defines its own
    ``async_post_call_response_headers_hook`` must turn the flag True.
    """

    class _AddsHeaders(CustomLogger):
        async def async_post_call_response_headers_hook(self, **kwargs):
            return {"x-custom": "1"}

    # Phase 1: the base logger only carries the inherited no-op hook.
    vanilla_logger = CustomLogger()
    monkeypatch.setattr(litellm, "callbacks", [vanilla_logger])
    assert ProxyLogging.has_post_call_response_headers_callbacks() is False

    # Phase 2: a logger with its own hook implementation must be detected.
    overriding_logger = _AddsHeaders()
    monkeypatch.setattr(litellm, "callbacks", [overriding_logger])
    assert ProxyLogging.has_post_call_response_headers_callbacks() is True
def test_has_streaming_callbacks_uses_custom_logger_detection(monkeypatch):
    """``has_streaming_callbacks()`` is True only for loggers overriding the streaming hook.

    Checked in three phases: no callbacks, a plain ``CustomLogger``, and a
    subclass that defines ``async_post_call_streaming_hook``.
    """

    class StreamingLogger(CustomLogger):
        async def async_post_call_streaming_hook(self, **kwargs):
            return kwargs.get("response")

    # Nothing registered -> nothing to detect.
    monkeypatch.setattr(litellm, "callbacks", [])
    assert ProxyLogging.has_streaming_callbacks() is False

    # A base logger without its own streaming hook must not count.
    plain_logger = CustomLogger()
    monkeypatch.setattr(litellm, "callbacks", [plain_logger])
    assert ProxyLogging.has_streaming_callbacks() is False

    # A logger that overrides the streaming hook must be detected.
    streaming_logger = StreamingLogger()
    monkeypatch.setattr(litellm, "callbacks", [streaming_logger])
    assert ProxyLogging.has_streaming_callbacks() is True
def test_has_streaming_callbacks_detects_guardrails(monkeypatch):
    """A registered ``CustomGuardrail`` must make ``has_streaming_callbacks()`` True."""
    guardrail = CustomGuardrail()
    monkeypatch.setattr(litellm, "callbacks", [guardrail])

    detected = ProxyLogging.has_streaming_callbacks()

    assert detected is True
@pytest.mark.asyncio
async def test_post_call_response_headers_hook_returns_early_without_callbacks(
    monkeypatch,
):
    """With no callbacks registered, the response-headers hook yields an empty dict."""
    monkeypatch.setattr(litellm, "callbacks", [])

    logging_obj = ProxyLogging(user_api_key_cache={})  # type: ignore[arg-type]
    headers = await logging_obj.post_call_response_headers_hook(
        data={},
        user_api_key_dict=None,  # type: ignore[arg-type]
        response=None,
        request_headers={},
    )

    assert headers == {}
def test_callback_capabilities_skips_default_custom_logger(monkeypatch):
    """
    A subclass that overrides nothing must not register any capability.

    Internal proxy hooks (e.g. _PROXY_MaxBudgetLimiter, ManagedFiles) inherit
    the default ``async_post_call_streaming_iterator_hook`` body. The
    capability scanner must NOT report them as iterator overrides — wrapping
    the chunk stream through every no-op layer was responsible for ~10x
    streaming overhead on default deployments.
    """

    class _InternalNoopHook(CustomLogger):
        pass

    noop_hook = _InternalNoopHook()
    monkeypatch.setattr(litellm, "callbacks", [noop_hook])
    snapshot = ProxyLogging._callback_capabilities()

    # No iterator override may be recorded for a hook that only inherits
    # the base implementation.
    assert snapshot.iterator_overrides == ()

    # Every boolean capability checked here must stay False so the proxy
    # short-circuits the corresponding loops.
    expected_false_flags = (
        "has_post_call_response_headers",
        "has_iterator_override",
        "has_streaming_chunk_override",
        "has_guardrail",
    )
    for flag_name in expected_false_flags:
        assert getattr(snapshot, flag_name) is False
def test_callback_capabilities_captures_iterator_override(monkeypatch):
    """A callback overriding the streaming-iterator hook is recorded as an override.

    The capability snapshot must set ``has_iterator_override`` and list
    exactly one ``(callback, kind)`` entry whose callback is the registered
    instance and whose kind is ``"override"``.
    """

    class _OverridesIterator(CustomLogger):
        async def async_post_call_streaming_iterator_hook(  # type: ignore[override]
            self, user_api_key_dict, response, request_data
        ):
            # Pass every item through unchanged; only the override matters.
            async for item in response:
                yield item

    registered = _OverridesIterator()
    monkeypatch.setattr(litellm, "callbacks", [registered])

    snapshot = ProxyLogging._callback_capabilities()
    assert snapshot.has_iterator_override is True

    overrides = snapshot.iterator_overrides
    assert len(overrides) == 1

    callback, override_kind = overrides[0]
    assert callback is registered
    assert override_kind == "override"
def test_callback_capabilities_cache_invalidates_on_list_change(monkeypatch):
    """Changing the callback list must produce a fresh capability snapshot.

    The cache key includes (length, id-of-each-callback). Two kinds of change
    are exercised:

    1. Rebinding ``litellm.callbacks`` to a different list.
    2. Mutating the *same* list object in place (``append``), which is the
       case the length/ids key exists to catch and which a rebinding-only
       test would never reach.
    """
    monkeypatch.setattr(litellm, "callbacks", [])
    assert ProxyLogging._callback_capabilities().resolved_callbacks == ()

    class _OverridesPreCall(CustomLogger):
        async def async_pre_call_hook(self, *args, **kwargs):
            return kwargs.get("data")

    # Case 1: swap in a new list containing one overriding callback.
    pre = _OverridesPreCall()
    registered = [pre]
    monkeypatch.setattr(litellm, "callbacks", registered)
    caps = ProxyLogging._callback_capabilities()
    assert caps.has_pre_call_override is True
    assert pre in caps.resolved_callbacks

    # Case 2: mutate the same list object in place. A stale snapshot would
    # still describe only ``pre``; a fresh one must also include ``late``.
    late = _OverridesPreCall()
    registered.append(late)
    refreshed = ProxyLogging._callback_capabilities()
    assert refreshed.has_pre_call_override is True
    assert pre in refreshed.resolved_callbacks
    assert late in refreshed.resolved_callbacks