mirror of
https://github.com/tiennm99/litellm.git
synced 2026-06-17 14:48:44 +00:00
2eab9ee2c0
* perf: reduce per-request and per-chunk overhead across Anthropic streaming hot paths
- Introduce pure-text fast-path in `_build_complete_streaming_response` that collapses O(N) `content_block_delta` events into a single equivalent SSE event before conversion, eliminating per-output-token Pydantic `ModelResponseStream` construction; non-text streams (tool_use, thinking, citations) fall back to the unchanged legacy path
- Skip agentic streaming wrapper entirely when no callback overrides `async_should_run_agentic_loop`; the wrapper buffered every chunk and rebuilt the SSE response only to call hooks that all return `(False, {})` — a pure no-op for the default config
- Serialize request body once (`json.dumps`) for both the pre-call log input and the wire, instead of twice; avoids a full O(payload) scan per request, significant for long-context Claude Code histories
- Add fast path in `async_streaming_data_generator` that bypasses the per-chunk `async_post_call_streaming_hook` coroutine await, response-string materialization, and cost-injection call when no callback/guardrail/cost-injection is active (the default config)
- Resolve `_DD_STREAMING_TRACE_ENABLED` once at import time; eliminate per-chunk `NullSpan` context manager allocation when Datadog tracing is disabled (the default)
- Memoize `get_type_hints(AnthropicMessagesRequestOptionalParams)` with `@lru_cache(maxsize=1)` — resolves once per process instead of once per `/v1/messages` request (~80µs each)
- Hoist `cost_injection_active` out of the per-chunk loop in `chunk_processor`; eliminates repeated `getattr` + endpoint-type checks on every streamed byte chunk
- Extract `_build_passthrough_logging_result` from `_route_streaming_logging_to_handler` as a standalone static method to facilitate future off-loop dispatch
- Convert `async_sse_data_generator` from an `async for: yield` trampoline to a direct return of the underlying generator, removing one async-generator layer per streamed chunk
- Skip redundant `strip_empty_text_blocks_from_anthropic_messages` scan in `anthropic_messages_handler` when the async wrapper already sanitized (signalled via `_litellm_messages_presanitized` sentinel, popped before reaching provider params)
- Gate debug log `f-string` evaluation behind `isEnabledFor(DEBUG)` in both the streaming generator and the transformation layer to avoid serializing entire message payloads on every request at non-debug log levels
- Add benchmark script (`scripts/benchmark_anthropic_messages_perf.py`) with a local mock Anthropic SSE provider for reproducible TTFT and TPM measurement across commits/branches
- Add parity tests asserting fast-path and legacy-path produce byte-identical logged/billed payloads, plus unit tests for agentic hook detection, pre-serialized body reuse, and memoized key resolution
* perf: address greptile review for anthropic streaming hot path
- Bail to legacy in `_collapse_pure_text_chunks` when content_block_delta
events from different block indexes are observed without an intervening
flush. Anthropic sends blocks strictly sequentially, but defensive bail
prevents silent text-merging if the protocol ever interleaves.
- Replace leaf-class `__dict__` check for `async_post_call_streaming_hook`
in `_callback_capabilities` with a function-identity comparison that
walks the MRO. A vendor base class can carry the override and the
registered class can add nothing else; before this PR the hook was
unconditionally invoked, so an inherited-override miss would silently
drop the hook on the streaming path.
- Add unit tests for both behaviors.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* fix(mypy): narrow model_name to str in cost-injection branch
The hoisted cost_injection_active flag in chunk_processor encodes the
`bool(model_name)` requirement but mypy can't track that invariant
through the local, so the per-chunk `_process_chunk_with_cost_injection(
chunk, model_name)` calls flagged Optional[str] vs str. Pin a typed
non-None local inside the cost-injection branch so mypy narrows
correctly without changing runtime behavior.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: Yassin Kortam <yassinkortam@g.ucla.edu>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
151 lines
5.6 KiB
Python
151 lines
5.6 KiB
Python
import pytest
|
|
|
|
import litellm
|
|
from litellm.integrations.custom_guardrail import CustomGuardrail
|
|
from litellm.integrations.custom_logger import CustomLogger
|
|
from litellm.proxy.utils import ProxyLogging
|
|
|
|
|
|
def test_has_post_call_response_headers_callbacks_ignores_empty_callbacks(
    monkeypatch,
):
    """An empty callback list must leave the response-headers flag off."""
    monkeypatch.setattr(litellm, "callbacks", [])

    headers_flag = ProxyLogging.has_post_call_response_headers_callbacks()
    assert headers_flag is False
|
|
|
|
|
|
def test_has_post_call_response_headers_callbacks_requires_override(
    monkeypatch,
):
    """The response-headers flag flips only for callbacks that override the hook.

    A plain ``CustomLogger`` merely inherits the no-op response-headers hook,
    so the flag has to remain False (letting the proxy skip the headers loop).
    A subclass that defines its own hook must turn the flag on.
    """
    # Case 1: stock logger, hook is inherited and not overridden.
    stock_logger = CustomLogger()
    monkeypatch.setattr(litellm, "callbacks", [stock_logger])
    flag_for_stock = ProxyLogging.has_post_call_response_headers_callbacks()
    assert flag_for_stock is False

    class _HeaderInjector(CustomLogger):
        async def async_post_call_response_headers_hook(self, **kwargs):
            return {"x-custom": "1"}

    # Case 2: subclass that really overrides the hook.
    overriding_logger = _HeaderInjector()
    monkeypatch.setattr(litellm, "callbacks", [overriding_logger])
    flag_for_override = ProxyLogging.has_post_call_response_headers_callbacks()
    assert flag_for_override is True
|
|
|
|
|
|
def test_has_streaming_callbacks_uses_custom_logger_detection(monkeypatch):
    """Streaming detection is False for no/stock callbacks, True for an override."""
    # No callbacks registered at all.
    monkeypatch.setattr(litellm, "callbacks", [])
    without_callbacks = ProxyLogging.has_streaming_callbacks()
    assert without_callbacks is False

    # A stock CustomLogger does not define its own streaming hook.
    stock_logger = CustomLogger()
    monkeypatch.setattr(litellm, "callbacks", [stock_logger])
    with_stock_logger = ProxyLogging.has_streaming_callbacks()
    assert with_stock_logger is False

    class _ChunkHookLogger(CustomLogger):
        async def async_post_call_streaming_hook(self, **kwargs):
            return kwargs.get("response")

    # A subclass overriding the per-chunk streaming hook must be detected.
    chunk_logger = _ChunkHookLogger()
    monkeypatch.setattr(litellm, "callbacks", [chunk_logger])
    with_chunk_logger = ProxyLogging.has_streaming_callbacks()
    assert with_chunk_logger is True
|
|
|
|
|
|
def test_has_streaming_callbacks_detects_guardrails(monkeypatch):
    """A registered ``CustomGuardrail`` must switch streaming detection on."""
    guardrail = CustomGuardrail()
    monkeypatch.setattr(litellm, "callbacks", [guardrail])

    streaming_flag = ProxyLogging.has_streaming_callbacks()
    assert streaming_flag is True
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_post_call_response_headers_hook_returns_early_without_callbacks(
    monkeypatch,
):
    """With no callbacks registered the headers hook must yield an empty dict."""
    monkeypatch.setattr(litellm, "callbacks", [])
    logging_under_test = ProxyLogging(user_api_key_cache={})  # type: ignore[arg-type]

    collected_headers = await logging_under_test.post_call_response_headers_hook(
        data={},
        user_api_key_dict=None,  # type: ignore[arg-type]
        response=None,
        request_headers={},
    )

    assert collected_headers == {}
|
|
|
|
|
|
def test_callback_capabilities_skips_default_custom_logger(monkeypatch):
    """A subclass that overrides nothing must not register any capability.

    Internal proxy hooks (e.g. _PROXY_MaxBudgetLimiter, ManagedFiles) only
    inherit the default ``async_post_call_streaming_iterator_hook`` body, so
    the capability scanner must not count them as iterator overrides.
    Routing the chunk stream through each of those no-op layers was
    responsible for ~10x streaming overhead on default deployments.
    """

    class _PassThroughHook(CustomLogger):
        pass

    noop_hook = _PassThroughHook()
    monkeypatch.setattr(litellm, "callbacks", [noop_hook])

    snapshot = ProxyLogging._callback_capabilities()

    # Nothing is overridden, so each flag has to stay off; that is what lets
    # the proxy short-circuit the matching loops.
    assert snapshot.has_post_call_response_headers is False
    assert snapshot.iterator_overrides == ()
    assert snapshot.has_iterator_override is False
    assert snapshot.has_streaming_chunk_override is False
    assert snapshot.has_guardrail is False
|
|
|
|
|
|
def test_callback_capabilities_captures_iterator_override(monkeypatch):
    """An overridden iterator hook is recorded as ``(callback, "override")``."""

    class _IteratorHookLogger(CustomLogger):
        async def async_post_call_streaming_iterator_hook(  # type: ignore[override]
            self, user_api_key_dict, response, request_data
        ):
            # Plain pass-through: re-emit every chunk unchanged.
            async for chunk in response:
                yield chunk

    registered = _IteratorHookLogger()
    monkeypatch.setattr(litellm, "callbacks", [registered])

    snapshot = ProxyLogging._callback_capabilities()
    assert snapshot.has_iterator_override is True

    overrides = snapshot.iterator_overrides
    assert len(overrides) == 1

    first_entry = overrides[0]
    resolved_callback, override_kind = first_entry
    assert resolved_callback is registered
    assert override_kind == "override"
|
|
|
|
|
|
def test_callback_capabilities_detects_inherited_streaming_chunk_override(monkeypatch):
    """An ``async_post_call_streaming_hook`` override on a parent class counts.

    The override may live on an intermediate base (for example a vendor base
    class) while the registered leaf class adds nothing of its own. The hook
    used to be invoked unconditionally, so a detection that only inspected
    the leaf class ``__dict__`` would silently drop the inherited hook.
    """
    # Start from an empty capability cache before taking the snapshot.
    ProxyLogging._callback_capabilities_cache.clear()

    class _VendorBase(CustomLogger):
        async def async_post_call_streaming_hook(self, *args, **kwargs):  # type: ignore[override]
            return kwargs.get("response")

    class _EmptyLeaf(_VendorBase):
        pass

    leaf_callback = _EmptyLeaf()
    monkeypatch.setattr(litellm, "callbacks", [leaf_callback])

    snapshot = ProxyLogging._callback_capabilities()
    assert snapshot.has_streaming_chunk_override is True
|
|
|
|
|
|
def test_callback_capabilities_cache_invalidates_on_list_change(monkeypatch):
    """Changing the callback list must yield a new capability snapshot.

    The cache key includes (length, id-of-each-callback), so swapping the
    registered callbacks has to produce a fresh result.
    """
    # Baseline: nothing registered, nothing resolved.
    monkeypatch.setattr(litellm, "callbacks", [])
    empty_snapshot = ProxyLogging._callback_capabilities()
    assert empty_snapshot.resolved_callbacks == ()

    class _PreCallHookLogger(CustomLogger):
        async def async_pre_call_hook(self, *args, **kwargs):
            return kwargs.get("data")

    # Register a callback with a pre-call override and take a new snapshot.
    pre_call_logger = _PreCallHookLogger()
    monkeypatch.setattr(litellm, "callbacks", [pre_call_logger])

    refreshed_snapshot = ProxyLogging._callback_capabilities()
    assert refreshed_snapshot.has_pre_call_override is True
    assert pre_call_logger in refreshed_snapshot.resolved_callbacks
|