Files
litellm/tests/proxy_unit_tests/test_realtime_cache.py
T
Alexsander Hamir eb5031da1e [Perf] Fix bottlenecks degrading realtime endpoint performance (#16670)
* Cache realtime websocket request body

Move the realtime request payload builder out of the websocket handler and wrap it with an LRU cache so repeated connections reuse the same bytes object. This keeps the JSON formatting cost down while bounding memory usage.

* Optimize realtime websocket caching

Refactored /v1/realtime to use cached helpers for both the JSON body and query params, introduced a reusable request-scope template, and optimized header handling to avoid redundant work.

* Refine realtime websocket header handling

* Reuse websocket scope headers in auth

* Refactor realtime request body helper

Move the realtime request body formatter into proxy common utils so it can be reused across modules. Reuse it in the websocket auth flow to share LRU caching and avoid ad hoc byte builders.

* fix: revert to old pattern

The old pattern was necessary; we can simply return the optimized function instead.

* Reuse SSL context for realtime

Create a shared SSLContext for OpenAI realtime websocket dials and pass it into websockets.connect so we stop re-reading verify paths on every session.

* feat: reuse shared TLS context for realtime websockets

- add `SHARED_REALTIME_SSL_CONTEXT` helper so all realtime websocket clients share the same TLS settings
- wire the shared context into OpenAI, Azure, custom HTTPX handlers, and realtime health checks
- update realtime tests to assert that the expected SSL context is passed to `websockets.connect`

This keeps TLS configuration consistent and avoids recreating SSL contexts per connection.

* Reuse HTTP SSL context for realtime

Remove the standalone realtime SSL helper, expose a shared context directly from the HTTP handler, and point all realtime websocket clients and tests to it. Add the websocket header comparison tool.

* Lazy-load shared realtime SSL context

Fix circular imports introduced by eagerly instantiating the shared TLS context. Make the HTTP handler lazily create the context and have realtime clients/tests fetch it on demand, keeping configuration consistent without breaking startup.

* add: unit test for realtime LRU caches

* fix: merge conflict with imports
2025-11-22 10:01:02 -08:00

63 lines
2.3 KiB
Python

from typing import Any, cast
import pytest
from litellm.proxy.common_utils.realtime_utils import _realtime_request_body
from litellm.proxy.proxy_server import _realtime_query_params_template
@pytest.fixture(autouse=True)
def clear_realtime_caches():
    """Reset both realtime helper caches around every test in this module.

    The caches are cleared once before the test body runs and once more
    afterwards, so entries created by one test are never visible to another.
    """
    cached_helpers = (_realtime_request_body, _realtime_query_params_template)

    for helper in cached_helpers:
        helper.cache_clear()

    yield

    # Teardown: drop whatever the test itself put into the caches.
    for helper in cached_helpers:
        helper.cache_clear()
def test_realtime_request_body_returns_immutable_bytes():
    """Item assignment on the cached request body must raise ``TypeError``."""
    body = _realtime_request_body("gpt-4o")
    # Cast to Any so static type checkers allow the deliberate mutation attempt.
    untyped_body = cast(Any, body)

    with pytest.raises(TypeError):
        untyped_body[0] = ord("x")
def test_realtime_query_params_template_returns_immutable_tuples():
    """Item assignment on the cached query-params value must raise ``TypeError``."""
    template = _realtime_query_params_template("gpt-4o", "intent-a")
    # Cast to Any so static type checkers allow the deliberate mutation attempt.
    untyped_template = cast(Any, template)

    with pytest.raises(TypeError):
        untyped_template[0] = ("model", "mutated")
def test_realtime_request_body_caches_each_model_separately():
    """Repeated calls for one model share an object; other models get their own."""
    first_call = _realtime_request_body("gpt-4o")
    repeat_call = _realtime_request_body("gpt-4o")
    mini_call = _realtime_request_body("gpt-4o-mini")

    # Same model -> the very same cached object is handed back.
    assert first_call is repeat_call

    # Each body carries the JSON payload for the model it was requested for.
    assert first_call == b'{"model": "gpt-4o"}'
    assert mini_call == b'{"model": "gpt-4o-mini"}'

    # Different model -> a distinct object.
    assert first_call is not mini_call
def test_realtime_query_params_template_caches_each_pair_separately():
    """Identical (model, intent) pairs share an object; differing pairs do not."""
    with_intent = _realtime_query_params_template("gpt-4o", "intent-a")
    with_intent_again = _realtime_query_params_template("gpt-4o", "intent-a")
    no_intent = _realtime_query_params_template("gpt-4o", None)

    # Same arguments -> the very same cached object is handed back.
    assert with_intent is with_intent_again

    # The intent pair is present when an intent is given and absent for None.
    expected_with_intent = (("model", "gpt-4o"), ("intent", "intent-a"))
    assert with_intent == expected_with_intent
    expected_no_intent = (("model", "gpt-4o"),)
    assert no_intent == expected_no_intent

    # Different arguments -> a distinct object.
    assert with_intent is not no_intent
def test_realtime_query_params_dict_copies_do_not_leak_state():
    """Mutating a dict built from the template must not affect later dicts."""
    mutated_copy = dict(_realtime_query_params_template("gpt-4o", "intent-a"))
    mutated_copy["new"] = "value"

    # A second dict built from the same template must not see the extra key.
    fresh_copy = dict(_realtime_query_params_template("gpt-4o", "intent-a"))
    expected = {"model": "gpt-4o", "intent": "intent-a"}

    assert "new" not in fresh_copy
    assert fresh_copy == expected