Files
litellm/tests/test_litellm/caching/test_llm_client_cache_e2e.py
T
Ishaan Jaff 503eb2fd4c fix: don't close HTTP/SDK clients on LLMClientCache eviction (#22925)
* fix: don't close HTTP/SDK clients on LLMClientCache eviction

Removing the _remove_key override that eagerly called aclose()/close()
on evicted clients. Evicted clients may still be held by in-flight
streaming requests; closing them causes:

  RuntimeError: Cannot send a request, as the client has been closed.

This is a regression from commit fb72979432. Clients that are no longer
referenced will be garbage-collected naturally. Explicit shutdown cleanup
happens via close_litellm_async_clients().

Fixes production crashes after the 1-hour cache TTL expires.

* test: update LLMClientCache unit tests for no-close-on-eviction behavior

Flip the assertions: evicted clients must NOT be closed. Replace
test_remove_key_closes_async_client → test_remove_key_does_not_close_async_client
and equivalents for sync/eviction paths.

Add test_remove_key_removes_plain_values for non-client cache entries.
Remove test_background_tasks_cleaned_up_after_completion (no more _background_tasks).
Remove test_remove_key_no_event_loop variant that depended on old behavior.

* test: add e2e tests for OpenAI SDK client surviving cache eviction

Add two new e2e tests using real AsyncOpenAI clients:
- test_evicted_openai_sdk_client_stays_usable: verifies size-based eviction
  doesn't close the client
- test_ttl_expired_openai_sdk_client_stays_usable: verifies TTL expiry
  eviction doesn't close the client

Both tests sleep after eviction so any create_task()-based close would
have time to run, making the regression detectable.

Also expand the module docstring to explain why the sleep is required.

* docs(AGENTS.md): add rule — never close HTTP/SDK clients on cache eviction

* docs(CLAUDE.md): add HTTP client cache safety guideline
2026-03-05 12:00:38 -08:00

128 lines
4.7 KiB
Python

"""e2e tests: httpx clients obtained via get_async_httpx_client must remain
usable after LLMClientCache evicts their cache entry.

The same guarantee is checked for OpenAI SDK clients (``AsyncOpenAI``) that
are stored in the cache directly.

These tests exist to prevent a recurring production crash:

    RuntimeError: Cannot send a request, as the client has been closed.

The bug occurs when LLMClientCache._remove_key() eagerly closes evicted
clients that are still referenced by in-flight requests. Every test here
sleeps after eviction to let the event loop drain any background close
tasks — a plain ``assert not client.is_closed`` without sleeping is NOT
sufficient to catch the regression (the close task runs asynchronously).

See: https://github.com/BerriAI/litellm/pull/22247
"""
import asyncio
import pytest
import litellm
from litellm.caching.llm_caching_handler import LLMClientCache
from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
@pytest.fixture(autouse=True)
def _tiny_client_cache(monkeypatch):
    """Install a single-slot ``LLMClientCache`` as litellm's global client cache.

    The cache is created with ``max_size_in_memory=1`` so that inserting a
    second entry forces the first one out. ``monkeypatch`` restores the
    original ``litellm.in_memory_llm_clients_cache`` when the test ends.

    Yields:
        The replacement ``LLMClientCache`` instance.
    """
    single_slot_cache = LLMClientCache(
        max_size_in_memory=1,
        default_ttl=600,
    )
    monkeypatch.setattr(
        litellm,
        "in_memory_llm_clients_cache",
        single_slot_cache,
    )
    yield single_slot_cache
@pytest.mark.asyncio
async def test_evicted_client_is_not_closed():
    """Check that a client evicted by a second insert is left open.

    Obtains ``client_a`` via ``get_async_httpx_client``, obtains a second
    client so the first cache entry is evicted (the cache holds one entry),
    waits briefly, and then verifies that ``client_a``'s underlying client
    has not been closed.
    """
    client_a = get_async_httpx_client(llm_provider="provider_a")
    client_b = None
    try:
        # This evicts client_a from cache (capacity=1)
        client_b = get_async_httpx_client(llm_provider="provider_b")

        # Sleep to let any background close tasks execute — without this
        # sleep, a regression that schedules close via create_task() would
        # go undetected.
        await asyncio.sleep(0.15)

        assert not client_a.client.is_closed, (
            "httpx client was closed on cache eviction — this causes "
            "'Cannot send a request, as the client has been closed' in production"
        )
    finally:
        # Release both clients even when the assertion above fails, so a
        # failing run does not leave open clients behind.
        await client_a.client.aclose()
        if client_b is not None:
            await client_b.client.aclose()
@pytest.mark.asyncio
async def test_expired_client_is_not_closed():
    """Check that a client whose cache entry expired via TTL is left open.

    Obtains a client via ``get_async_httpx_client``, rewrites the cache's
    TTL bookkeeping so the entry counts as expired, runs ``evict_cache()``,
    waits briefly, and then verifies the underlying client is not closed.
    """
    cache = litellm.in_memory_llm_clients_cache
    client = get_async_httpx_client(llm_provider="provider_ttl")
    try:
        # Force the entry to expire and trigger eviction
        for key in list(cache.ttl_dict):
            cache.ttl_dict[key] = 0
        # Also fix the heap entries so evict_cache finds them: each heap
        # item is rewritten with the same zero expiry as ttl_dict.
        cache.expiration_heap = [(0, k) for _, k in cache.expiration_heap]
        cache.evict_cache()

        # Give any background close task scheduled during eviction a chance
        # to run before inspecting the client.
        await asyncio.sleep(0.15)

        assert not client.client.is_closed, (
            "httpx client was closed on TTL expiry — this causes "
            "'Cannot send a request, as the client has been closed' in production"
        )
    finally:
        # Release the client even when the assertion above fails.
        await client.client.aclose()
@pytest.mark.asyncio
async def test_evicted_openai_sdk_client_stays_usable():
    """OpenAI/Azure SDK clients cached in LLMClientCache must remain usable
    after eviction.

    This is the exact production scenario: the proxy caches an AsyncOpenAI
    client, the TTL expires, a new request evicts the old entry, but a
    concurrent streaming request is still reading from it.

    Regression guard: if _remove_key ever calls client.close(), the
    underlying httpx client is closed and this test fails.
    """
    from openai import AsyncOpenAI

    cache = litellm.in_memory_llm_clients_cache
    client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1")
    try:
        cache.set_cache("openai-client", client, ttl=600)

        # Evict by inserting a second entry (max_size=1)
        cache.set_cache("filler", "x", ttl=600)

        # Precondition: the entry must really be gone, otherwise the
        # is_closed check below would pass without exercising eviction.
        assert cache.get_cache("openai-client") is None, (
            "expected 'openai-client' to be evicted by the second insert"
        )

        # Let the event loop drain any background close tasks
        await asyncio.sleep(0.15)

        # The SDK client's internal httpx client must still be open
        assert not client._client.is_closed, (
            "AsyncOpenAI client was closed on cache eviction — this causes "
            "'Cannot send a request, as the client has been closed' in production"
        )
    finally:
        # Release the SDK client even when an assertion above fails.
        await client.close()
@pytest.mark.asyncio
async def test_ttl_expired_openai_sdk_client_stays_usable():
    """AsyncOpenAI clients must stay open when their entry expires via TTL.

    Covers the other eviction path: the entry is marked expired and then
    looked up with ``get_cache``, which drops it. The client object held by
    the test must still have an open internal httpx client afterwards.
    """
    from openai import AsyncOpenAI

    cache = litellm.in_memory_llm_clients_cache
    client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1")
    try:
        cache.set_cache("openai-client", client, ttl=600)

        # Force TTL expiry: zero the expiry in both ttl_dict and the heap so
        # the two bookkeeping structures stay in agreement.
        for key in list(cache.ttl_dict):
            cache.ttl_dict[key] = 0
        cache.expiration_heap = [(0, k) for _, k in cache.expiration_heap]

        # get_cache calls evict_element_if_expired → _remove_key
        result = cache.get_cache("openai-client")
        assert result is None  # expired, so returns None

        # Let the event loop drain any background close tasks
        await asyncio.sleep(0.15)

        assert not client._client.is_closed, (
            "AsyncOpenAI client was closed on TTL expiry — this causes "
            "'Cannot send a request, as the client has been closed' in production"
        )
    finally:
        # Release the SDK client even when an assertion above fails.
        await client.close()