mirror of
https://github.com/tiennm99/litellm.git
synced 2026-06-17 10:47:56 +00:00
fix(tests): migrate realtime + rerank tests off shut-down upstream models (#28191)
* fix(tests): use gpt-realtime in realtime guardrails test
OpenAI shut down gpt-4o-realtime-preview-2024-12-17 on 2026-05-07, so
the live OpenAI realtime guardrails integration test now fails with
model_not_found (session.created never arrives, _wait_for_event times
out). Point OPENAI_REALTIME_URL at the current GA model, gpt-realtime.
Scope limited to this test: the pricing-catalog JSON keeps the retired
entries intentionally (historical cost calc + separate Azure timeline),
and the Azure realtime cost-calc test is unaffected.
* fix(tests): mock nvidia_nim rerank instead of hitting EOL'd endpoint
NVIDIA's hosted nvidia/llama-3.2-nv-rerankqa-1b-v2 rerank API reached
end-of-life on 2026-05-18 with no published replacement, so the live
BaseLLMRerankTest.test_basic_rerank for nvidia_nim now returns HTTP 410
("Gone"). NVIDIA's hosted catalog rotates on a schedule, so swapping in
another live model would only defer the failure.
Override test_basic_rerank in TestNvidiaNim to mock the sync/async HTTP
transport (same pattern as test_nvidia_nim_rerank_ranking_endpoint in this
file) and inject a fake NVIDIA_NIM_API_KEY via monkeypatch. The
request/response transformation and cost calculation stay covered offline.
Scope limited to nvidia_nim; other BaseLLMRerankTest providers untouched.
* fix(tests): migrate remaining realtime tests off shut-down gpt-4o-realtime-preview
OpenAI's 2026-05-07 shutdown removed the entire gpt-4o-realtime-preview
family, including the undated 'gpt-4o-realtime-preview' alias (not just the
dated snapshot fixed earlier). Three live tests still connected with the
dead alias and failed with messages_received=1 (an error event instead of
session.created):
- test_openai_realtime_simple.py: get_model() -> gpt-realtime (drives
TestOpenAIRealtime.test_realtime_connection / test_realtime_with_query_params)
- test_openai_realtime.py: test_openai_realtime_direct_call_no_intent and
test_openai_realtime_direct_call_with_intent -> openai/gpt-realtime
(the with_intent test shares the same dead alias even though it was not
in the failing set this run)
Mocked unit tests (test_realtime_query_params_construction,
test_realtime_query_params_use_normalized_model_name) are left as-is: they
never hit the network and assert string plumbing only.
Also fixes test_text_message_blocked_by_guardrail_no_ai_response, which now
connects (the earlier URL swap worked) but trips an assertion that is brittle
to the model's exact wording. The guardrail flow asks the model to voice the
block message verbatim; gpt-4o-realtime-preview complied (output contained
'blocked'), whereas gpt-realtime refuses verbatim-repeat instructions ('I'm sorry, but I can't
repeat that message.'). Since the original user message is blocked before
it reaches OpenAI, the refusal is still a safe outcome. Assertion #3 now
accepts both voicing and refusal, and adds a hard check that the blocked
phrase never leaks into AI output.
This commit is contained in:
@@ -101,7 +101,9 @@ async def test_openai_realtime_direct_call_no_intent():
|
||||
|
||||
try:
|
||||
await litellm._arealtime(
|
||||
model="openai/gpt-4o-realtime-preview",
|
||||
# OpenAI shut down the gpt-4o-realtime-preview family (incl. the
|
||||
# undated alias) on 2026-05-07; gpt-realtime is the GA successor.
|
||||
model="openai/gpt-realtime",
|
||||
websocket=websocket_client,
|
||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||
timeout=60,
|
||||
@@ -249,14 +251,16 @@ async def test_openai_realtime_direct_call_with_intent():
|
||||
websocket_client = RealTimeWebSocketClient()
|
||||
caught_exception = None
|
||||
|
||||
# OpenAI shut down the gpt-4o-realtime-preview family (incl. the undated
|
||||
# alias) on 2026-05-07; gpt-realtime is the GA successor.
|
||||
query_params: RealtimeQueryParams = {
|
||||
"model": "openai/gpt-4o-realtime-preview",
|
||||
"model": "openai/gpt-realtime",
|
||||
"intent": "chat",
|
||||
}
|
||||
|
||||
try:
|
||||
await litellm._arealtime(
|
||||
model="openai/gpt-4o-realtime-preview",
|
||||
model="openai/gpt-realtime",
|
||||
websocket=websocket_client,
|
||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||
query_params=query_params,
|
||||
|
||||
@@ -21,7 +21,10 @@ class TestOpenAIRealtime(BaseRealtimeTest):
|
||||
"""
|
||||
|
||||
def get_model(self) -> str:
|
||||
return "gpt-4o-realtime-preview"
|
||||
# OpenAI shut down the entire gpt-4o-realtime-preview family
|
||||
# (including the undated alias) on 2026-05-07. gpt-realtime is the
|
||||
# current GA realtime model.
|
||||
return "gpt-realtime"
|
||||
|
||||
def get_api_key_env_var(self) -> str:
|
||||
return "OPENAI_API_KEY"
|
||||
|
||||
@@ -26,9 +26,7 @@ from litellm.litellm_core_utils.realtime_streaming import RealTimeStreaming
|
||||
from litellm.types.guardrails import GuardrailEventHooks
|
||||
|
||||
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
||||
OPENAI_REALTIME_URL = (
|
||||
"wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17"
|
||||
)
|
||||
OPENAI_REALTIME_URL = "wss://api.openai.com/v1/realtime?model=gpt-realtime"
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not OPENAI_API_KEY,
|
||||
@@ -192,10 +190,35 @@ async def test_text_message_blocked_by_guardrail_no_ai_response():
|
||||
len(transcript_deltas) >= 1
|
||||
), f"Expected guardrail message in transcript delta, got: {event_types}"
|
||||
|
||||
# 3. No *real* AI response should have been generated.
|
||||
# The guardrail may produce its own response (e.g. "Content blocked: ...")
|
||||
# via response.cancel + conversation.item.create + response.create.
|
||||
# We allow the guardrail's own block message but NOT original AI content.
|
||||
# 3. No *real* AI response to the blocked content should have been
|
||||
# generated. The original user message is blocked BEFORE it is
|
||||
# forwarded to OpenAI, so the only thing the model ever sees is the
|
||||
# guardrail's "say exactly: <block message>" prompt
|
||||
# (see realtime_streaming.py). Two safe outcomes are possible:
|
||||
# - the model voices the block message verbatim (older realtime
|
||||
# snapshots did this -> text contains "blocked"), or
|
||||
# - the model declines to repeat it (gpt-realtime tends to refuse
|
||||
# verbatim-repeat instructions, e.g. "I'm sorry, but I can't
|
||||
# repeat that message.").
|
||||
# Both mean the blocked prompt itself was never answered, so we
|
||||
# accept either. The hard invariant is that the blocked phrase must
|
||||
# never leak into AI output, and the model must not have produced a
|
||||
# normal answer to the user (which would have neither a block nor a
|
||||
# refusal marker).
|
||||
safe_markers = (
|
||||
"block",
|
||||
"guardrail",
|
||||
"content filter",
|
||||
"policy",
|
||||
"can't repeat",
|
||||
"cannot repeat",
|
||||
"won't repeat",
|
||||
"can't assist",
|
||||
"can't help",
|
||||
"unable to",
|
||||
"i'm sorry",
|
||||
"i am sorry",
|
||||
)
|
||||
done_events = [e for e in client_events if e.get("type") == "response.done"]
|
||||
for done in done_events:
|
||||
output = done.get("response", {}).get("output", [])
|
||||
@@ -205,11 +228,12 @@ async def test_text_message_blocked_by_guardrail_no_ai_response():
|
||||
for c in item.get("content", [])
|
||||
]
|
||||
real_ai_text = " ".join(ai_texts).strip()
|
||||
# Allow guardrail-generated block messages (contain "Content blocked" or "blocked")
|
||||
if real_ai_text:
|
||||
assert (
|
||||
"blocked" in real_ai_text.lower()
|
||||
or "guardrail" in real_ai_text.lower()
|
||||
BLOCKED_PHRASE not in real_ai_text
|
||||
), f"Blocked phrase leaked into AI response: {real_ai_text!r}"
|
||||
assert any(
|
||||
marker in real_ai_text.lower() for marker in safe_markers
|
||||
), f"AI responded with non-guardrail content even though message was blocked: {real_ai_text!r}"
|
||||
|
||||
finally:
|
||||
|
||||
@@ -262,3 +262,44 @@ class TestNvidiaNim(BaseLLMRerankTest):
|
||||
def get_expected_cost(self) -> float:
|
||||
"""Nvidia NIM rerank models are free (cost = 0.0)"""
|
||||
return 0.0
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
@pytest.mark.parametrize("sync_mode", [True, False])
|
||||
async def test_basic_rerank(self, sync_mode, monkeypatch):
|
||||
"""
|
||||
Override the base live rerank test with a mocked HTTP layer.
|
||||
|
||||
NVIDIA reached end-of-life for the hosted
|
||||
nvidia/llama-3.2-nv-rerankqa-1b-v2 rerank API on 2026-05-18 and
|
||||
published no replacement model, so a live call now returns HTTP 410
|
||||
("Gone"). NVIDIA's hosted catalog rotates on a schedule, so pointing
|
||||
at another live model would only defer the same failure. Mock the
|
||||
transport instead (same pattern as
|
||||
test_nvidia_nim_rerank_ranking_endpoint above) so the request/response
|
||||
transformation and cost calculation stay covered offline.
|
||||
"""
|
||||
monkeypatch.setenv("NVIDIA_NIM_API_KEY", "fake-api-key")
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.headers = {}
|
||||
mock_response.text = ""
|
||||
mock_response.json.return_value = {
|
||||
"rankings": [
|
||||
{"index": 0, "logit": 0.95},
|
||||
{"index": 1, "logit": 0.75},
|
||||
],
|
||||
"usage": {"total_tokens": 7},
|
||||
}
|
||||
|
||||
with (
|
||||
patch(
|
||||
"litellm.llms.custom_httpx.http_handler.HTTPHandler.post",
|
||||
return_value=mock_response,
|
||||
),
|
||||
patch(
|
||||
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
|
||||
return_value=mock_response,
|
||||
),
|
||||
):
|
||||
await super().test_basic_rerank(sync_mode=sync_mode)
|
||||
|
||||
Reference in New Issue
Block a user