diff --git a/tests/llm_translation/realtime/test_openai_realtime.py b/tests/llm_translation/realtime/test_openai_realtime.py index c5f77de6be..fc9f938b4c 100644 --- a/tests/llm_translation/realtime/test_openai_realtime.py +++ b/tests/llm_translation/realtime/test_openai_realtime.py @@ -101,7 +101,9 @@ async def test_openai_realtime_direct_call_no_intent(): try: await litellm._arealtime( - model="openai/gpt-4o-realtime-preview", + # OpenAI shut down the gpt-4o-realtime-preview family (incl. the + # undated alias) on 2026-05-07; gpt-realtime is the GA successor. + model="openai/gpt-realtime", websocket=websocket_client, api_key=os.environ.get("OPENAI_API_KEY"), timeout=60, @@ -249,14 +251,16 @@ async def test_openai_realtime_direct_call_with_intent(): websocket_client = RealTimeWebSocketClient() caught_exception = None + # OpenAI shut down the gpt-4o-realtime-preview family (incl. the undated + # alias) on 2026-05-07; gpt-realtime is the GA successor. query_params: RealtimeQueryParams = { - "model": "openai/gpt-4o-realtime-preview", + "model": "openai/gpt-realtime", "intent": "chat", } try: await litellm._arealtime( - model="openai/gpt-4o-realtime-preview", + model="openai/gpt-realtime", websocket=websocket_client, api_key=os.environ.get("OPENAI_API_KEY"), query_params=query_params, diff --git a/tests/llm_translation/realtime/test_openai_realtime_simple.py b/tests/llm_translation/realtime/test_openai_realtime_simple.py index 5522d843e4..073c1ce11a 100644 --- a/tests/llm_translation/realtime/test_openai_realtime_simple.py +++ b/tests/llm_translation/realtime/test_openai_realtime_simple.py @@ -21,7 +21,10 @@ class TestOpenAIRealtime(BaseRealtimeTest): """ def get_model(self) -> str: - return "gpt-4o-realtime-preview" + # OpenAI shut down the entire gpt-4o-realtime-preview family + # (including the undated alias) on 2026-05-07. gpt-realtime is the + # current GA realtime model. 
+ return "gpt-realtime" def get_api_key_env_var(self) -> str: return "OPENAI_API_KEY" diff --git a/tests/llm_translation/realtime/test_realtime_guardrails_openai.py b/tests/llm_translation/realtime/test_realtime_guardrails_openai.py index 170440f6b9..ec9d73e2d6 100644 --- a/tests/llm_translation/realtime/test_realtime_guardrails_openai.py +++ b/tests/llm_translation/realtime/test_realtime_guardrails_openai.py @@ -26,9 +26,7 @@ from litellm.litellm_core_utils.realtime_streaming import RealTimeStreaming from litellm.types.guardrails import GuardrailEventHooks OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") -OPENAI_REALTIME_URL = ( - "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17" -) +OPENAI_REALTIME_URL = "wss://api.openai.com/v1/realtime?model=gpt-realtime" pytestmark = pytest.mark.skipif( not OPENAI_API_KEY, @@ -192,10 +190,35 @@ async def test_text_message_blocked_by_guardrail_no_ai_response(): len(transcript_deltas) >= 1 ), f"Expected guardrail message in transcript delta, got: {event_types}" - # 3. No *real* AI response should have been generated. - # The guardrail may produce its own response (e.g. "Content blocked: ...") - # via response.cancel + conversation.item.create + response.create. - # We allow the guardrail's own block message but NOT original AI content. + # 3. No *real* AI response to the blocked content should have been + # generated. The original user message is blocked BEFORE it is + # forwarded to OpenAI, so the only thing the model ever sees is the + # guardrail's "say exactly: <block message>" prompt + # (see realtime_streaming.py). Two safe outcomes are possible: + # - the model voices the block message verbatim (older realtime + # snapshots did this -> text contains "blocked"), or + # - the model declines to repeat it (gpt-realtime tends to refuse + # verbatim-repeat instructions, e.g. "I'm sorry, but I can't + # repeat that message."). + # Both mean the blocked prompt itself was never answered, so we + # accept either. 
The hard invariant is that the blocked phrase must + # never leak into AI output, and the model must not have produced a + # normal answer to the user (which would have neither a block nor a + # refusal marker). + safe_markers = ( + "block", + "guardrail", + "content filter", + "policy", + "can't repeat", + "cannot repeat", + "won't repeat", + "can't assist", + "can't help", + "unable to", + "i'm sorry", + "i am sorry", + ) done_events = [e for e in client_events if e.get("type") == "response.done"] for done in done_events: output = done.get("response", {}).get("output", []) @@ -205,11 +228,12 @@ async def test_text_message_blocked_by_guardrail_no_ai_response(): for c in item.get("content", []) ] real_ai_text = " ".join(ai_texts).strip() - # Allow guardrail-generated block messages (contain "Content blocked" or "blocked") if real_ai_text: assert ( - "blocked" in real_ai_text.lower() - or "guardrail" in real_ai_text.lower() + BLOCKED_PHRASE not in real_ai_text + ), f"Blocked phrase leaked into AI response: {real_ai_text!r}" + assert any( + marker in real_ai_text.lower() for marker in safe_markers ), f"AI responded with non-guardrail content even though message was blocked: {real_ai_text!r}" finally: diff --git a/tests/llm_translation/test_nvidia_nim.py b/tests/llm_translation/test_nvidia_nim.py index 469516407c..80e764147b 100644 --- a/tests/llm_translation/test_nvidia_nim.py +++ b/tests/llm_translation/test_nvidia_nim.py @@ -262,3 +262,44 @@ class TestNvidiaNim(BaseLLMRerankTest): def get_expected_cost(self) -> float: """Nvidia NIM rerank models are free (cost = 0.0)""" return 0.0 + + @pytest.mark.asyncio() + @pytest.mark.parametrize("sync_mode", [True, False]) + async def test_basic_rerank(self, sync_mode, monkeypatch): + """ + Override the base live rerank test with a mocked HTTP layer. 
+ + NVIDIA reached end-of-life for the hosted + nvidia/llama-3.2-nv-rerankqa-1b-v2 rerank API on 2026-05-18 and + published no replacement model, so a live call now returns HTTP 410 + ("Gone"). NVIDIA's hosted catalog rotates on a schedule, so pointing + at another live model would only defer the same failure. Mock the + transport instead (same pattern as + test_nvidia_nim_rerank_ranking_endpoint above) so the request/response + transformation and cost calculation stay covered offline. + """ + monkeypatch.setenv("NVIDIA_NIM_API_KEY", "fake-api-key") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {} + mock_response.text = "" + mock_response.json.return_value = { + "rankings": [ + {"index": 0, "logit": 0.95}, + {"index": 1, "logit": 0.75}, + ], + "usage": {"total_tokens": 7}, + } + + with ( + patch( + "litellm.llms.custom_httpx.http_handler.HTTPHandler.post", + return_value=mock_response, + ), + patch( + "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", + return_value=mock_response, + ), + ): + await super().test_basic_rerank(sync_mode=sync_mode)