diff --git a/tests/llm_translation/realtime/test_openai_realtime.py b/tests/llm_translation/realtime/test_openai_realtime.py
index c5f77de6be..fc9f938b4c 100644
--- a/tests/llm_translation/realtime/test_openai_realtime.py
+++ b/tests/llm_translation/realtime/test_openai_realtime.py
@@ -101,7 +101,9 @@ async def test_openai_realtime_direct_call_no_intent():
 
     try:
         await litellm._arealtime(
-            model="openai/gpt-4o-realtime-preview",
+            # OpenAI shut down the gpt-4o-realtime-preview family (incl. the
+            # undated alias) on 2026-05-07; gpt-realtime is the GA successor.
+            model="openai/gpt-realtime",
             websocket=websocket_client,
             api_key=os.environ.get("OPENAI_API_KEY"),
             timeout=60,
@@ -249,14 +251,16 @@ async def test_openai_realtime_direct_call_with_intent():
     websocket_client = RealTimeWebSocketClient()
     caught_exception = None
 
+    # OpenAI shut down the gpt-4o-realtime-preview family (incl. the undated
+    # alias) on 2026-05-07; gpt-realtime is the GA successor.
     query_params: RealtimeQueryParams = {
-        "model": "openai/gpt-4o-realtime-preview",
+        "model": "openai/gpt-realtime",
         "intent": "chat",
     }
 
     try:
         await litellm._arealtime(
-            model="openai/gpt-4o-realtime-preview",
+            model="openai/gpt-realtime",
             websocket=websocket_client,
             api_key=os.environ.get("OPENAI_API_KEY"),
             query_params=query_params,
diff --git a/tests/llm_translation/realtime/test_openai_realtime_simple.py b/tests/llm_translation/realtime/test_openai_realtime_simple.py
index 5522d843e4..073c1ce11a 100644
--- a/tests/llm_translation/realtime/test_openai_realtime_simple.py
+++ b/tests/llm_translation/realtime/test_openai_realtime_simple.py
@@ -21,7 +21,10 @@ class TestOpenAIRealtime(BaseRealtimeTest):
     """
 
     def get_model(self) -> str:
-        return "gpt-4o-realtime-preview"
+        # OpenAI shut down the entire gpt-4o-realtime-preview family
+        # (including the undated alias) on 2026-05-07. gpt-realtime is the
+        # current GA realtime model.
+        return "gpt-realtime"
 
     def get_api_key_env_var(self) -> str:
         return "OPENAI_API_KEY"
diff --git a/tests/llm_translation/realtime/test_realtime_guardrails_openai.py b/tests/llm_translation/realtime/test_realtime_guardrails_openai.py
index 170440f6b9..ec9d73e2d6 100644
--- a/tests/llm_translation/realtime/test_realtime_guardrails_openai.py
+++ b/tests/llm_translation/realtime/test_realtime_guardrails_openai.py
@@ -26,9 +26,7 @@ from litellm.litellm_core_utils.realtime_streaming import RealTimeStreaming
 from litellm.types.guardrails import GuardrailEventHooks
 
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
-OPENAI_REALTIME_URL = (
-    "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17"
-)
+OPENAI_REALTIME_URL = "wss://api.openai.com/v1/realtime?model=gpt-realtime"
 
 pytestmark = pytest.mark.skipif(
     not OPENAI_API_KEY,
@@ -192,10 +190,35 @@ async def test_text_message_blocked_by_guardrail_no_ai_response():
             len(transcript_deltas) >= 1
         ), f"Expected guardrail message in transcript delta, got: {event_types}"
 
-        # 3. No *real* AI response should have been generated.
-        #    The guardrail may produce its own response (e.g. "Content blocked: ...")
-        #    via response.cancel + conversation.item.create + response.create.
-        #    We allow the guardrail's own block message but NOT original AI content.
+        # 3. No *real* AI response to the blocked content should have been
+        #    generated. The original user message is blocked BEFORE it is
+        #    forwarded to OpenAI, so the only thing the model ever sees is the
+        #    guardrail's "say exactly: <block message>" prompt
+        #    (see realtime_streaming.py). Two safe outcomes are possible:
+        #      - the model voices the block message verbatim (older realtime
+        #        snapshots did this -> text contains "blocked"), or
+        #      - the model declines to repeat it (gpt-realtime tends to refuse
+        #        verbatim-repeat instructions, e.g. "I'm sorry, but I can't
+        #        repeat that message.").
+        #    Both mean the blocked prompt itself was never answered, so we
+        #    accept either. The hard invariant is that the blocked phrase must
+        #    never leak into AI output, and the model must not have produced a
+        #    normal answer to the user (which would have neither a block nor a
+        #    refusal marker).
+        safe_markers = (
+            "block",
+            "guardrail",
+            "content filter",
+            "policy",
+            "can't repeat",
+            "cannot repeat",
+            "won't repeat",
+            "can't assist",
+            "can't help",
+            "unable to",
+            "i'm sorry",
+            "i am sorry",
+        )
         done_events = [e for e in client_events if e.get("type") == "response.done"]
         for done in done_events:
             output = done.get("response", {}).get("output", [])
@@ -205,11 +228,12 @@ async def test_text_message_blocked_by_guardrail_no_ai_response():
                 for c in item.get("content", [])
             ]
             real_ai_text = " ".join(ai_texts).strip()
-            # Allow guardrail-generated block messages (contain "Content blocked" or "blocked")
             if real_ai_text:
                 assert (
-                    "blocked" in real_ai_text.lower()
-                    or "guardrail" in real_ai_text.lower()
+                    BLOCKED_PHRASE not in real_ai_text
+                ), f"Blocked phrase leaked into AI response: {real_ai_text!r}"
+                assert any(
+                    marker in real_ai_text.lower() for marker in safe_markers
                 ), f"AI responded with non-guardrail content even though message was blocked: {real_ai_text!r}"
 
     finally:
diff --git a/tests/llm_translation/test_nvidia_nim.py b/tests/llm_translation/test_nvidia_nim.py
index 469516407c..80e764147b 100644
--- a/tests/llm_translation/test_nvidia_nim.py
+++ b/tests/llm_translation/test_nvidia_nim.py
@@ -262,3 +262,44 @@ class TestNvidiaNim(BaseLLMRerankTest):
     def get_expected_cost(self) -> float:
         """Nvidia NIM rerank models are free (cost = 0.0)"""
         return 0.0
+
+    @pytest.mark.asyncio()
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    async def test_basic_rerank(self, sync_mode, monkeypatch):
+        """
+        Run the inherited rerank test against a stubbed HTTP transport.
+
+        The hosted nvidia/llama-3.2-nv-rerankqa-1b-v2 rerank API reached
+        end-of-life on 2026-05-18 with no published replacement, so a live
+        call now returns HTTP 410 ("Gone"). NVIDIA's hosted catalog rotates
+        on a schedule, so switching to another live model would only defer
+        the same failure. Both the sync and async handler ``post`` methods
+        are patched (same pattern as
+        test_nvidia_nim_rerank_ranking_endpoint above) so the
+        request/response transformation and cost calculation stay covered
+        offline.
+        """
+        # Placeholder credential; the patched handlers return the canned reply.
+        monkeypatch.setenv("NVIDIA_NIM_API_KEY", "fake-api-key")
+
+        # Canned body handed back by the stub's ``json()``.
+        canned_rankings = {
+            "rankings": [
+                {"index": 0, "logit": 0.95},
+                {"index": 1, "logit": 0.75},
+            ],
+            "usage": {"total_tokens": 7},
+        }
+        stub_reply = MagicMock(status_code=200, headers={}, text="")
+        stub_reply.json.return_value = canned_rankings
+
+        # Patch the sync handler first, then the async handler inside it.
+        with patch(
+            "litellm.llms.custom_httpx.http_handler.HTTPHandler.post",
+            return_value=stub_reply,
+        ):
+            with patch(
+                "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
+                return_value=stub_reply,
+            ):
+                await super().test_basic_rerank(sync_mode=sync_mode)