From ae414ed4627cf32a3cdfb15ef0eb42779b219ede Mon Sep 17 00:00:00 2001
From: Sameer Kankute <sameer@berri.ai>
Date: Tue, 20 Jan 2026 17:07:00 +0530
Subject: [PATCH] =?UTF-8?q?Revert=20"feat:=20add=20retry=5Fdelay,=20expone?=
 =?UTF-8?q?ntial=5Fbackoff,=20and=20jitter=20to=20completion(=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 1678f621db8305c7e5f9366f5ee46c7f4edb9a47.
---
 docs/my-website/docs/completion/input.md      |   6 -
 .../docs/completion/reliable_completions.md   |  30 ---
 .../litellm_core_utils/get_litellm_params.py  |   6 -
 litellm/main.py                               | 181 +++---------------
 litellm/types/utils.py                        |   8 +-
 5 files changed, 27 insertions(+), 204 deletions(-)

diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md
index 1272c7eb16..2f6da4bedc 100644
--- a/docs/my-website/docs/completion/input.md
+++ b/docs/my-website/docs/completion/input.md
@@ -264,12 +264,6 @@ messages=[{"role": "user", "content": [
 
 - `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError or ServiceUnavailableError occurs 
 
-- `retry_delay`: *float (optional)* - Time in seconds to wait between retries.
-
-- `exponential_backoff`: *float (optional)* - If true, wait time doubles after each failure (1s, 2s, 4s...)
-
-- `jitter`: *float (optional)* - If true, adds randomness to the wait time to prevent thundering herd. 
-
 - `context_window_fallback_dict`: *dict (optional)* - A mapping of model to use if call fails due to context window error
 
 - `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails
diff --git a/docs/my-website/docs/completion/reliable_completions.md b/docs/my-website/docs/completion/reliable_completions.md
index 431df66c9d..f38917fe53 100644
--- a/docs/my-website/docs/completion/reliable_completions.md
+++ b/docs/my-website/docs/completion/reliable_completions.md
@@ -31,36 +31,6 @@ response = completion(
         )
 ```
 
-### Configurable Retries (Exponential Backoff, Jitter)
-
-You can also specify the `retry_delay` and `exponential_backoff` or `jitter` to the completion call.
-
-* `retry_delay`: (float) Time in seconds to wait between retries.
-* `exponential_backoff`: (bool) If true, wait time doubles after each failure (1s, 2s, 4s...).
-* `jitter`: (bool) If true, adds randomness to the wait time to prevent thundering herd.
-
-```python
-import litellm
-
-# Exponential Backoff
-response = litellm.completion(
-    model="gpt-3.5-turbo",
-    messages=[{"role": "user", "content": "Hi"}],
-    num_retries=3,
-    retry_delay=1.0,        # Start with 1s wait
-    exponential_backoff=True # Wait 1s, 2s, 4s...
-)
-
-# Jitter
-response = litellm.completion(
-    model="gpt-3.5-turbo",
-    messages=[{"role": "user", "content": "Hi"}],
-    num_retries=3,
-    retry_delay=1.0,
-    jitter=True             # Randomize wait times
-)
-```
-
 ## Fallbacks (SDK)
 
 :::info
diff --git a/litellm/litellm_core_utils/get_litellm_params.py b/litellm/litellm_core_utils/get_litellm_params.py
index e9740ed1da..0d35cfa314 100644
--- a/litellm/litellm_core_utils/get_litellm_params.py
+++ b/litellm/litellm_core_utils/get_litellm_params.py
@@ -64,9 +64,6 @@ def get_litellm_params(
     api_version: Optional[str] = None,
     max_retries: Optional[int] = None,
     litellm_request_debug: Optional[bool] = None,
-    retry_delay: Optional[float] = None,
-    exponential_backoff: Optional[bool] = None,
-    jitter: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     litellm_params = {
@@ -119,9 +116,6 @@ def get_litellm_params(
         "azure_password": kwargs.get("azure_password"),
         "azure_scope": kwargs.get("azure_scope"),
         "max_retries": max_retries,
-        "retry_delay": retry_delay,
-        "exponential_backoff": exponential_backoff,
-        "jitter": jitter,
         "timeout": kwargs.get("timeout"),
         "bucket_name": kwargs.get("bucket_name"),
         "vertex_credentials": kwargs.get("vertex_credentials"),
diff --git a/litellm/main.py b/litellm/main.py
index e62484e40f..e199aa3001 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -415,11 +415,6 @@ async def acompletion(
     web_search_options: Optional[OpenAIWebSearchOptions] = None,
     # Session management
     shared_session: Optional["ClientSession"] = None,
-    # Retry params
-    retry_delay: Optional[float] = None,
-    exponential_backoff: Optional[bool] = None,
-    jitter: Optional[bool] = None,
-    num_retries: Optional[int] = None,
     **kwargs,
 ) -> Union[ModelResponse, CustomStreamWrapper]:
     """
@@ -465,16 +460,6 @@ async def acompletion(
         - The `completion` function is called using `run_in_executor` to execute synchronously in the event loop.
         - If `stream` is True, the function returns an async generator that yields completion lines.
     """
-    if _update_kwargs_with_retry_params(
-        retry_delay,
-        exponential_backoff,
-        jitter,
-        num_retries,
-        locals().get("max_retries"),
-        kwargs,
-    ):
-        return await acompletion_with_retries(model=model, messages=messages, **kwargs)
-
     fallbacks = kwargs.get("fallbacks", None)
     mock_timeout = kwargs.get("mock_timeout", None)
 
@@ -1007,32 +992,8 @@ def _drop_input_examples_from_tools(
     return cleaned_tools
 
 
-def _update_kwargs_with_retry_params(
-    retry_delay: Optional[float],
-    exponential_backoff: Optional[bool],
-    jitter: Optional[bool],
-    num_retries: Optional[int],
-    max_retries: Optional[int],
-    kwargs: dict,
-) -> bool:
-    """
-    Updates kwargs with retry parameters if any are provided.
-    Returns True if retry logic should be triggered, False otherwise.
-    """
-    if retry_delay is not None or exponential_backoff is not None or jitter is not None:
-        kwargs["retry_delay"] = retry_delay
-        kwargs["exponential_backoff"] = exponential_backoff
-        kwargs["jitter"] = jitter
-
-        if num_retries is not None:
-            kwargs["num_retries"] = num_retries
-        elif max_retries is not None and "num_retries" not in kwargs:
-            kwargs["num_retries"] = max_retries
-
-        return True
-    return False
-
-
+@tracer.wrap()
+@client
 def completion(  # type: ignore # noqa: PLR0915
     model: str,
     # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
@@ -1082,11 +1043,6 @@ def completion(  # type: ignore # noqa: PLR0915
     thinking: Optional[AnthropicThinkingParam] = None,
     # Session management
     shared_session: Optional["ClientSession"] = None,
-    # Retry params
-    retry_delay: Optional[float] = None,
-    exponential_backoff: Optional[bool] = None,
-    jitter: Optional[bool] = None,
-    num_retries: Optional[int] = None,
     **kwargs,
 ) -> Union[ModelResponse, CustomStreamWrapper]:
     """
@@ -1134,20 +1090,6 @@ def completion(  # type: ignore # noqa: PLR0915
         - It supports various optional parameters for customizing the completion behavior.
         - If 'mock_response' is provided, a mock completion response is returned for testing or debugging.
     """
-    if _update_kwargs_with_retry_params(
-        retry_delay,
-        exponential_backoff,
-        jitter,
-        num_retries,
-        locals().get("max_retries"),
-        kwargs,
-    ):
-        # check if this is an async call (acompletion=True in kwargs)
-        if kwargs.get("acompletion", False) is True:
-            return acompletion_with_retries(model=model, messages=messages, **kwargs)
-
-        return completion_with_retries(model=model, messages=messages, **kwargs)
-
     ### VALIDATE Request ###
     if model is None:
         raise ValueError("model param not passed in.")
@@ -1173,9 +1115,7 @@ def completion(  # type: ignore # noqa: PLR0915
         # Check if MCP tools are present (following responses pattern)
         # Cast tools to Optional[Iterable[ToolParam]] for type checking
         tools_for_mcp = cast(Optional[Iterable[ToolParam]], tools)
-        if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(
-            tools=tools_for_mcp
-        ):
+        if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(tools=tools_for_mcp):
             # Return coroutine - acompletion will await it
             # completion() can return a coroutine when MCP tools are present, which acompletion() awaits
             return acompletion_with_mcp(  # type: ignore[return-value]
@@ -2405,7 +2345,11 @@ def completion(  # type: ignore # noqa: PLR0915
                 input=messages, api_key=api_key, original_response=response
             )
         elif custom_llm_provider == "minimax":
-            api_key = api_key or get_secret_str("MINIMAX_API_KEY") or litellm.api_key
+            api_key = (
+                api_key
+                or get_secret_str("MINIMAX_API_KEY")
+                or litellm.api_key
+            )
 
             api_base = (
                 api_base
@@ -2453,9 +2397,7 @@ def completion(  # type: ignore # noqa: PLR0915
             or custom_llm_provider == "wandb"
             or custom_llm_provider == "clarifai"
             or custom_llm_provider in litellm.openai_compatible_providers
-            or JSONProviderRegistry.exists(
-                custom_llm_provider
-            )  # JSON-configured providers
+            or JSONProviderRegistry.exists(custom_llm_provider)  # JSON-configured providers
             or "ft:gpt-3.5-turbo" in model  # finetune gpt-3.5-turbo
         ):  # allow user to make an openai call with a custom base
             # note: if a user sets a custom base - we should ensure this works
@@ -4295,51 +4237,24 @@ def completion_with_retries(*args, **kwargs):
     retry_strategy: Literal["exponential_backoff_retry", "constant_retry"] = kwargs.pop(
         "retry_strategy", "constant_retry"
     )  # type: ignore
-    retry_delay = kwargs.pop("retry_delay", None)
-    exponential_backoff = kwargs.pop("exponential_backoff", False)
-    jitter = kwargs.pop("jitter", False)
-
     original_function = kwargs.pop("original_function", completion)
-
-    # +1 because stop_after_attempt includes the initial attempt
-    stop_after = tenacity.stop_after_attempt(num_retries + 1)
-
-    if retry_strategy == "exponential_backoff_retry" or exponential_backoff:
-        # Defaults for exponential backoff
-        multiplier = 1
-        min_wait = 0
-        if retry_delay is not None:
-            multiplier = retry_delay
-            min_wait = retry_delay
-
-        wait_args = {"multiplier": multiplier, "min": min_wait, "max": 10}
-
-        if jitter:
-            wait_strategy = tenacity.wait_random_exponential(**wait_args)
-        else:
-            wait_strategy = tenacity.wait_exponential(**wait_args)
-
+    if retry_strategy == "exponential_backoff_retry":
         retryer = tenacity.Retrying(
-            wait=wait_strategy,
-            stop=stop_after,
+            wait=tenacity.wait_exponential(multiplier=1, max=10),
+            stop=tenacity.stop_after_attempt(num_retries),
             reraise=True,
         )
     else:
-        wait_strategy = tenacity.wait_none()
-        if retry_delay:
-            wait_strategy = tenacity.wait_fixed(retry_delay)
-
         retryer = tenacity.Retrying(
-            wait=wait_strategy,
-            stop=stop_after,
-            reraise=True,
+            stop=tenacity.stop_after_attempt(num_retries), reraise=True
         )
     return retryer(original_function, *args, **kwargs)
 
 
 async def acompletion_with_retries(*args, **kwargs):
     """
-    Executes a litellm.completion() with retries.
+    [DEPRECATED]. Use 'acompletion' or router.acompletion instead!
+    Executes a litellm.completion() with 3 retries
     """
     try:
         import tenacity
@@ -4352,65 +4267,18 @@ async def acompletion_with_retries(*args, **kwargs):
     kwargs["max_retries"] = 0
     kwargs["num_retries"] = 0
     retry_strategy = kwargs.pop("retry_strategy", "constant_retry")
-    retry_delay = kwargs.pop("retry_delay", None)
-    exponential_backoff = kwargs.pop("exponential_backoff", False)
-    jitter = kwargs.pop("jitter", False)
     original_function = kwargs.pop("original_function", completion)
-
-    # +1 because stop_after_attempt includes the initial attempt
-    stop_after = tenacity.stop_after_attempt(num_retries + 1)
-
-    # If the original function is completion but we are doing async retries
-    # we need to ensure it's treated as an async function if it returns a coroutine
-    # or wraps it.
-    # However, since we are in acompletion_with_retries, we expect to be waiting.
-    # If original_function is completion(acompletion=True), it returns a coro.
-    # Tenacity AsyncRetrying expects the function to be awaitable or return awaitable?
-    # Actually AsyncRetrying works with async def functions.
-    # If original_function is sync (but returns coro), we might need a wrapper.
-
-    async def _async_original_function(*args, **kwargs):
-        # Ensure we await the result if it is a coroutine
-        result = original_function(*args, **kwargs)
-        if asyncio.iscoroutine(result):
-            return await result
-        return result
-
-    if retry_strategy == "exponential_backoff_retry" or exponential_backoff:
-        # Defaults for exponential backoff
-        multiplier = 1
-        min_wait = 0
-        if retry_delay is not None:
-            multiplier = retry_delay
-            min_wait = retry_delay
-
-        wait_args = {"multiplier": multiplier, "min": min_wait, "max": 10}
-
-        if jitter:
-            # Use wait_random_exponential if available or combine
-            # Using wait_exponential + wait_random is one way, or wait_random_exponential
-            # tenacity.wait_random_exponential(multiplier=1, max=10)
-            wait_strategy = tenacity.wait_random_exponential(**wait_args)
-        else:
-            wait_strategy = tenacity.wait_exponential(**wait_args)
-
+    if retry_strategy == "exponential_backoff_retry":
         retryer = tenacity.AsyncRetrying(
-            wait=wait_strategy,
-            stop=stop_after,
+            wait=tenacity.wait_exponential(multiplier=1, max=10),
+            stop=tenacity.stop_after_attempt(num_retries),
             reraise=True,
         )
     else:
-        wait_strategy = tenacity.wait_none()
-        if retry_delay:
-            wait_strategy = tenacity.wait_fixed(retry_delay)
-
         retryer = tenacity.AsyncRetrying(
-            wait=wait_strategy,
-            stop=stop_after,
-            reraise=True,
+            stop=tenacity.stop_after_attempt(num_retries), reraise=True
         )
-
-    return await retryer(_async_original_function, *args, **kwargs)
+    return await retryer(original_function, *args, **kwargs)
 
 
 def responses_with_retries(*args, **kwargs):
@@ -4848,7 +4716,7 @@ def embedding(  # noqa: PLR0915
 
             if headers is not None and headers != {}:
                 optional_params["extra_headers"] = headers
-
+            
             if encoding_format is not None:
                 optional_params["encoding_format"] = encoding_format
             else:
@@ -6911,7 +6779,9 @@ def speech(  # noqa: PLR0915
         if text_to_speech_provider_config is None:
             text_to_speech_provider_config = MinimaxTextToSpeechConfig()
 
-        minimax_config = cast(MinimaxTextToSpeechConfig, text_to_speech_provider_config)
+        minimax_config = cast(
+            MinimaxTextToSpeechConfig, text_to_speech_provider_config
+        )
 
         if api_base is not None:
             litellm_params_dict["api_base"] = api_base
@@ -7051,7 +6921,7 @@ async def ahealth_check(
         custom_llm_provider_from_params = model_params.get("custom_llm_provider", None)
         api_base_from_params = model_params.get("api_base", None)
         api_key_from_params = model_params.get("api_key", None)
-
+        
         model, custom_llm_provider, _, _ = get_llm_provider(
             model=model,
             custom_llm_provider=custom_llm_provider_from_params,
@@ -7429,7 +7299,6 @@ def __getattr__(name: str) -> Any:
         _encoding = tiktoken.get_encoding("cl100k_base")
         # Cache it in the module's __dict__ for subsequent accesses
         import sys
-
         sys.modules[__name__].__dict__["encoding"] = _encoding
         global _encoding_cache
         _encoding_cache = _encoding
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index e71fcac389..f5e217d8b4 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -125,14 +125,13 @@ class SearchContextCostPerQuery(TypedDict, total=False):
 class AgenticLoopParams(TypedDict, total=False):
     """
     Parameters passed to agentic loop hooks (e.g., WebSearch interception).
-
+    
     Stored in logging_obj.model_call_details["agentic_loop_params"] to provide
     agentic hooks with the original request context needed for follow-up calls.
     """
-
     model: str
     """The model string with provider prefix (e.g., 'bedrock/invoke/...')"""
-
+    
     custom_llm_provider: str
     """The LLM provider name (e.g., 'bedrock', 'anthropic')"""
 
@@ -2925,9 +2924,6 @@ all_litellm_params = (
         "shared_session",
         "search_tool_name",
         "order",
-        "retry_delay",
-        "exponential_backoff",
-        "jitter",
     ]
     + list(StandardCallbackDynamicParams.__annotations__.keys())
     + list(CustomPricingLiteLLMParams.model_fields.keys())