From ae414ed4627cf32a3cdfb15ef0eb42779b219ede Mon Sep 17 00:00:00 2001 From: Sameer Kankute Date: Tue, 20 Jan 2026 17:07:00 +0530 Subject: [PATCH] =?UTF-8?q?Revert=20"feat:=20add=20retry=5Fdelay,=20expone?= =?UTF-8?q?ntial=5Fbackoff,=20and=20jitter=20to=20completion(=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1678f621db8305c7e5f9366f5ee46c7f4edb9a47. --- docs/my-website/docs/completion/input.md | 6 - .../docs/completion/reliable_completions.md | 30 --- .../litellm_core_utils/get_litellm_params.py | 6 - litellm/main.py | 181 +++--------------- litellm/types/utils.py | 8 +- 5 files changed, 27 insertions(+), 204 deletions(-) diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index 1272c7eb16..2f6da4bedc 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -264,12 +264,6 @@ messages=[{"role": "user", "content": [ - `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError or ServiceUnavailableError occurs -- `retry_delay`: *float (optional)* - Time in seconds to wait between retries. - -- `exponential_backoff`: *float (optional)* - If true, wait time doubles after each failure (1s, 2s, 4s...) - -- `jitter`: *float (optional)* - If true, adds randomness to the wait time to prevent thundering herd. - - `context_window_fallback_dict`: *dict (optional)* - A mapping of model to use if call fails due to context window error - `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails diff --git a/docs/my-website/docs/completion/reliable_completions.md b/docs/my-website/docs/completion/reliable_completions.md index 431df66c9d..f38917fe53 100644 --- a/docs/my-website/docs/completion/reliable_completions.md +++ b/docs/my-website/docs/completion/reliable_completions.md @@ -31,36 +31,6 @@ response = completion( ) ``` -### Configurable Retries (Exponential Backoff, Jitter) - -You can also specify the `retry_delay` and `exponential_backoff` or `jitter` to the completion call. - -* `retry_delay`: (float) Time in seconds to wait between retries. -* `exponential_backoff`: (bool) If true, wait time doubles after each failure (1s, 2s, 4s...). -* `jitter`: (bool) If true, adds randomness to the wait time to prevent thundering herd. - -```python -import litellm - -# Exponential Backoff -response = litellm.completion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Hi"}], - num_retries=3, - retry_delay=1.0, # Start with 1s wait - exponential_backoff=True # Wait 1s, 2s, 4s... -) - -# Jitter -response = litellm.completion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Hi"}], - num_retries=3, - retry_delay=1.0, - jitter=True # Randomize wait times -) -``` - ## Fallbacks (SDK) :::info diff --git a/litellm/litellm_core_utils/get_litellm_params.py b/litellm/litellm_core_utils/get_litellm_params.py index e9740ed1da..0d35cfa314 100644 --- a/litellm/litellm_core_utils/get_litellm_params.py +++ b/litellm/litellm_core_utils/get_litellm_params.py @@ -64,9 +64,6 @@ def get_litellm_params( api_version: Optional[str] = None, max_retries: Optional[int] = None, litellm_request_debug: Optional[bool] = None, - retry_delay: Optional[float] = None, - exponential_backoff: Optional[bool] = None, - jitter: Optional[bool] = None, **kwargs, ) -> dict: litellm_params = { @@ -119,9 +116,6 @@ def get_litellm_params( "azure_password": kwargs.get("azure_password"), "azure_scope": kwargs.get("azure_scope"), "max_retries": max_retries, - "retry_delay": retry_delay, - "exponential_backoff": exponential_backoff, - "jitter": jitter, "timeout": kwargs.get("timeout"), "bucket_name": kwargs.get("bucket_name"), "vertex_credentials": kwargs.get("vertex_credentials"), diff --git a/litellm/main.py b/litellm/main.py index e62484e40f..e199aa3001 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -415,11 +415,6 @@ async def acompletion( web_search_options: Optional[OpenAIWebSearchOptions] = None, # Session management shared_session: Optional["ClientSession"] = None, - # Retry params - retry_delay: Optional[float] = None, - exponential_backoff: Optional[bool] = None, - jitter: Optional[bool] = None, - num_retries: Optional[int] = None, **kwargs, ) -> Union[ModelResponse, CustomStreamWrapper]: """ @@ -465,16 +460,6 @@ async def acompletion( - The `completion` function is called using `run_in_executor` to execute synchronously in the event loop. - If `stream` is True, the function returns an async generator that yields completion lines. """ - if _update_kwargs_with_retry_params( - retry_delay, - exponential_backoff, - jitter, - num_retries, - locals().get("max_retries"), - kwargs, - ): - return await acompletion_with_retries(model=model, messages=messages, **kwargs) - fallbacks = kwargs.get("fallbacks", None) mock_timeout = kwargs.get("mock_timeout", None) @@ -1007,32 +992,8 @@ def _drop_input_examples_from_tools( return cleaned_tools -def _update_kwargs_with_retry_params( - retry_delay: Optional[float], - exponential_backoff: Optional[bool], - jitter: Optional[bool], - num_retries: Optional[int], - max_retries: Optional[int], - kwargs: dict, -) -> bool: - """ - Updates kwargs with retry parameters if any are provided. - Returns True if retry logic should be triggered, False otherwise. - """ - if retry_delay is not None or exponential_backoff is not None or jitter is not None: - kwargs["retry_delay"] = retry_delay - kwargs["exponential_backoff"] = exponential_backoff - kwargs["jitter"] = jitter - - if num_retries is not None: - kwargs["num_retries"] = num_retries - elif max_retries is not None and "num_retries" not in kwargs: - kwargs["num_retries"] = max_retries - - return True - return False - - +@tracer.wrap() +@client def completion( # type: ignore # noqa: PLR0915 model: str, # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create @@ -1082,11 +1043,6 @@ def completion( # type: ignore # noqa: PLR0915 thinking: Optional[AnthropicThinkingParam] = None, # Session management shared_session: Optional["ClientSession"] = None, - # Retry params - retry_delay: Optional[float] = None, - exponential_backoff: Optional[bool] = None, - jitter: Optional[bool] = None, - num_retries: Optional[int] = None, **kwargs, ) -> Union[ModelResponse, CustomStreamWrapper]: """ @@ -1134,20 +1090,6 @@ def completion( # type: ignore # noqa: PLR0915 - It supports various optional parameters for customizing the completion behavior. - If 'mock_response' is provided, a mock completion response is returned for testing or debugging. """ - if _update_kwargs_with_retry_params( - retry_delay, - exponential_backoff, - jitter, - num_retries, - locals().get("max_retries"), - kwargs, - ): - # check if this is an async call (acompletion=True in kwargs) - if kwargs.get("acompletion", False) is True: - return acompletion_with_retries(model=model, messages=messages, **kwargs) - - return completion_with_retries(model=model, messages=messages, **kwargs) - ### VALIDATE Request ### if model is None: raise ValueError("model param not passed in.") @@ -1173,9 +1115,7 @@ def completion( # type: ignore # noqa: PLR0915 # Check if MCP tools are present (following responses pattern) # Cast tools to Optional[Iterable[ToolParam]] for type checking tools_for_mcp = cast(Optional[Iterable[ToolParam]], tools) - if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway( - tools=tools_for_mcp - ): + if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(tools=tools_for_mcp): # Return coroutine - acompletion will await it # completion() can return a coroutine when MCP tools are present, which acompletion() awaits return acompletion_with_mcp( # type: ignore[return-value] @@ -2405,7 +2345,11 @@ def completion( # type: ignore # noqa: PLR0915 input=messages, api_key=api_key, original_response=response ) elif custom_llm_provider == "minimax": - api_key = api_key or get_secret_str("MINIMAX_API_KEY") or litellm.api_key + api_key = ( + api_key + or get_secret_str("MINIMAX_API_KEY") + or litellm.api_key + ) api_base = ( api_base @@ -2453,9 +2397,7 @@ def completion( # type: ignore # noqa: PLR0915 or custom_llm_provider == "wandb" or custom_llm_provider == "clarifai" or custom_llm_provider in litellm.openai_compatible_providers - or JSONProviderRegistry.exists( - custom_llm_provider - ) # JSON-configured providers + or JSONProviderRegistry.exists(custom_llm_provider) # JSON-configured providers or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo ): # allow user to make an openai call with a custom base # note: if a user sets a custom base - we should ensure this works @@ -4295,51 +4237,24 @@ def completion_with_retries(*args, **kwargs): retry_strategy: Literal["exponential_backoff_retry", "constant_retry"] = kwargs.pop( "retry_strategy", "constant_retry" ) # type: ignore - retry_delay = kwargs.pop("retry_delay", None) - exponential_backoff = kwargs.pop("exponential_backoff", False) - jitter = kwargs.pop("jitter", False) - original_function = kwargs.pop("original_function", completion) - - # +1 because stop_after_attempt includes the initial attempt - stop_after = tenacity.stop_after_attempt(num_retries + 1) - - if retry_strategy == "exponential_backoff_retry" or exponential_backoff: - # Defaults for exponential backoff - multiplier = 1 - min_wait = 0 - if retry_delay is not None: - multiplier = retry_delay - min_wait = retry_delay - - wait_args = {"multiplier": multiplier, "min": min_wait, "max": 10} - - if jitter: - wait_strategy = tenacity.wait_random_exponential(**wait_args) - else: - wait_strategy = tenacity.wait_exponential(**wait_args) - + if retry_strategy == "exponential_backoff_retry": retryer = tenacity.Retrying( - wait=wait_strategy, - stop=stop_after, + wait=tenacity.wait_exponential(multiplier=1, max=10), + stop=tenacity.stop_after_attempt(num_retries), reraise=True, ) else: - wait_strategy = tenacity.wait_none() - if retry_delay: - wait_strategy = tenacity.wait_fixed(retry_delay) - retryer = tenacity.Retrying( - wait=wait_strategy, - stop=stop_after, - reraise=True, + stop=tenacity.stop_after_attempt(num_retries), reraise=True ) return retryer(original_function, *args, **kwargs) async def acompletion_with_retries(*args, **kwargs): """ - Executes a litellm.completion() with retries. + [DEPRECATED]. Use 'acompletion' or router.acompletion instead! + Executes a litellm.completion() with 3 retries """ try: import tenacity @@ -4352,65 +4267,18 @@ async def acompletion_with_retries(*args, **kwargs): kwargs["max_retries"] = 0 kwargs["num_retries"] = 0 retry_strategy = kwargs.pop("retry_strategy", "constant_retry") - retry_delay = kwargs.pop("retry_delay", None) - exponential_backoff = kwargs.pop("exponential_backoff", False) - jitter = kwargs.pop("jitter", False) original_function = kwargs.pop("original_function", completion) - - # +1 because stop_after_attempt includes the initial attempt - stop_after = tenacity.stop_after_attempt(num_retries + 1) - - # If the original function is completion but we are doing async retries - # we need to ensure it's treated as an async function if it returns a coroutine - # or wraps it. - # However, since we are in acompletion_with_retries, we expect to be waiting. - # If original_function is completion(acompletion=True), it returns a coro. - # Tenacity AsyncRetrying expects the function to be awaitable or return awaitable? - # Actually AsyncRetrying works with async def functions. - # If original_function is sync (but returns coro), we might need a wrapper. - - async def _async_original_function(*args, **kwargs): - # Ensure we await the result if it is a coroutine - result = original_function(*args, **kwargs) - if asyncio.iscoroutine(result): - return await result - return result - - if retry_strategy == "exponential_backoff_retry" or exponential_backoff: - # Defaults for exponential backoff - multiplier = 1 - min_wait = 0 - if retry_delay is not None: - multiplier = retry_delay - min_wait = retry_delay - - wait_args = {"multiplier": multiplier, "min": min_wait, "max": 10} - - if jitter: - # Use wait_random_exponential if available or combine - # Using wait_exponential + wait_random is one way, or wait_random_exponential - # tenacity.wait_random_exponential(multiplier=1, max=10) - wait_strategy = tenacity.wait_random_exponential(**wait_args) - else: - wait_strategy = tenacity.wait_exponential(**wait_args) - + if retry_strategy == "exponential_backoff_retry": retryer = tenacity.AsyncRetrying( - wait=wait_strategy, - stop=stop_after, + wait=tenacity.wait_exponential(multiplier=1, max=10), + stop=tenacity.stop_after_attempt(num_retries), reraise=True, ) else: - wait_strategy = tenacity.wait_none() - if retry_delay: - wait_strategy = tenacity.wait_fixed(retry_delay) - retryer = tenacity.AsyncRetrying( - wait=wait_strategy, - stop=stop_after, - reraise=True, + stop=tenacity.stop_after_attempt(num_retries), reraise=True ) - - return await retryer(_async_original_function, *args, **kwargs) + return await retryer(original_function, *args, **kwargs) def responses_with_retries(*args, **kwargs): @@ -4848,7 +4716,7 @@ def embedding( # noqa: PLR0915 if headers is not None and headers != {}: optional_params["extra_headers"] = headers - + if encoding_format is not None: optional_params["encoding_format"] = encoding_format else: @@ -6911,7 +6779,9 @@ def speech( # noqa: PLR0915 if text_to_speech_provider_config is None: text_to_speech_provider_config = MinimaxTextToSpeechConfig() - minimax_config = cast(MinimaxTextToSpeechConfig, text_to_speech_provider_config) + minimax_config = cast( + MinimaxTextToSpeechConfig, text_to_speech_provider_config + ) if api_base is not None: litellm_params_dict["api_base"] = api_base @@ -7051,7 +6921,7 @@ async def ahealth_check( custom_llm_provider_from_params = model_params.get("custom_llm_provider", None) api_base_from_params = model_params.get("api_base", None) api_key_from_params = model_params.get("api_key", None) - + model, custom_llm_provider, _, _ = get_llm_provider( model=model, custom_llm_provider=custom_llm_provider_from_params, @@ -7429,7 +7299,6 @@ def __getattr__(name: str) -> Any: _encoding = tiktoken.get_encoding("cl100k_base") # Cache it in the module's __dict__ for subsequent accesses import sys - sys.modules[__name__].__dict__["encoding"] = _encoding global _encoding_cache _encoding_cache = _encoding diff --git a/litellm/types/utils.py b/litellm/types/utils.py index e71fcac389..f5e217d8b4 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -125,14 +125,13 @@ class SearchContextCostPerQuery(TypedDict, total=False): class AgenticLoopParams(TypedDict, total=False): """ Parameters passed to agentic loop hooks (e.g., WebSearch interception). - + Stored in logging_obj.model_call_details["agentic_loop_params"] to provide agentic hooks with the original request context needed for follow-up calls. """ - model: str """The model string with provider prefix (e.g., 'bedrock/invoke/...')""" - + custom_llm_provider: str """The LLM provider name (e.g., 'bedrock', 'anthropic')""" @@ -2925,9 +2924,6 @@ all_litellm_params = ( "shared_session", "search_tool_name", "order", - "retry_delay", - "exponential_backoff", - "jitter", ] + list(StandardCallbackDynamicParams.__annotations__.keys()) + list(CustomPricingLiteLLMParams.model_fields.keys())