mirror of
https://github.com/tiennm99/litellm.git
synced 2026-07-05 13:07:08 +00:00
Merge pull request #19402 from BerriAI/revert-19371-feat/configurable-retry-logic
Revert "feat: add retry_delay, exponential_backoff, and jitter to completion()"
This commit is contained in:
@@ -264,12 +264,6 @@ messages=[{"role": "user", "content": [
|
||||
|
||||
- `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError or ServiceUnavailableError occurs
|
||||
|
||||
- `retry_delay`: *float (optional)* - Time in seconds to wait between retries.
|
||||
|
||||
- `exponential_backoff`: *bool (optional)* - If true, wait time doubles after each failure (1s, 2s, 4s...)
|
||||
|
||||
- `jitter`: *bool (optional)* - If true, adds randomness to the wait time to prevent thundering herd.
|
||||
|
||||
- `context_window_fallback_dict`: *dict (optional)* - A mapping of model to use if call fails due to context window error
|
||||
|
||||
- `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails
|
||||
|
||||
@@ -31,36 +31,6 @@ response = completion(
|
||||
)
|
||||
```
|
||||
|
||||
### Configurable Retries (Exponential Backoff, Jitter)
|
||||
|
||||
You can also pass `retry_delay`, together with `exponential_backoff` or `jitter`, to the completion call.
|
||||
|
||||
* `retry_delay`: (float) Time in seconds to wait between retries.
|
||||
* `exponential_backoff`: (bool) If true, wait time doubles after each failure (1s, 2s, 4s...).
|
||||
* `jitter`: (bool) If true, adds randomness to the wait time to prevent thundering herd.
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
||||
# Exponential Backoff
|
||||
response = litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hi"}],
|
||||
num_retries=3,
|
||||
retry_delay=1.0, # Start with 1s wait
|
||||
exponential_backoff=True # Wait 1s, 2s, 4s...
|
||||
)
|
||||
|
||||
# Jitter
|
||||
response = litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hi"}],
|
||||
num_retries=3,
|
||||
retry_delay=1.0,
|
||||
jitter=True # Randomize wait times
|
||||
)
|
||||
```
|
||||
|
||||
## Fallbacks (SDK)
|
||||
|
||||
:::info
|
||||
|
||||
@@ -64,9 +64,6 @@ def get_litellm_params(
|
||||
api_version: Optional[str] = None,
|
||||
max_retries: Optional[int] = None,
|
||||
litellm_request_debug: Optional[bool] = None,
|
||||
retry_delay: Optional[float] = None,
|
||||
exponential_backoff: Optional[bool] = None,
|
||||
jitter: Optional[bool] = None,
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
litellm_params = {
|
||||
@@ -119,9 +116,6 @@ def get_litellm_params(
|
||||
"azure_password": kwargs.get("azure_password"),
|
||||
"azure_scope": kwargs.get("azure_scope"),
|
||||
"max_retries": max_retries,
|
||||
"retry_delay": retry_delay,
|
||||
"exponential_backoff": exponential_backoff,
|
||||
"jitter": jitter,
|
||||
"timeout": kwargs.get("timeout"),
|
||||
"bucket_name": kwargs.get("bucket_name"),
|
||||
"vertex_credentials": kwargs.get("vertex_credentials"),
|
||||
|
||||
+25
-156
@@ -415,11 +415,6 @@ async def acompletion(
|
||||
web_search_options: Optional[OpenAIWebSearchOptions] = None,
|
||||
# Session management
|
||||
shared_session: Optional["ClientSession"] = None,
|
||||
# Retry params
|
||||
retry_delay: Optional[float] = None,
|
||||
exponential_backoff: Optional[bool] = None,
|
||||
jitter: Optional[bool] = None,
|
||||
num_retries: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Union[ModelResponse, CustomStreamWrapper]:
|
||||
"""
|
||||
@@ -465,16 +460,6 @@ async def acompletion(
|
||||
- The `completion` function is called using `run_in_executor` to execute synchronously in the event loop.
|
||||
- If `stream` is True, the function returns an async generator that yields completion lines.
|
||||
"""
|
||||
if _update_kwargs_with_retry_params(
|
||||
retry_delay,
|
||||
exponential_backoff,
|
||||
jitter,
|
||||
num_retries,
|
||||
locals().get("max_retries"),
|
||||
kwargs,
|
||||
):
|
||||
return await acompletion_with_retries(model=model, messages=messages, **kwargs)
|
||||
|
||||
fallbacks = kwargs.get("fallbacks", None)
|
||||
mock_timeout = kwargs.get("mock_timeout", None)
|
||||
|
||||
@@ -1007,32 +992,8 @@ def _drop_input_examples_from_tools(
|
||||
return cleaned_tools
|
||||
|
||||
|
||||
def _update_kwargs_with_retry_params(
|
||||
retry_delay: Optional[float],
|
||||
exponential_backoff: Optional[bool],
|
||||
jitter: Optional[bool],
|
||||
num_retries: Optional[int],
|
||||
max_retries: Optional[int],
|
||||
kwargs: dict,
|
||||
) -> bool:
|
||||
"""
|
||||
Updates kwargs with retry parameters if any are provided.
|
||||
Returns True if retry logic should be triggered, False otherwise.
|
||||
"""
|
||||
if retry_delay is not None or exponential_backoff is not None or jitter is not None:
|
||||
kwargs["retry_delay"] = retry_delay
|
||||
kwargs["exponential_backoff"] = exponential_backoff
|
||||
kwargs["jitter"] = jitter
|
||||
|
||||
if num_retries is not None:
|
||||
kwargs["num_retries"] = num_retries
|
||||
elif max_retries is not None and "num_retries" not in kwargs:
|
||||
kwargs["num_retries"] = max_retries
|
||||
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@tracer.wrap()
|
||||
@client
|
||||
def completion( # type: ignore # noqa: PLR0915
|
||||
model: str,
|
||||
# Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
|
||||
@@ -1082,11 +1043,6 @@ def completion( # type: ignore # noqa: PLR0915
|
||||
thinking: Optional[AnthropicThinkingParam] = None,
|
||||
# Session management
|
||||
shared_session: Optional["ClientSession"] = None,
|
||||
# Retry params
|
||||
retry_delay: Optional[float] = None,
|
||||
exponential_backoff: Optional[bool] = None,
|
||||
jitter: Optional[bool] = None,
|
||||
num_retries: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Union[ModelResponse, CustomStreamWrapper]:
|
||||
"""
|
||||
@@ -1134,20 +1090,6 @@ def completion( # type: ignore # noqa: PLR0915
|
||||
- It supports various optional parameters for customizing the completion behavior.
|
||||
- If 'mock_response' is provided, a mock completion response is returned for testing or debugging.
|
||||
"""
|
||||
if _update_kwargs_with_retry_params(
|
||||
retry_delay,
|
||||
exponential_backoff,
|
||||
jitter,
|
||||
num_retries,
|
||||
locals().get("max_retries"),
|
||||
kwargs,
|
||||
):
|
||||
# check if this is an async call (acompletion=True in kwargs)
|
||||
if kwargs.get("acompletion", False) is True:
|
||||
return acompletion_with_retries(model=model, messages=messages, **kwargs)
|
||||
|
||||
return completion_with_retries(model=model, messages=messages, **kwargs)
|
||||
|
||||
### VALIDATE Request ###
|
||||
if model is None:
|
||||
raise ValueError("model param not passed in.")
|
||||
@@ -1173,9 +1115,7 @@ def completion( # type: ignore # noqa: PLR0915
|
||||
# Check if MCP tools are present (following responses pattern)
|
||||
# Cast tools to Optional[Iterable[ToolParam]] for type checking
|
||||
tools_for_mcp = cast(Optional[Iterable[ToolParam]], tools)
|
||||
if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(
|
||||
tools=tools_for_mcp
|
||||
):
|
||||
if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(tools=tools_for_mcp):
|
||||
# Return coroutine - acompletion will await it
|
||||
# completion() can return a coroutine when MCP tools are present, which acompletion() awaits
|
||||
return acompletion_with_mcp( # type: ignore[return-value]
|
||||
@@ -2405,7 +2345,11 @@ def completion( # type: ignore # noqa: PLR0915
|
||||
input=messages, api_key=api_key, original_response=response
|
||||
)
|
||||
elif custom_llm_provider == "minimax":
|
||||
api_key = api_key or get_secret_str("MINIMAX_API_KEY") or litellm.api_key
|
||||
api_key = (
|
||||
api_key
|
||||
or get_secret_str("MINIMAX_API_KEY")
|
||||
or litellm.api_key
|
||||
)
|
||||
|
||||
api_base = (
|
||||
api_base
|
||||
@@ -2453,9 +2397,7 @@ def completion( # type: ignore # noqa: PLR0915
|
||||
or custom_llm_provider == "wandb"
|
||||
or custom_llm_provider == "clarifai"
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
or JSONProviderRegistry.exists(
|
||||
custom_llm_provider
|
||||
) # JSON-configured providers
|
||||
or JSONProviderRegistry.exists(custom_llm_provider) # JSON-configured providers
|
||||
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
|
||||
): # allow user to make an openai call with a custom base
|
||||
# note: if a user sets a custom base - we should ensure this works
|
||||
@@ -4295,51 +4237,24 @@ def completion_with_retries(*args, **kwargs):
|
||||
retry_strategy: Literal["exponential_backoff_retry", "constant_retry"] = kwargs.pop(
|
||||
"retry_strategy", "constant_retry"
|
||||
) # type: ignore
|
||||
retry_delay = kwargs.pop("retry_delay", None)
|
||||
exponential_backoff = kwargs.pop("exponential_backoff", False)
|
||||
jitter = kwargs.pop("jitter", False)
|
||||
|
||||
original_function = kwargs.pop("original_function", completion)
|
||||
|
||||
# +1 because stop_after_attempt includes the initial attempt
|
||||
stop_after = tenacity.stop_after_attempt(num_retries + 1)
|
||||
|
||||
if retry_strategy == "exponential_backoff_retry" or exponential_backoff:
|
||||
# Defaults for exponential backoff
|
||||
multiplier = 1
|
||||
min_wait = 0
|
||||
if retry_delay is not None:
|
||||
multiplier = retry_delay
|
||||
min_wait = retry_delay
|
||||
|
||||
wait_args = {"multiplier": multiplier, "min": min_wait, "max": 10}
|
||||
|
||||
if jitter:
|
||||
wait_strategy = tenacity.wait_random_exponential(**wait_args)
|
||||
else:
|
||||
wait_strategy = tenacity.wait_exponential(**wait_args)
|
||||
|
||||
if retry_strategy == "exponential_backoff_retry":
|
||||
retryer = tenacity.Retrying(
|
||||
wait=wait_strategy,
|
||||
stop=stop_after,
|
||||
wait=tenacity.wait_exponential(multiplier=1, max=10),
|
||||
stop=tenacity.stop_after_attempt(num_retries),
|
||||
reraise=True,
|
||||
)
|
||||
else:
|
||||
wait_strategy = tenacity.wait_none()
|
||||
if retry_delay:
|
||||
wait_strategy = tenacity.wait_fixed(retry_delay)
|
||||
|
||||
retryer = tenacity.Retrying(
|
||||
wait=wait_strategy,
|
||||
stop=stop_after,
|
||||
reraise=True,
|
||||
stop=tenacity.stop_after_attempt(num_retries), reraise=True
|
||||
)
|
||||
return retryer(original_function, *args, **kwargs)
|
||||
|
||||
|
||||
async def acompletion_with_retries(*args, **kwargs):
|
||||
"""
|
||||
Executes a litellm.completion() with retries.
|
||||
[DEPRECATED]. Use 'acompletion' or router.acompletion instead!
|
||||
Executes a litellm.completion() with 3 retries
|
||||
"""
|
||||
try:
|
||||
import tenacity
|
||||
@@ -4352,65 +4267,18 @@ async def acompletion_with_retries(*args, **kwargs):
|
||||
kwargs["max_retries"] = 0
|
||||
kwargs["num_retries"] = 0
|
||||
retry_strategy = kwargs.pop("retry_strategy", "constant_retry")
|
||||
retry_delay = kwargs.pop("retry_delay", None)
|
||||
exponential_backoff = kwargs.pop("exponential_backoff", False)
|
||||
jitter = kwargs.pop("jitter", False)
|
||||
original_function = kwargs.pop("original_function", completion)
|
||||
|
||||
# +1 because stop_after_attempt includes the initial attempt
|
||||
stop_after = tenacity.stop_after_attempt(num_retries + 1)
|
||||
|
||||
# If the original function is completion but we are doing async retries
|
||||
# we need to ensure it's treated as an async function if it returns a coroutine
|
||||
# or wraps it.
|
||||
# However, since we are in acompletion_with_retries, we expect to be waiting.
|
||||
# If original_function is completion(acompletion=True), it returns a coro.
|
||||
# Tenacity AsyncRetrying expects the function to be awaitable or return awaitable?
|
||||
# Actually AsyncRetrying works with async def functions.
|
||||
# If original_function is sync (but returns coro), we might need a wrapper.
|
||||
|
||||
async def _async_original_function(*args, **kwargs):
|
||||
# Ensure we await the result if it is a coroutine
|
||||
result = original_function(*args, **kwargs)
|
||||
if asyncio.iscoroutine(result):
|
||||
return await result
|
||||
return result
|
||||
|
||||
if retry_strategy == "exponential_backoff_retry" or exponential_backoff:
|
||||
# Defaults for exponential backoff
|
||||
multiplier = 1
|
||||
min_wait = 0
|
||||
if retry_delay is not None:
|
||||
multiplier = retry_delay
|
||||
min_wait = retry_delay
|
||||
|
||||
wait_args = {"multiplier": multiplier, "min": min_wait, "max": 10}
|
||||
|
||||
if jitter:
|
||||
# Use wait_random_exponential if available or combine
|
||||
# Using wait_exponential + wait_random is one way, or wait_random_exponential
|
||||
# tenacity.wait_random_exponential(multiplier=1, max=10)
|
||||
wait_strategy = tenacity.wait_random_exponential(**wait_args)
|
||||
else:
|
||||
wait_strategy = tenacity.wait_exponential(**wait_args)
|
||||
|
||||
if retry_strategy == "exponential_backoff_retry":
|
||||
retryer = tenacity.AsyncRetrying(
|
||||
wait=wait_strategy,
|
||||
stop=stop_after,
|
||||
wait=tenacity.wait_exponential(multiplier=1, max=10),
|
||||
stop=tenacity.stop_after_attempt(num_retries),
|
||||
reraise=True,
|
||||
)
|
||||
else:
|
||||
wait_strategy = tenacity.wait_none()
|
||||
if retry_delay:
|
||||
wait_strategy = tenacity.wait_fixed(retry_delay)
|
||||
|
||||
retryer = tenacity.AsyncRetrying(
|
||||
wait=wait_strategy,
|
||||
stop=stop_after,
|
||||
reraise=True,
|
||||
stop=tenacity.stop_after_attempt(num_retries), reraise=True
|
||||
)
|
||||
|
||||
return await retryer(_async_original_function, *args, **kwargs)
|
||||
return await retryer(original_function, *args, **kwargs)
|
||||
|
||||
|
||||
def responses_with_retries(*args, **kwargs):
|
||||
@@ -4848,7 +4716,7 @@ def embedding( # noqa: PLR0915
|
||||
|
||||
if headers is not None and headers != {}:
|
||||
optional_params["extra_headers"] = headers
|
||||
|
||||
|
||||
if encoding_format is not None:
|
||||
optional_params["encoding_format"] = encoding_format
|
||||
else:
|
||||
@@ -6911,7 +6779,9 @@ def speech( # noqa: PLR0915
|
||||
if text_to_speech_provider_config is None:
|
||||
text_to_speech_provider_config = MinimaxTextToSpeechConfig()
|
||||
|
||||
minimax_config = cast(MinimaxTextToSpeechConfig, text_to_speech_provider_config)
|
||||
minimax_config = cast(
|
||||
MinimaxTextToSpeechConfig, text_to_speech_provider_config
|
||||
)
|
||||
|
||||
if api_base is not None:
|
||||
litellm_params_dict["api_base"] = api_base
|
||||
@@ -7051,7 +6921,7 @@ async def ahealth_check(
|
||||
custom_llm_provider_from_params = model_params.get("custom_llm_provider", None)
|
||||
api_base_from_params = model_params.get("api_base", None)
|
||||
api_key_from_params = model_params.get("api_key", None)
|
||||
|
||||
|
||||
model, custom_llm_provider, _, _ = get_llm_provider(
|
||||
model=model,
|
||||
custom_llm_provider=custom_llm_provider_from_params,
|
||||
@@ -7429,7 +7299,6 @@ def __getattr__(name: str) -> Any:
|
||||
_encoding = tiktoken.get_encoding("cl100k_base")
|
||||
# Cache it in the module's __dict__ for subsequent accesses
|
||||
import sys
|
||||
|
||||
sys.modules[__name__].__dict__["encoding"] = _encoding
|
||||
global _encoding_cache
|
||||
_encoding_cache = _encoding
|
||||
|
||||
@@ -125,14 +125,13 @@ class SearchContextCostPerQuery(TypedDict, total=False):
|
||||
class AgenticLoopParams(TypedDict, total=False):
|
||||
"""
|
||||
Parameters passed to agentic loop hooks (e.g., WebSearch interception).
|
||||
|
||||
|
||||
Stored in logging_obj.model_call_details["agentic_loop_params"] to provide
|
||||
agentic hooks with the original request context needed for follow-up calls.
|
||||
"""
|
||||
|
||||
model: str
|
||||
"""The model string with provider prefix (e.g., 'bedrock/invoke/...')"""
|
||||
|
||||
|
||||
custom_llm_provider: str
|
||||
"""The LLM provider name (e.g., 'bedrock', 'anthropic')"""
|
||||
|
||||
@@ -2925,9 +2924,6 @@ all_litellm_params = (
|
||||
"shared_session",
|
||||
"search_tool_name",
|
||||
"order",
|
||||
"retry_delay",
|
||||
"exponential_backoff",
|
||||
"jitter",
|
||||
]
|
||||
+ list(StandardCallbackDynamicParams.__annotations__.keys())
|
||||
+ list(CustomPricingLiteLLMParams.model_fields.keys())
|
||||
|
||||
Reference in New Issue
Block a user