docs: usage-based routing perf warnings (#14080)

This commit is contained in:
Mubashir Osmani
2025-08-29 20:31:12 -04:00
committed by GitHub
parent 86c5de17bc
commit 38df2500b6
5 changed files with 154 additions and 145 deletions
+3 -3
View File
@@ -53,8 +53,8 @@ model_list = [
},
]
router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="simple-shuffle", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="simple-shuffle", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
@@ -142,7 +142,7 @@ router_settings:
redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
routing_strategy: usage-based-routing-v2
routing_strategy: simple-shuffle # recommended for best performance
```
### 2. Start proxy 2 instances
@@ -236,7 +236,7 @@ Most values can also be set via `litellm_settings`. If you see overlapping value
```yaml
router_settings:
routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - RECOMMENDED for best performance
redis_host: <your-redis-host> # string
redis_password: <your-redis-password> # string
redis_port: <your-redis-port> # string
+4 -1
View File
@@ -90,7 +90,7 @@ Recommended to do this for prod:
```yaml
router_settings:
routing_strategy: usage-based-routing-v2
routing_strategy: simple-shuffle # (default) - recommended for best performance
# redis_url: "os.environ/REDIS_URL"
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
@@ -105,6 +105,9 @@ litellm_settings:
password: os.environ/REDIS_PASSWORD
```
> **WARNING**
**Usage-based routing is not recommended for production due to performance impacts.** Use `simple-shuffle` (default) for optimal performance in high-traffic scenarios.
## 5. Disable 'load_dotenv'
Set `export LITELLM_MODE="PRODUCTION"`
+145 -139
View File
@@ -154,11 +154,153 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
## Advanced - Routing Strategies ⭐️
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
Router provides 4 strategies for routing your calls across multiple deployments:
Router provides multiple strategies for routing your calls across multiple deployments. **We recommend using `simple-shuffle` (default) for best performance in production.**
<Tabs>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick - RECOMMENDED">
**Default and Recommended for Production** - Best performance with minimal latency overhead.
Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
If `rpm` or `tpm` is not provided, it randomly picks a deployment
You can also set a `weight` param to control how often each deployment gets picked relative to the others.
<Tabs>
<TabItem value="rpm" label="RPM-based shuffling">
##### **LiteLLM Proxy Config.yaml**
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
rpm: 900
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-functioncalling
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
rpm: 10
```
##### **Python SDK**
```python
from litellm import Router
import asyncio
import os
model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"rpm": 900, # requests per minute for this API
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"rpm": 10,
}
},]
# init router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="weight" label="Weight-based shuffling">
##### **LiteLLM Proxy Config.yaml**
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
weight: 9
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-functioncalling
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
weight: 1
```
##### **Python SDK**
```python
from litellm import Router
import asyncio
import os
model_list = [{
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": {
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"weight": 9, # pick this 90% of the time
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"weight": 1,
}
}]
# init router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="usage-based-v2" label="Rate-Limit Aware v2 (ASYNC)">
:::warning
**Usage-based routing is not recommended for production due to performance impacts.** Use `simple-shuffle` (default) for optimal performance in high-traffic scenarios. Usage-based routing adds significant latency due to Redis operations for tracking usage across deployments.
:::
**🎉 NEW** This is an async implementation of usage-based-routing.
**Filters out deployment if tpm/rpm limit exceeded** - If you pass in the deployment's tpm/rpm limits.
@@ -209,7 +351,7 @@ router = Router(model_list=model_list,
redis_host=os.environ["REDIS_HOST"],
redis_password=os.environ["REDIS_PASSWORD"],
redis_port=os.environ["REDIS_PORT"],
routing_strategy="usage-based-routing-v2" # 👈 KEY CHANGE
routing_strategy="simple-shuffle", # 👈 RECOMMENDED - best performance
enable_pre_call_checks=True, # enables router rate limits for concurrent calls
)
@@ -241,7 +383,7 @@ model_list:
rpm: 1000
router_settings:
routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE
routing_strategy: simple-shuffle # 👈 RECOMMENDED - best performance
redis_host: <your-redis-host>
redis_password: <your-redis-password>
redis_port: <your-redis-port>
@@ -365,143 +507,7 @@ router_settings:
```
</TabItem>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">
**Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
If `rpm` or `tpm` is not provided, it randomly picks a deployment
You can also set a `weight` param, to specify which model should get picked when.
<Tabs>
<TabItem value="rpm" label="RPM-based shuffling">
##### **LiteLLM Proxy Config.yaml**
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
rpm: 900
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-functioncalling
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
rpm: 10
```
##### **Python SDK**
```python
from litellm import Router
import asyncio
model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"rpm": 900, # requests per minute for this API
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"rpm": 10,
}
},]
# init router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="weight" label="Weight-based shuffling">
##### **LiteLLM Proxy Config.yaml**
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
weight: 9
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-functioncalling
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
weight: 1
```
##### **Python SDK**
```python
from litellm import Router
import asyncio
model_list = [{
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": {
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"weight": 9, # pick this 90% of the time
}
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"weight": 1,
}
}]
# init router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="usage-based" label="Rate-Limit Aware">
This will route to the deployment with the lowest TPM usage for that minute.
+1 -1
View File
@@ -41,7 +41,7 @@ router = Router(
},
],
timeout=2, # timeout request if takes > 2s
routing_strategy="usage-based-routing-v2",
routing_strategy="simple-shuffle", # recommended for best performance
polling_interval=0.03 # poll queue every 30ms if no healthy deployments
)