docs: usage-based routing perf warnings (#14080)

2026-08-02 12:21:10 +00:00 · 2025-08-29 17:31:12 -07:00
parent 86c5de17bc
commit 38df2500b6
5 changed files with 154 additions and 145 deletions
@@ -53,8 +53,8 @@ model_list = [
    },
 ]

-router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
-router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="simple-shuffle", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="simple-shuffle", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))



@@ -142,7 +142,7 @@ router_settings:
  redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
-  routing_strategy: usage-based-routing-v2
+  routing_strategy: simple-shuffle # recommended for best performance
 ```

 ### 2. Start proxy 2 instances
@@ -236,7 +236,7 @@ Most values can also be set via `litellm_settings`. If you see overlapping value

 ```yaml
 router_settings:
-  routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - RECOMMENDED for best performance
  redis_host: <your-redis-host>           # string
  redis_password: <your-redis-password>   # string
  redis_port: <your-redis-port>           # string
@@ -90,7 +90,7 @@ Recommended to do this for prod:

 ```yaml
 router_settings:
-  routing_strategy: usage-based-routing-v2 
+  routing_strategy: simple-shuffle # (default) - recommended for best performance
  # redis_url: "os.environ/REDIS_URL"
  redis_host: os.environ/REDIS_HOST
  redis_port: os.environ/REDIS_PORT
@@ -105,6 +105,9 @@ litellm_settings:
    password: os.environ/REDIS_PASSWORD
 ```

+> **WARNING**
+**Usage-based routing is not recommended for production due to performance impacts.** Use `simple-shuffle` (default) for optimal performance in high-traffic scenarios.
+
 ## 5. Disable 'load_dotenv'

 Set `export LITELLM_MODE="PRODUCTION"`
@@ -154,11 +154,153 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
 ## Advanced - Routing Strategies ⭐️
 #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based

-Router provides 4 strategies for routing your calls across multiple deployments: 
+Router provides multiple strategies for routing your calls across multiple deployments. **We recommend using `simple-shuffle` (default) for best performance in production.**

 <Tabs>
+<TabItem value="simple-shuffle" label="(Default) Weighted Pick - RECOMMENDED">
+
+**Default and Recommended for Production** - Best performance with minimal latency overhead.
+
+Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
+
+If `rpm` or `tpm` is not provided, it randomly picks a deployment
+
+You can also set a `weight` param to control how often each deployment gets picked relative to the others.
+
+<Tabs>
+<TabItem value="rpm" label="RPM-based shuffling">
+
+##### **LiteLLM Proxy Config.yaml**
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_key: os.environ/AZURE_API_KEY
+      api_version: os.environ/AZURE_API_VERSION
+      api_base: os.environ/AZURE_API_BASE
+      rpm: 900
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/chatgpt-functioncalling
+      api_key: os.environ/AZURE_API_KEY
+      api_version: os.environ/AZURE_API_VERSION
+      api_base: os.environ/AZURE_API_BASE
+      rpm: 10
+```
+
+##### **Python SDK**
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments 
+	"model_name": "gpt-3.5-turbo", # model alias 
+	"litellm_params": { # params for litellm completion/embedding call 
+		"model": "azure/chatgpt-v-2", # actual model name
+		"api_key": os.getenv("AZURE_API_KEY"),
+		"api_version": os.getenv("AZURE_API_VERSION"),
+		"api_base": os.getenv("AZURE_API_BASE"),
+		"rpm": 900,			# requests per minute for this API
+	}
+}, {
+    "model_name": "gpt-3.5-turbo", 
+	"litellm_params": { # params for litellm completion/embedding call 
+		"model": "azure/chatgpt-functioncalling", 
+		"api_key": os.getenv("AZURE_API_KEY"),
+		"api_version": os.getenv("AZURE_API_VERSION"),
+		"api_base": os.getenv("AZURE_API_BASE"),
+		"rpm": 10,
+	}
+},]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="simple-shuffle")
+async def router_acompletion():
+	response = await router.acompletion(
+		model="gpt-3.5-turbo", 
+		messages=[{"role": "user", "content": "Hey, how's it going?"}]
+	)
+	print(response)
+	return response
+
+asyncio.run(router_acompletion())
+```
+
+</TabItem>
+<TabItem value="weight" label="Weight-based shuffling">
+
+##### **LiteLLM Proxy Config.yaml**
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_key: os.environ/AZURE_API_KEY
+      api_version: os.environ/AZURE_API_VERSION
+      api_base: os.environ/AZURE_API_BASE
+      weight: 9
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/chatgpt-functioncalling
+      api_key: os.environ/AZURE_API_KEY
+      api_version: os.environ/AZURE_API_VERSION
+      api_base: os.environ/AZURE_API_BASE
+      weight: 1
+```
+
+##### **Python SDK**
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{
+	"model_name": "gpt-3.5-turbo", # model alias 
+	"litellm_params": { 
+		"model": "azure/chatgpt-v-2", # actual model name
+		"api_key": os.getenv("AZURE_API_KEY"),
+		"api_version": os.getenv("AZURE_API_VERSION"),
+		"api_base": os.getenv("AZURE_API_BASE"),
+		"weight": 9, # pick this 90% of the time
+	}
+}, {
+    "model_name": "gpt-3.5-turbo", 
+	"litellm_params": { 
+		"model": "azure/chatgpt-functioncalling", 
+		"api_key": os.getenv("AZURE_API_KEY"),
+		"api_version": os.getenv("AZURE_API_VERSION"),
+		"api_base": os.getenv("AZURE_API_BASE"),
+		"weight": 1,
+	}
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="simple-shuffle")
+async def router_acompletion():
+	response = await router.acompletion(
+		model="gpt-3.5-turbo", 
+		messages=[{"role": "user", "content": "Hey, how's it going?"}]
+	)
+	print(response)
+	return response
+
+asyncio.run(router_acompletion())
+```
+
+</TabItem>
+</Tabs>
+
+</TabItem>
 <TabItem value="usage-based-v2" label="Rate-Limit Aware v2 (ASYNC)">

+> [!WARNING]  
+**Usage-based routing is not recommended for production due to performance impacts.** Use `simple-shuffle` (default) for optimal performance in high-traffic scenarios. Usage-based routing adds significant latency due to Redis operations for tracking usage across deployments.
+
+
 **🎉 NEW** This is an async implementation of usage-based-routing.

 **Filters out deployment if tpm/rpm limit exceeded** - If you pass in the deployment's tpm/rpm limits.
@@ -209,7 +351,7 @@ router = Router(model_list=model_list,
                redis_host=os.environ["REDIS_HOST"], 
 				redis_password=os.environ["REDIS_PASSWORD"], 
 				redis_port=os.environ["REDIS_PORT"], 
-                routing_strategy="usage-based-routing-v2" # 👈 KEY CHANGE
+                routing_strategy="simple-shuffle", # 👈 RECOMMENDED - best performance
 				enable_pre_call_checks=True, # enables router rate limits for concurrent calls
 				)

@@ -241,7 +383,7 @@ model_list:
 	  rpm: 1000

 router_settings:
-  routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE
+  routing_strategy: simple-shuffle # 👈 RECOMMENDED - best performance
  redis_host: <your-redis-host>
  redis_password: <your-redis-password>
  redis_port: <your-redis-port>
@@ -365,143 +507,7 @@ router_settings:
 ```

 </TabItem>
-<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">

-**Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
-
-If `rpm` or `tpm` is not provided, it randomly picks a deployment
-
-You can also set a `weight` param, to specify which model should get picked when.
-
-<Tabs>
-<TabItem value="rpm" label="RPM-based shuffling">
-
-##### **LiteLLM Proxy Config.yaml**
-
-```yaml
-model_list:
-	- model_name: gpt-3.5-turbo
-	  litellm_params:
-	  	model: azure/chatgpt-v-2
-		api_key: os.environ/AZURE_API_KEY
-		api_version: os.environ/AZURE_API_VERSION
-		api_base: os.environ/AZURE_API_BASE
-		rpm: 900 
-	- model_name: gpt-3.5-turbo
-	  litellm_params:
-	  	model: azure/chatgpt-functioncalling
-		api_key: os.environ/AZURE_API_KEY
-		api_version: os.environ/AZURE_API_VERSION
-		api_base: os.environ/AZURE_API_BASE
-		rpm: 10 
-```
-
-##### **Python SDK**
-
-```python
-from litellm import Router 
-import asyncio
-
-model_list = [{ # list of model deployments 
-	"model_name": "gpt-3.5-turbo", # model alias 
-	"litellm_params": { # params for litellm completion/embedding call 
-		"model": "azure/chatgpt-v-2", # actual model name
-		"api_key": os.getenv("AZURE_API_KEY"),
-		"api_version": os.getenv("AZURE_API_VERSION"),
-		"api_base": os.getenv("AZURE_API_BASE"),
-		"rpm": 900,			# requests per minute for this API
-	}
-}, {
-    "model_name": "gpt-3.5-turbo", 
-	"litellm_params": { # params for litellm completion/embedding call 
-		"model": "azure/chatgpt-functioncalling", 
-		"api_key": os.getenv("AZURE_API_KEY"),
-		"api_version": os.getenv("AZURE_API_VERSION"),
-		"api_base": os.getenv("AZURE_API_BASE"),
-		"rpm": 10,
-	}
-},]
-
-# init router
-router = Router(model_list=model_list, routing_strategy="simple-shuffle")
-async def router_acompletion():
-	response = await router.acompletion(
-		model="gpt-3.5-turbo", 
-		messages=[{"role": "user", "content": "Hey, how's it going?"}]
-	)
-	print(response)
-	return response
-
-asyncio.run(router_acompletion())
-```
-
-</TabItem>
-<TabItem value="weight" label="Weight-based shuffling">
-
-##### **LiteLLM Proxy Config.yaml**
-
-```yaml
-model_list:
-	- model_name: gpt-3.5-turbo
-	  litellm_params:
-	  	model: azure/chatgpt-v-2
-		api_key: os.environ/AZURE_API_KEY
-		api_version: os.environ/AZURE_API_VERSION
-		api_base: os.environ/AZURE_API_BASE
-		weight: 9
-	- model_name: gpt-3.5-turbo
-	  litellm_params:
-	  	model: azure/chatgpt-functioncalling
-		api_key: os.environ/AZURE_API_KEY
-		api_version: os.environ/AZURE_API_VERSION
-		api_base: os.environ/AZURE_API_BASE
-		weight: 1 
-```
-
-
-##### **Python SDK**
-
-```python
-from litellm import Router 
-import asyncio
-
-model_list = [{
-	"model_name": "gpt-3.5-turbo", # model alias 
-	"litellm_params": { 
-		"model": "azure/chatgpt-v-2", # actual model name
-		"api_key": os.getenv("AZURE_API_KEY"),
-		"api_version": os.getenv("AZURE_API_VERSION"),
-		"api_base": os.getenv("AZURE_API_BASE"),
-		"weight": 9, # pick this 90% of the time
-	}
-}, {
-    "model_name": "gpt-3.5-turbo", 
-	"litellm_params": { 
-		"model": "azure/chatgpt-functioncalling", 
-		"api_key": os.getenv("AZURE_API_KEY"),
-		"api_version": os.getenv("AZURE_API_VERSION"),
-		"api_base": os.getenv("AZURE_API_BASE"),
-		"weight": 1,
-	}
-}]
-
-# init router
-router = Router(model_list=model_list, routing_strategy="simple-shuffle")
-async def router_acompletion():
-	response = await router.acompletion(
-		model="gpt-3.5-turbo", 
-		messages=[{"role": "user", "content": "Hey, how's it going?"}]
-	)
-	print(response)
-	return response
-
-asyncio.run(router_acompletion())
-```
-
-</TabItem>
-</Tabs>
-
-</TabItem>
 <TabItem value="usage-based" label="Rate-Limit Aware">

 This will route to the deployment with the lowest TPM usage for that minute. 
@@ -41,7 +41,7 @@ router = Router(
        },
    ],
    timeout=2, # timeout request if takes > 2s
-    routing_strategy="usage-based-routing-v2",
+    routing_strategy="simple-shuffle", # recommended for best performance
    polling_interval=0.03 # poll queue every 3ms if no healthy deployments
 )