diff --git a/docs/my-website/docs/load_test_rpm.md b/docs/my-website/docs/load_test_rpm.md index 0954ffcdfa..b7621a7646 100644 --- a/docs/my-website/docs/load_test_rpm.md +++ b/docs/my-website/docs/load_test_rpm.md @@ -53,8 +53,8 @@ model_list = [ }, ] -router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD")) -router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD")) +router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="simple-shuffle", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD")) +router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="simple-shuffle", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD")) @@ -142,7 +142,7 @@ router_settings: redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis redis_password: os.environ/REDIS_PASSWORD redis_port: os.environ/REDIS_PORT - routing_strategy: usage-based-routing-v2 + routing_strategy: simple-shuffle # recommended for best performance ``` ### 2. Start proxy 2 instances diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index a89654af0c..1b96c32bdb 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -236,7 +236,7 @@ Most values can also be set via `litellm_settings`. 
If you see overlapping value ```yaml router_settings: - routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - RECOMMENDED for best performance redis_host: # string redis_password: # string redis_port: # string diff --git a/docs/my-website/docs/proxy/prod.md b/docs/my-website/docs/proxy/prod.md index fb2acf230c..a45474f39e 100644 --- a/docs/my-website/docs/proxy/prod.md +++ b/docs/my-website/docs/proxy/prod.md @@ -90,7 +90,7 @@ Recommended to do this for prod: ```yaml router_settings: - routing_strategy: usage-based-routing-v2 + routing_strategy: simple-shuffle # (default) - recommended for best performance # redis_url: "os.environ/REDIS_URL" redis_host: os.environ/REDIS_HOST redis_port: os.environ/REDIS_PORT @@ -105,6 +105,9 @@ litellm_settings: password: os.environ/REDIS_PASSWORD ``` +> **WARNING** +**Usage-based routing is not recommended for production due to performance impacts.** Use `simple-shuffle` (default) for optimal performance in high-traffic scenarios. + ## 5. Disable 'load_dotenv' Set `export LITELLM_MODE="PRODUCTION"` diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index fbb069895d..971427806e 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -154,11 +154,153 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ ## Advanced - Routing Strategies ⭐️ #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based -Router provides 4 strategies for routing your calls across multiple deployments: +Router provides multiple strategies for routing your calls across multiple deployments. 
**We recommend using `simple-shuffle` (default) for best performance in production.** + + +**Default and Recommended for Production** - Best performance with minimal latency overhead. + +Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)** + +If `rpm` or `tpm` is not provided, it randomly picks a deployment + +You can also set a `weight` param, to specify which model should get picked when. + + + + +##### **LiteLLM Proxy Config.yaml** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/chatgpt-v-2 + api_key: os.environ/AZURE_API_KEY + api_version: os.environ/AZURE_API_VERSION + api_base: os.environ/AZURE_API_BASE + rpm: 900 + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/chatgpt-functioncalling + api_key: os.environ/AZURE_API_KEY + api_version: os.environ/AZURE_API_VERSION + api_base: os.environ/AZURE_API_BASE + rpm: 10 +``` + +##### **Python SDK** + +```python +from litellm import Router +import asyncio + +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", # actual model name + "api_key": os.getenv("AZURE_API_KEY"), + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + "rpm": 900, # requests per minute for this API + } +}, { + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", + "api_key": os.getenv("AZURE_API_KEY"), + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + "rpm": 10, + } +},] + +# init router +router = Router(model_list=model_list, routing_strategy="simple-shuffle") +async def router_acompletion(): + response = await router.acompletion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}] + ) + print(response) + 
return response + +asyncio.run(router_acompletion()) +``` + + + + +##### **LiteLLM Proxy Config.yaml** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/chatgpt-v-2 + api_key: os.environ/AZURE_API_KEY + api_version: os.environ/AZURE_API_VERSION + api_base: os.environ/AZURE_API_BASE + weight: 9 + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/chatgpt-functioncalling + api_key: os.environ/AZURE_API_KEY + api_version: os.environ/AZURE_API_VERSION + api_base: os.environ/AZURE_API_BASE + weight: 1 +``` + +##### **Python SDK** + +```python +from litellm import Router +import asyncio + +model_list = [{ + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { + "model": "azure/chatgpt-v-2", # actual model name + "api_key": os.getenv("AZURE_API_KEY"), + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + "weight": 9, # pick this 90% of the time + } +}, { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "azure/chatgpt-functioncalling", + "api_key": os.getenv("AZURE_API_KEY"), + "api_version": os.getenv("AZURE_API_VERSION"), + "api_base": os.getenv("AZURE_API_BASE"), + "weight": 1, + } +}] + +# init router +router = Router(model_list=model_list, routing_strategy="simple-shuffle") +async def router_acompletion(): + response = await router.acompletion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}] + ) + print(response) + return response + +asyncio.run(router_acompletion()) +``` + + + + + +> [!WARNING] +**Usage-based routing is not recommended for production due to performance impacts.** Use `simple-shuffle` (default) for optimal performance in high-traffic scenarios. Usage-based routing adds significant latency due to Redis operations for tracking usage across deployments. + + **🎉 NEW** This is an async implementation of usage-based-routing. 
**Filters out deployment if tpm/rpm limit exceeded** - If you pass in the deployment's tpm/rpm limits. @@ -209,7 +351,7 @@ router = Router(model_list=model_list, redis_host=os.environ["REDIS_HOST"], redis_password=os.environ["REDIS_PASSWORD"], redis_port=os.environ["REDIS_PORT"], - routing_strategy="usage-based-routing-v2" # 👈 KEY CHANGE + routing_strategy="simple-shuffle", # 👈 RECOMMENDED - best performance enable_pre_call_checks=True, # enables router rate limits for concurrent calls ) @@ -241,7 +383,7 @@ model_list: rpm: 1000 router_settings: - routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE + routing_strategy: simple-shuffle # 👈 RECOMMENDED - best performance redis_host: redis_password: redis_port: @@ -365,143 +507,7 @@ router_settings: ``` - -**Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)** - -If `rpm` or `tpm` is not provided, it randomly picks a deployment - -You can also set a `weight` param, to specify which model should get picked when. 
- - - - -##### **LiteLLM Proxy Config.yaml** - -```yaml -model_list: - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/chatgpt-v-2 - api_key: os.environ/AZURE_API_KEY - api_version: os.environ/AZURE_API_VERSION - api_base: os.environ/AZURE_API_BASE - rpm: 900 - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/chatgpt-functioncalling - api_key: os.environ/AZURE_API_KEY - api_version: os.environ/AZURE_API_VERSION - api_base: os.environ/AZURE_API_BASE - rpm: 10 -``` - -##### **Python SDK** - -```python -from litellm import Router -import asyncio - -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", # actual model name - "api_key": os.getenv("AZURE_API_KEY"), - "api_version": os.getenv("AZURE_API_VERSION"), - "api_base": os.getenv("AZURE_API_BASE"), - "rpm": 900, # requests per minute for this API - } -}, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", - "api_key": os.getenv("AZURE_API_KEY"), - "api_version": os.getenv("AZURE_API_VERSION"), - "api_base": os.getenv("AZURE_API_BASE"), - "rpm": 10, - } -},] - -# init router -router = Router(model_list=model_list, routing_strategy="simple-shuffle") -async def router_acompletion(): - response = await router.acompletion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Hey, how's it going?"}] - ) - print(response) - return response - -asyncio.run(router_acompletion()) -``` - - - - -##### **LiteLLM Proxy Config.yaml** - -```yaml -model_list: - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/chatgpt-v-2 - api_key: os.environ/AZURE_API_KEY - api_version: os.environ/AZURE_API_VERSION - api_base: os.environ/AZURE_API_BASE - weight: 9 - - model_name: gpt-3.5-turbo - litellm_params: - model: azure/chatgpt-functioncalling - api_key: 
os.environ/AZURE_API_KEY - api_version: os.environ/AZURE_API_VERSION - api_base: os.environ/AZURE_API_BASE - weight: 1 -``` - - -##### **Python SDK** - -```python -from litellm import Router -import asyncio - -model_list = [{ - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { - "model": "azure/chatgpt-v-2", # actual model name - "api_key": os.getenv("AZURE_API_KEY"), - "api_version": os.getenv("AZURE_API_VERSION"), - "api_base": os.getenv("AZURE_API_BASE"), - "weight": 9, # pick this 90% of the time - } -}, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { - "model": "azure/chatgpt-functioncalling", - "api_key": os.getenv("AZURE_API_KEY"), - "api_version": os.getenv("AZURE_API_VERSION"), - "api_base": os.getenv("AZURE_API_BASE"), - "weight": 1, - } -}] - -# init router -router = Router(model_list=model_list, routing_strategy="simple-shuffle") -async def router_acompletion(): - response = await router.acompletion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Hey, how's it going?"}] - ) - print(response) - return response - -asyncio.run(router_acompletion()) -``` - - - - - This will route to the deployment with the lowest TPM usage for that minute. diff --git a/docs/my-website/docs/scheduler.md b/docs/my-website/docs/scheduler.md index 2b0a582626..9b84c374e3 100644 --- a/docs/my-website/docs/scheduler.md +++ b/docs/my-website/docs/scheduler.md @@ -41,7 +41,7 @@ router = Router( }, ], timeout=2, # timeout request if takes > 2s - routing_strategy="usage-based-routing-v2", + routing_strategy="simple-shuffle", # recommended for best performance polling_interval=0.03 # poll queue every 3ms if no healthy deployments )