Merge branch 'main' into litellm_v1_messages_claude_4_6

This commit is contained in:
Sameer Kankute
2026-02-09 17:14:36 +05:30
committed by GitHub
54 changed files with 2804 additions and 266 deletions
+1 -1
View File
@@ -48,7 +48,7 @@ dist/
build/
*.egg-info/
.DS_Store
node_modules/
**/node_modules
*.log
.env
.env.local
+30 -1
View File
@@ -49,7 +49,22 @@ USER root
# Install runtime dependencies (libsndfile needed for audio processing on ARM64)
RUN apk add --no-cache bash openssl tzdata nodejs npm python3 py3-pip libsndfile && \
npm install -g npm@latest tar@latest
npm install -g npm@latest tar@7.5.7 glob@11.1.0 @isaacs/brace-expansion@5.0.1 && \
# SECURITY FIX: npm bundles tar, glob, and brace-expansion at multiple nested
# levels inside its dependency tree. `npm install -g <pkg>` only creates a
# SEPARATE global package, it does NOT replace npm's internal copies.
# We must find and replace EVERY copy inside npm's directory.
GLOBAL="$(npm root -g)" && \
find "$GLOBAL/npm" -type d -name "tar" -path "*/node_modules/tar" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/tar" "$d"; \
done && \
find "$GLOBAL/npm" -type d -name "glob" -path "*/node_modules/glob" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/glob" "$d"; \
done && \
find "$GLOBAL/npm" -type d -name "brace-expansion" -path "*/node_modules/@isaacs/brace-expansion" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/@isaacs/brace-expansion" "$d"; \
done && \
npm cache clean --force
WORKDIR /app
# Copy the current directory contents into the container at /app
@@ -71,6 +86,20 @@ RUN NODEJS_WHEEL_NODE=$(find /usr/lib -path "*/nodejs_wheel/bin/node" 2>/dev/nul
RUN find /usr/lib -type f -path "*/tornado/test/*" -delete && \
find /usr/lib -type d -path "*/tornado/test" -delete
# SECURITY FIX: nodejs-wheel-binaries (pip package used by Prisma) bundles a complete
# npm with old vulnerable deps at /usr/lib/python3.*/site-packages/nodejs_wheel/.
# Replace every copy of tar, glob, and @isaacs/brace-expansion inside that tree
# with the pinned copy found under the global npm root (`npm root -g`).
#   * `-prune` keeps find from descending into a directory that the loop is
#     deleting and re-creating while find is still walking the tree.
#   * `read -r` keeps backslashes in paths intact.
#   * The replacement source is checked BEFORE the bundled copy is removed, so a
#     package is never deleted without something to put in its place.
#   * `|| exit 1` fails the build on the first bad iteration; otherwise only the
#     status of the last iteration would reach the rest of the command.
RUN GLOBAL="$(npm root -g)" && \
    for pkg in tar glob @isaacs/brace-expansion; do \
        find /usr/lib -path "*/nodejs_wheel/*/node_modules/$pkg" -type d -prune -print | while read -r d; do \
            { [ -d "$GLOBAL/$pkg" ] && rm -rf "$d" && cp -rL "$GLOBAL/$pkg" "$d"; } || exit 1; \
        done || exit 1; \
    done
# Install semantic_router and aurelio-sdk using script
# Convert Windows line endings to Unix and make executable
RUN sed -i 's/\r$//' docker/install_auto_router.sh && chmod +x docker/install_auto_router.sh && ./docker/install_auto_router.sh
+1 -4
View File
@@ -155,10 +155,7 @@ run_grype_scans() {
"CVE-2025-12781" # No fix available yet
"CVE-2025-11468" # No fix available yet
"CVE-2026-1299" # Python 3.13 email module header injection - not applicable, LiteLLM doesn't use BytesGenerator for email serialization
"GHSA-7h2j-956f-4vf2" # @isaacs/brace-expansion ReDoS - npm tooling dependency, not used in application runtime
"GHSA-hx9q-6w63-j58v" # orjson deep recursion - no fix available yet
"GHSA-8qq5-rm4j-mr97" # node-tar symlink poisoning - npm tooling dependency, tar CLI not exposed in application code
"GHSA-29xp-372q-xqph" # node-tar race condition - npm tooling dependency, tar CLI not exposed in application code
"CVE-2026-0775" # npm cli incorrect permission assignment - no fix available yet, npm is only used at build/prisma-generate time
)
# Build JSON array of allowlisted CVE IDs for jq
+12 -1
View File
@@ -6,7 +6,18 @@ WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm && \
npm install -g npm@latest tar@latest
npm install -g npm@latest tar@7.5.7 glob@11.1.0 @isaacs/brace-expansion@5.0.1 && \
GLOBAL="$(npm root -g)" && \
find "$GLOBAL/npm" -type d -name "tar" -path "*/node_modules/tar" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/tar" "$d"; \
done && \
find "$GLOBAL/npm" -type d -name "glob" -path "*/node_modules/glob" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/glob" "$d"; \
done && \
find "$GLOBAL/npm" -type d -name "brace-expansion" -path "*/node_modules/@isaacs/brace-expansion" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/@isaacs/brace-expansion" "$d"; \
done && \
npm cache clean --force
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
+25 -4
View File
@@ -50,7 +50,18 @@ USER root
# Install runtime dependencies
RUN apk add --no-cache bash openssl tzdata nodejs npm python3 py3-pip libsndfile && \
npm install -g npm@latest tar@latest
npm install -g npm@latest tar@7.5.7 glob@11.1.0 @isaacs/brace-expansion@5.0.1 && \
GLOBAL="$(npm root -g)" && \
find "$GLOBAL/npm" -type d -name "tar" -path "*/node_modules/tar" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/tar" "$d"; \
done && \
find "$GLOBAL/npm" -type d -name "glob" -path "*/node_modules/glob" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/glob" "$d"; \
done && \
find "$GLOBAL/npm" -type d -name "brace-expansion" -path "*/node_modules/@isaacs/brace-expansion" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/@isaacs/brace-expansion" "$d"; \
done && \
npm cache clean --force
WORKDIR /app
# Copy the current directory contents into the container at /app
@@ -64,9 +75,19 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# Replace the nodejs-wheel-binaries bundled node with the system node (fixes CVE-2025-55130)
RUN NODEJS_WHEEL_NODE=$(find /usr/lib -path "*/nodejs_wheel/bin/node" 2>/dev/null) && \
if [ -n "$NODEJS_WHEEL_NODE" ]; then cp /usr/bin/node "$NODEJS_WHEEL_NODE"; fi
# SECURITY FIX: nodejs-wheel-binaries (pip package used by Prisma) bundles a complete
# npm with old vulnerable deps at /usr/lib/python3.*/site-packages/nodejs_wheel/.
# Replace every copy of tar, glob, and @isaacs/brace-expansion inside that tree
# with the pinned copy found under the global npm root (`npm root -g`).
#   * `-prune` keeps find from descending into a directory that the loop is
#     deleting and re-creating while find is still walking the tree.
#   * `read -r` keeps backslashes in paths intact.
#   * The replacement source is checked BEFORE the bundled copy is removed, so a
#     package is never deleted without something to put in its place.
#   * `|| exit 1` fails the build on the first bad iteration; otherwise only the
#     status of the last iteration would reach the rest of the command.
RUN GLOBAL="$(npm root -g)" && \
    for pkg in tar glob @isaacs/brace-expansion; do \
        find /usr/lib -path "*/nodejs_wheel/*/node_modules/$pkg" -type d -prune -print | while read -r d; do \
            { [ -d "$GLOBAL/$pkg" ] && rm -rf "$d" && cp -rL "$GLOBAL/$pkg" "$d"; } || exit 1; \
        done || exit 1; \
    done
# Install semantic_router and aurelio-sdk using script
# Convert Windows line endings to Unix and make executable
+26 -1
View File
@@ -62,7 +62,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
nodejs \
npm \
&& rm -rf /var/lib/apt/lists/* \
&& npm install -g npm@latest tar@latest
&& npm install -g npm@latest tar@7.5.7 glob@11.1.0 @isaacs/brace-expansion@5.0.1 \
&& GLOBAL="$(npm root -g)" \
&& find "$GLOBAL/npm" -type d -name "tar" -path "*/node_modules/tar" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/tar" "$d"; \
done \
&& find "$GLOBAL/npm" -type d -name "glob" -path "*/node_modules/glob" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/glob" "$d"; \
done \
&& find "$GLOBAL/npm" -type d -name "brace-expansion" -path "*/node_modules/@isaacs/brace-expansion" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/@isaacs/brace-expansion" "$d"; \
done \
&& npm cache clean --force
WORKDIR /app
@@ -80,6 +91,20 @@ RUN pip install --no-cache-dir *.whl /wheels/* --no-index --find-links=/wheels/
rm -f *.whl && \
rm -rf /wheels
# SECURITY FIX: nodejs-wheel-binaries (pip package used by Prisma) bundles a complete
# npm with old vulnerable deps at /usr/lib/python3.*/site-packages/nodejs_wheel/.
# Replace every copy of tar, glob, and @isaacs/brace-expansion inside that tree
# with the pinned copy found under the global npm root (`npm root -g`).
#   * `-prune` keeps find from descending into a directory that the loop is
#     deleting and re-creating while find is still walking the tree.
#   * `read -r` keeps backslashes in paths intact.
#   * The replacement source is checked BEFORE the bundled copy is removed, so a
#     package is never deleted without something to put in its place.
#   * `|| exit 1` fails the build on the first bad iteration; otherwise only the
#     status of the last iteration would reach the rest of the command.
RUN GLOBAL="$(npm root -g)" && \
    for pkg in tar glob @isaacs/brace-expansion; do \
        find /usr/lib -path "*/nodejs_wheel/*/node_modules/$pkg" -type d -prune -print | while read -r d; do \
            { [ -d "$GLOBAL/$pkg" ] && rm -rf "$d" && cp -rL "$GLOBAL/$pkg" "$d"; } || exit 1; \
        done || exit 1; \
    done
# Generate prisma client and set permissions
# Convert Windows line endings to Unix for entrypoint scripts
RUN prisma generate && \
+25 -4
View File
@@ -104,7 +104,18 @@ RUN for i in 1 2 3; do \
&& for i in 1 2 3; do \
apk add --no-cache python3 py3-pip bash openssl tzdata nodejs npm supervisor && break || sleep 5; \
done \
&& npm install -g npm@latest tar@latest
&& npm install -g npm@latest tar@7.5.7 glob@11.1.0 @isaacs/brace-expansion@5.0.1 \
&& GLOBAL="$(npm root -g)" \
&& find "$GLOBAL/npm" -type d -name "tar" -path "*/node_modules/tar" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/tar" "$d"; \
done \
&& find "$GLOBAL/npm" -type d -name "glob" -path "*/node_modules/glob" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/glob" "$d"; \
done \
&& find "$GLOBAL/npm" -type d -name "brace-expansion" -path "*/node_modules/@isaacs/brace-expansion" | while read d; do \
rm -rf "$d" && cp -rL "$GLOBAL/@isaacs/brace-expansion" "$d"; \
done \
&& npm cache clean --force
# Copy artifacts from builder
COPY --from=builder /app/requirements.txt /app/requirements.txt
@@ -146,9 +157,19 @@ RUN pip install --no-index --find-links=/wheels/ -r requirements.txt && \
fi; \
fi
# Replace the nodejs-wheel-binaries bundled node with the system node (fixes CVE-2025-55130)
RUN NODEJS_WHEEL_NODE=$(find /usr/lib -path "*/nodejs_wheel/bin/node" 2>/dev/null) && \
if [ -n "$NODEJS_WHEEL_NODE" ]; then cp /usr/bin/node "$NODEJS_WHEEL_NODE"; fi
# SECURITY FIX: nodejs-wheel-binaries (pip package used by Prisma) bundles a complete
# npm with old vulnerable deps at /usr/lib/python3.*/site-packages/nodejs_wheel/.
# Replace every copy of tar, glob, and @isaacs/brace-expansion inside that tree
# with the pinned copy found under the global npm root (`npm root -g`).
#   * `-prune` keeps find from descending into a directory that the loop is
#     deleting and re-creating while find is still walking the tree.
#   * `read -r` keeps backslashes in paths intact.
#   * The replacement source is checked BEFORE the bundled copy is removed, so a
#     package is never deleted without something to put in its place.
#   * `|| exit 1` fails the build on the first bad iteration; otherwise only the
#     status of the last iteration would reach the rest of the command.
RUN GLOBAL="$(npm root -g)" && \
    for pkg in tar glob @isaacs/brace-expansion; do \
        find /usr/lib -path "*/nodejs_wheel/*/node_modules/$pkg" -type d -prune -print | while read -r d; do \
            { [ -d "$GLOBAL/$pkg" ] && rm -rf "$d" && cp -rL "$GLOBAL/$pkg" "$d"; } || exit 1; \
        done || exit 1; \
    done
# Permissions, cleanup, and Prisma prep
# Convert Windows line endings to Unix for entrypoint scripts
@@ -227,6 +227,28 @@ response = litellm.completion(
)
```
## OAuth2/JWT Authentication
If your LiteLLM Proxy requires OAuth2/JWT authentication (e.g., Azure AD, Keycloak, Okta), the SDK can automatically obtain and refresh tokens for you.
```python
import litellm
from litellm.proxy_auth import AzureADCredential, ProxyAuthHandler
litellm.proxy_auth = ProxyAuthHandler(
credential=AzureADCredential(),
scope="api://my-litellm-proxy/.default"
)
litellm.api_base = "https://my-proxy.example.com"
response = litellm.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
[Learn more about SDK Proxy Authentication (OAuth2/JWT Auto-Refresh) →](../proxy_auth)
## Sending `tags` to LiteLLM Proxy
Tags allow you to categorize and track your API requests for monitoring, debugging, and analytics purposes. You can send tags as a list of strings to the LiteLLM Proxy using the `extra_body` parameter.
@@ -100,7 +100,7 @@ In cases where encounter other errors when apply Zscaler AI Guard, return exampl
}
}
```
## 6. Sending User Information to Zscaler AI Guard for Analysis (Optional)
## 6. Sending User Information to Zscaler AI Guard (Optional)
If you need to send end-user information to Zscaler AI Guard for analysis, you can set the configuration in the environment variables to True and include the relevant information in custom_headers on Zscaler AI Guard.
- To send user_api_key_alias:
@@ -133,4 +133,30 @@ curl -i http://localhost:8165/v1/chat/completions \
"zguard_policy_id": <the custom policy id>
}
}'
```
## 8. Set a Custom Zscaler AI Guard Policy in LiteLLM Team or Key Metadata (Optional)
In addition to setting `zguard_policy_id` in a request or the configuration file, you can also set it in the metadata for LiteLLM Team or Key. The `zguard_policy_id` is determined using the following order of precedence: request, Key, Team, config file. This logic is illustrated below:
```
user_api_key_metadata = metadata.get("user_api_key_metadata", {}) or {}
team_metadata = metadata.get("team_metadata", {}) or {}
policy_id = (
metadata.get("zguard_policy_id")
if "zguard_policy_id" in metadata
else (
user_api_key_metadata.get("zguard_policy_id")
if "zguard_policy_id" in user_api_key_metadata
else (
team_metadata.get("zguard_policy_id")
if "zguard_policy_id" in team_metadata
else self.policy_id
)
)
)
```
You can leverage this feature to apply multiple policies configured on the Zscaler AI Guard (ZGuard) to traffic from different applications. (Note: It is recommended to map policies using either Team or Key metadata, but not a mix of both.)
Example value to set in the Team or Key metadata (this can be done from the UI):
```
{"zguard_policy_id": 100}
```
+333
View File
@@ -0,0 +1,333 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# SDK Proxy Authentication (OAuth2/JWT Auto-Refresh)
Automatically obtain and refresh OAuth2/JWT tokens when using the LiteLLM Python SDK with a LiteLLM Proxy that requires JWT authentication.
## Overview
When your LiteLLM Proxy is protected by an OAuth2/OIDC provider (Azure AD, Keycloak, Okta, Auth0, etc.), your SDK clients need valid JWT tokens for every request. Instead of manually managing token lifecycle, `litellm.proxy_auth` handles this automatically:
- Obtains tokens from your identity provider
- Caches tokens to avoid unnecessary requests
- Refreshes tokens before they expire (60-second buffer)
- Injects `Authorization: Bearer <token>` headers into every request
## Quick Start
### Azure AD
<Tabs>
<TabItem value="default" label="DefaultAzureCredential">
Uses the [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity.defaultazurecredential) chain (environment variables, managed identity, Azure CLI, etc.):
```python
import litellm
from litellm.proxy_auth import AzureADCredential, ProxyAuthHandler
# One-time setup
litellm.proxy_auth = ProxyAuthHandler(
credential=AzureADCredential(), # uses DefaultAzureCredential
scope="api://my-litellm-proxy/.default"
)
litellm.api_base = "https://my-proxy.example.com"
# All requests now include Authorization headers automatically
response = litellm.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
</TabItem>
<TabItem value="client-secret" label="ClientSecretCredential">
Use a specific Azure AD app registration:
```python
import litellm
from azure.identity import ClientSecretCredential
from litellm.proxy_auth import AzureADCredential, ProxyAuthHandler
azure_cred = ClientSecretCredential(
tenant_id="your-tenant-id",
client_id="your-client-id",
client_secret="your-client-secret"
)
litellm.proxy_auth = ProxyAuthHandler(
credential=AzureADCredential(credential=azure_cred),
scope="api://my-litellm-proxy/.default"
)
litellm.api_base = "https://my-proxy.example.com"
response = litellm.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
</TabItem>
</Tabs>
**Required package:** `pip install azure-identity`
### Generic OAuth2 (Okta, Auth0, Keycloak, etc.)
Works with any OAuth2 provider that supports the `client_credentials` grant type:
```python
import litellm
from litellm.proxy_auth import GenericOAuth2Credential, ProxyAuthHandler
litellm.proxy_auth = ProxyAuthHandler(
credential=GenericOAuth2Credential(
client_id="your-client-id",
client_secret="your-client-secret",
token_url="https://your-idp.example.com/oauth2/token"
),
scope="litellm_proxy_api"
)
litellm.api_base = "https://my-proxy.example.com"
response = litellm.completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
### Custom Credential Provider
Implement the `TokenCredential` protocol to use any authentication mechanism:
```python
import time
import litellm
from litellm.proxy_auth import AccessToken, ProxyAuthHandler
class MyCustomCredential:
"""Any class with a get_token(scope) -> AccessToken method works."""
def get_token(self, scope: str) -> AccessToken:
# Your custom logic to obtain a token
token = my_auth_system.get_jwt(scope=scope)
return AccessToken(
token=token,
expires_on=int(time.time()) + 3600
)
litellm.proxy_auth = ProxyAuthHandler(
credential=MyCustomCredential(),
scope="my-scope"
)
```
## Supported Endpoints
Auth headers are automatically injected for:
| Endpoint | Function |
|----------|----------|
| Chat Completions | `litellm.completion()` / `litellm.acompletion()` |
| Embeddings | `litellm.embedding()` / `litellm.aembedding()` |
## How It Works
```
┌──────────┐ ┌──────────────────┐ ┌──────────────┐ ┌──────────────┐
│ Your │ │ ProxyAuthHandler │ │ Identity │ │ LiteLLM │
│ Code │────▶│ (token cache) │────▶│ Provider │ │ Proxy │
│ │ │ │◀────│ (Azure AD, │ │ │
│ │ │ │ │ Okta, etc) │ │ │
│ │ └────────┬─────────┘ └──────────────┘ │ │
│ │ │ Authorization: Bearer <token> │ │
│ │──────────────┼───────────────────────────────────▶│ │
│ │◀─────────────┼────────────────────────────────────│ │
└──────────┘ │ └──────────────┘
```
1. You set `litellm.proxy_auth` once at startup
2. On each SDK call (`completion()`, `embedding()`), the handler checks its cached token
3. If the token is missing or expires within 60 seconds, it requests a new one from your identity provider
4. The `Authorization: Bearer <token>` header is injected into the request
5. If token retrieval fails, a warning is logged and the request proceeds without auth headers
## API Reference
### ProxyAuthHandler
The main handler that manages the token lifecycle.
```python
from litellm.proxy_auth import ProxyAuthHandler
handler = ProxyAuthHandler(
credential=<TokenCredential>, # required - credential provider
scope="<oauth2-scope>" # required - OAuth2 scope to request
)
```
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `credential` | `TokenCredential` | Yes | A credential provider (AzureADCredential, GenericOAuth2Credential, or custom) |
| `scope` | `str` | Yes | The OAuth2 scope to request tokens for |
**Methods:**
| Method | Returns | Description |
|--------|---------|-------------|
| `get_token()` | `AccessToken` | Get a valid token, refreshing if needed |
| `get_auth_headers()` | `dict` | Get `{"Authorization": "Bearer <token>"}` headers |
### AzureADCredential
Wraps any `azure-identity` credential with lazy initialization.
```python
from litellm.proxy_auth import AzureADCredential
# Uses DefaultAzureCredential (recommended)
cred = AzureADCredential()
# Or wrap a specific azure-identity credential
from azure.identity import ManagedIdentityCredential
cred = AzureADCredential(credential=ManagedIdentityCredential())
```
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `credential` | Azure `TokenCredential` | No | An azure-identity credential. If `None`, uses `DefaultAzureCredential` |
### GenericOAuth2Credential
Standard OAuth2 client credentials flow for any provider.
```python
from litellm.proxy_auth import GenericOAuth2Credential
cred = GenericOAuth2Credential(
client_id="your-client-id",
client_secret="your-client-secret",
token_url="https://your-idp.com/oauth2/token"
)
```
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `client_id` | `str` | Yes | OAuth2 client ID |
| `client_secret` | `str` | Yes | OAuth2 client secret |
| `token_url` | `str` | Yes | Token endpoint URL |
### AccessToken
Dataclass representing an OAuth2 access token.
```python
from litellm.proxy_auth import AccessToken
token = AccessToken(
token="eyJhbG...", # JWT string
expires_on=1234567890 # Unix timestamp
)
```
### TokenCredential Protocol
Any class implementing this protocol can be used as a credential provider:
```python
from litellm.proxy_auth import AccessToken
class MyCredential:
def get_token(self, scope: str) -> AccessToken:
...
```
## Provider-Specific Examples
### Keycloak
```python
import litellm
from litellm.proxy_auth import GenericOAuth2Credential, ProxyAuthHandler
litellm.proxy_auth = ProxyAuthHandler(
credential=GenericOAuth2Credential(
client_id="litellm-client",
client_secret="your-keycloak-client-secret",
token_url="https://keycloak.example.com/realms/your-realm/protocol/openid-connect/token"
),
scope="openid"
)
```
### Okta
```python
import litellm
from litellm.proxy_auth import GenericOAuth2Credential, ProxyAuthHandler
litellm.proxy_auth = ProxyAuthHandler(
credential=GenericOAuth2Credential(
client_id="your-okta-client-id",
client_secret="your-okta-client-secret",
token_url="https://your-org.okta.com/oauth2/default/v1/token"
),
scope="litellm_api"
)
```
### Auth0
```python
import litellm
from litellm.proxy_auth import GenericOAuth2Credential, ProxyAuthHandler
litellm.proxy_auth = ProxyAuthHandler(
credential=GenericOAuth2Credential(
client_id="your-auth0-client-id",
client_secret="your-auth0-client-secret",
token_url="https://your-tenant.auth0.com/oauth/token"
),
scope="https://my-proxy.example.com/api"
)
```
### Azure AD with Managed Identity
```python
import litellm
from azure.identity import ManagedIdentityCredential
from litellm.proxy_auth import AzureADCredential, ProxyAuthHandler
litellm.proxy_auth = ProxyAuthHandler(
credential=AzureADCredential(
credential=ManagedIdentityCredential()
),
scope="api://my-litellm-proxy/.default"
)
```
## Combining with `use_litellm_proxy`
You can use `proxy_auth` together with [`use_litellm_proxy`](./providers/litellm_proxy#send-all-sdk-requests-to-litellm-proxy) to route all SDK requests through an authenticated proxy:
```python
import os
import litellm
from litellm.proxy_auth import AzureADCredential, ProxyAuthHandler
# Route all requests through the proxy
os.environ["LITELLM_PROXY_API_BASE"] = "https://my-proxy.example.com"
litellm.use_litellm_proxy = True
# Authenticate with OAuth2/JWT
litellm.proxy_auth = ProxyAuthHandler(
credential=AzureADCredential(),
scope="api://my-litellm-proxy/.default"
)
# This request goes through the proxy with automatic JWT auth
response = litellm.completion(
model="vertex_ai/gemini-2.0-flash-001",
messages=[{"role": "user", "content": "Hello!"}]
)
```
@@ -0,0 +1,43 @@
# Claude Code - Prompt Cache Routing
Claude's [Prompt Caching](https://platform.claude.com/docs/en/build-with-claude/prompt-caching) feature optimizes API usage by caching prompts and reusing the cached prompts in subsequent API calls. This feature is used by Claude Code.
When LiteLLM [load balancing](../proxy/load_balancing.md) is enabled, to ensure this prompt caching feature still works with Claude Code, LiteLLM needs to be configured to use the `PromptCachingDeploymentCheck` pre-call check. This pre-call check will ensure that API calls that used prompt caching are remembered and that subsequent API calls that try to use that prompt caching are routed to the same model deployment where a cache write occurred.
## Set Up
1. Configure the router so that it uses the `PromptCachingDeploymentCheck` (via setting the `optional_pre_call_checks` property), and configure the models so that they can access multiple deployments of Claude; below, we show an example for multiple AWS accounts (referred to as `account-1` and `account-2`, using the `aws_profile_name` property):
```yaml
router_settings:
optional_pre_call_checks: ["prompt_caching"]
model_list:
- litellm_params:
model: us.anthropic.claude-sonnet-4-5-20250929-v1:0
aws_profile_name: account-1
aws_region_name: us-west-2
model_info:
litellm_provider: bedrock
model_name: us.anthropic.claude-sonnet-4-5-20250929-v1:0
- litellm_params:
model: us.anthropic.claude-sonnet-4-5-20250929-v1:0
aws_profile_name: account-2
aws_region_name: us-west-2
model_info:
litellm_provider: bedrock
model_name: us.anthropic.claude-sonnet-4-5-20250929-v1:0
```
2. Utilize Claude Code:
1. Launch Claude Code, which will do a warm-up API call that tries to cache its warm-up prompt and its system prompt.
2. Wait a few seconds, then quit Claude Code and re-open it.
3. You'll notice that the warm-up API call successfully gets a cache hit (if using Claude Code in an IDE like VS Code, ensure that you don't do anything between step 2.1 and 2.2 here, otherwise there may not be a cache hit):
1. Go to the [LiteLLM Request Logs page](../proxy/ui_logs.md) in the Admin UI
2. Click on the individual requests to see (a) the cache creation and cache read tokens; and (b) the Model ID. In particular, the API call from step 2.1 should show a cache write, and the API call from step 2.2 should show a cache read; in addition, the Model ID should be equal (meaning the API call is getting forwarded to the same AWS account).
## Related
- [Claude Code - Quickstart](./claude_responses_api.md)
- [Claude Code - Customer Tracking](./claude_code_customer_tracking.md)
- [Claude Code - Plugin Marketplace](./claude_code_plugin_marketplace.md)
- [Claude Code - WebSearch](./claude_code_websearch.md)
- [Proxy - Load Balancing](../proxy/load_balancing.md)
+2
View File
@@ -61,6 +61,8 @@
"mermaid": ">=11.10.0",
"gray-matter": "4.0.3",
"glob": ">=11.1.0",
"tar": ">=7.5.7",
"@isaacs/brace-expansion": ">=5.0.1",
"node-forge": ">=1.3.2",
"mdast-util-to-hast": ">=13.2.1",
"lodash-es": ">=4.17.23"
+2
View File
@@ -125,6 +125,7 @@ const sidebars = {
"tutorials/claude_responses_api",
"tutorials/claude_code_max_subscription",
"tutorials/claude_code_customer_tracking",
"tutorials/claude_code_prompt_cache_routing",
"tutorials/claude_code_websearch",
"tutorials/claude_mcp",
"tutorials/claude_non_anthropic_models",
@@ -223,6 +224,7 @@ const sidebars = {
label: "Configuration",
items: [
"set_keys",
"proxy_auth",
"caching/all_caches",
],
},
+3 -1
View File
@@ -11,6 +11,8 @@
"tsx": "^4.7.1"
},
"overrides": {
"glob": ">=11.1.0"
"glob": ">=11.1.0",
"tar": ">=7.5.7",
"@isaacs/brace-expansion": ">=5.0.1"
}
}
@@ -0,0 +1,8 @@
-- CreateIndex
CREATE INDEX "LiteLLM_VerificationToken_user_id_team_id_idx" ON "LiteLLM_VerificationToken"("user_id", "team_id");
-- CreateIndex
CREATE INDEX "LiteLLM_VerificationToken_team_id_idx" ON "LiteLLM_VerificationToken"("team_id");
-- CreateIndex
CREATE INDEX "LiteLLM_VerificationToken_budget_reset_at_expires_idx" ON "LiteLLM_VerificationToken"("budget_reset_at", "expires");
@@ -310,6 +310,16 @@ model LiteLLM_VerificationToken {
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
object_permission LiteLLM_ObjectPermissionTable? @relation(fields: [object_permission_id], references: [object_permission_id])
// SELECT COUNT(*) FROM (SELECT "public"."LiteLLM_VerificationToken"."token" FROM "public"."LiteLLM_VerificationToken" WHERE ("public"."LiteLLM_VerificationToken"."user_id" = $1 AND ("public"."LiteLLM_VerificationToken"."team_id" IS NULL OR "public"."LiteLLM_VerificationToken"."team_id" <> $2)) OFFSET $3 ) AS "sub"
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."user_id" = $1 OFFSET $2
@@index([user_id, team_id])
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."team_id" = $1 OFFSET $2
@@index([team_id])
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE (("public"."LiteLLM_VerificationToken"."expires" IS NULL OR "public"."LiteLLM_VerificationToken"."expires" > $1) AND "public"."LiteLLM_VerificationToken"."budget_reset_at" < $2) OFFSET $3
@@index([budget_reset_at, expires])
}
// Audit table for deleted keys - preserves spend and key information for historical tracking
+107 -6
View File
@@ -45,7 +45,14 @@ from litellm.llms.custom_httpx.http_handler import (
httpxSpecialProvider,
)
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
from litellm.types.integrations.datadog import *
from litellm.types.integrations.datadog import (
DD_ERRORS,
DD_MAX_BATCH_SIZE,
DataDogStatus,
DatadogInitParams,
DatadogPayload,
DatadogProxyFailureHookJsonMessage,
)
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import StandardLoggingPayload
@@ -85,12 +92,14 @@ class DataDogLogger(
"""
try:
verbose_logger.debug("Datadog: in init datadog logger")
self.is_mock_mode = should_use_datadog_mock()
if self.is_mock_mode:
create_mock_datadog_client()
verbose_logger.debug("[DATADOG MOCK] Datadog logger initialized in mock mode")
verbose_logger.debug(
"[DATADOG MOCK] Datadog logger initialized in mock mode"
)
#########################################################
# Handle datadog_params set as litellm.datadog_params
@@ -209,6 +218,96 @@ class DataDogLogger(
)
pass
async def async_post_call_failure_hook(
    self,
    request_data: dict,
    original_exception: Exception,
    user_api_key_dict: Any,
    traceback_str: Optional[str] = None,
) -> Optional[Any]:
    """
    Log proxy-level failures (e.g. 401 auth, DB connection errors) to Datadog.

    Ensures failures that occur before or outside the LLM completion flow
    (e.g. ConnectError during auth when DB is down) are visible in Datadog
    alongside Prometheus.

    Args:
        request_data: The request body of the failed call (not read here).
        original_exception: The exception that caused the failure.
        user_api_key_dict: Auth object for the caller; only sanitized or
            explicitly whitelisted fields from it are logged.
        traceback_str: Optional pre-formatted traceback for the exception.

    Returns:
        Always ``None``. Any error raised while building or queueing the
        log entry is logged and swallowed so the hook never masks the
        original failure.
    """
    try:
        from litellm.litellm_core_utils.litellm_logging import (
            StandardLoggingPayloadSetup,
        )
        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps

        error_information = StandardLoggingPayloadSetup.get_error_information(
            original_exception=original_exception,
            traceback_str=traceback_str,
        )

        # Only purely numeric error codes are reported as an HTTP status.
        _code = error_information.get("error_code") or ""
        status_code: Optional[int] = None
        if _code and str(_code).strip().isdigit():
            status_code = int(_code)

        # Use project-standard sanitized user context when running in proxy
        sanitized_meta: Any = None
        try:
            from litellm.proxy.litellm_pre_call_utils import (
                LiteLLMProxyRequestSetup,
            )

            sanitized_meta = (
                LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
                    user_api_key_dict=user_api_key_dict
                )
            )
        except Exception:
            # Proxy helpers unavailable (e.g. SDK-only) or sanitizing failed.
            sanitized_meta = None

        user_context: Dict[str, Any] = {}
        if isinstance(sanitized_meta, dict):
            # Copy so later changes to the payload never touch the source dict.
            user_context = dict(sanitized_meta)
        else:
            # Fallback: minimal safe fields. Also taken when the sanitizer
            # returns a non-dict, so the payload always carries a dict.
            for _field in ("request_route", "team_id", "user_id", "end_user_id"):
                if hasattr(user_api_key_dict, _field):
                    user_context[_field] = getattr(user_api_key_dict, _field, None)

        message_payload: DatadogProxyFailureHookJsonMessage = {
            "exception": error_information.get("error_message")
            or str(original_exception),
            "error_class": error_information.get("error_class")
            or original_exception.__class__.__name__,
            "status_code": status_code,
            "traceback": error_information.get("traceback") or "",
            "user_api_key_dict": user_context,
        }

        dd_payload = DatadogPayload(
            ddsource=get_datadog_source(),
            ddtags=get_datadog_tags(),
            hostname=get_datadog_hostname(),
            message=safe_dumps(message_payload),
            service=get_datadog_service(),
            status=DataDogStatus.ERROR,
        )
        self._add_trace_context_to_payload(dd_payload=dd_payload)
        self.log_queue.append(dd_payload)

        # Send immediately once the in-memory queue reaches the batch size.
        if len(self.log_queue) >= self.batch_size:
            await self.async_send_batch()
    except Exception as e:
        verbose_logger.exception(
            f"Datadog: async_post_call_failure_hook - {str(e)}\n{traceback.format_exc()}"
        )
    return None
async def async_send_batch(self):
"""
Sends the in memory logs queue to datadog api
@@ -230,9 +329,11 @@ class DataDogLogger(
len(self.log_queue),
self.intake_url,
)
if self.is_mock_mode:
verbose_logger.debug("[DATADOG MOCK] Mock mode enabled - API calls will be intercepted")
verbose_logger.debug(
"[DATADOG MOCK] Mock mode enabled - API calls will be intercepted"
)
response = await self.async_send_compressed_data(self.log_queue)
if response.status_code == 413:
@@ -1,6 +1,6 @@
import base64
import time
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
from litellm.types.llms.openai import (
ChatCompletionAssistantContentValue,
@@ -326,10 +326,22 @@ class ChunkProcessor:
thinking_blocks: List[
Union["ChatCompletionThinkingBlock", "ChatCompletionRedactedThinkingBlock"]
] = []
combined_thinking_text: Optional[str] = None
data: Optional[str] = None
signature: Optional[str] = None
type: Literal["thinking", "redacted_thinking"] = "thinking"
current_thinking_text_parts: List[str] = []
current_signature: Optional[str] = None
def _flush_thinking_block() -> None:
nonlocal current_thinking_text_parts, current_signature
if len(current_thinking_text_parts) > 0 and current_signature:
thinking_blocks.append(
ChatCompletionThinkingBlock(
type="thinking",
thinking="".join(current_thinking_text_parts),
signature=current_signature,
)
)
current_thinking_text_parts = []
current_signature = None
for chunk in chunks:
choices = chunk["choices"]
for choice in choices:
@@ -339,33 +351,25 @@ class ChunkProcessor:
for thinking_block in thinking:
thinking_type = thinking_block.get("type", None)
if thinking_type and thinking_type == "redacted_thinking":
type = "redacted_thinking"
data = thinking_block.get("data", None)
_flush_thinking_block()
redacted_data = thinking_block.get("data", None)
if redacted_data:
thinking_blocks.append(
ChatCompletionRedactedThinkingBlock(
type="redacted_thinking",
data=redacted_data,
)
)
else:
type = "thinking"
thinking_text = thinking_block.get("thinking", None)
if thinking_text:
if combined_thinking_text is None:
combined_thinking_text = ""
combined_thinking_text += thinking_text
current_thinking_text_parts.append(thinking_text)
signature = thinking_block.get("signature", None)
if signature:
current_signature = signature
_flush_thinking_block()
if combined_thinking_text and type == "thinking" and signature:
thinking_blocks.append(
ChatCompletionThinkingBlock(
type=type,
thinking=combined_thinking_text,
signature=signature,
)
)
elif data and type == "redacted_thinking":
thinking_blocks.append(
ChatCompletionRedactedThinkingBlock(
type=type,
data=data,
)
)
_flush_thinking_block()
if len(thinking_blocks) > 0:
return thinking_blocks
@@ -46,8 +46,12 @@ class AnthropicMessagesConfig(BaseAnthropicMessagesConfig):
"thinking",
"context_management",
"output_format",
"inference_geo",
"speed",  # fast mode support for Opus models
"output_config",  # configuration for Claude's output behavior
# TODO: Add Anthropic `metadata` support
# "metadata",
]
+36 -1
View File
@@ -218,6 +218,7 @@ class OCIChatConfig(BaseConfig):
"parallel_tool_calls": False,
"audio": False,
"web_search_options": False,
"response_format": "responseFormat",
}
# Cohere and Gemini use the same parameter mapping as GENERIC
@@ -269,6 +270,9 @@ class OCIChatConfig(BaseConfig):
adapted_params[alias] = value
if alias == "responseFormat":
adapted_params["response_format"] = value
return adapted_params
def _sign_with_oci_signer(
@@ -673,6 +677,36 @@ class OCIChatConfig(BaseConfig):
selected_params["tools"] = adapt_tool_definition_to_oci_standard( # type: ignore[assignment]
selected_params["tools"], vendor # type: ignore[arg-type]
)
# Transform response_format type to OCI uppercase format
if "responseFormat" in selected_params:
rf = selected_params["responseFormat"]
if isinstance(rf, dict) and "type" in rf:
rf_payload = dict(rf)
selected_params["responseFormat"] = rf_payload
response_type = rf_payload["type"]
schema_payload: Optional[Any] = None
if "json_schema" in rf_payload:
raw_schema_payload = rf_payload.pop("json_schema")
if isinstance(raw_schema_payload, dict):
schema_payload = dict(raw_schema_payload)
else:
schema_payload = raw_schema_payload
if schema_payload is not None:
rf_payload["jsonSchema"] = schema_payload
if vendor == OCIVendors.COHERE:
# Cohere expects lower-case type values
rf_payload["type"] = response_type
else:
format_type = response_type.upper()
if format_type == "JSON":
format_type = "JSON_OBJECT"
rf_payload["type"] = format_type
return selected_params
def adapt_messages_to_cohere_standard(self, messages: List[AllMessageValues]) -> List[CohereMessage]:
@@ -806,11 +840,12 @@ class OCIChatConfig(BaseConfig):
# Create Cohere-specific chat request
optional_cohere_params = self._get_optional_params(OCIVendors.COHERE, optional_params)
chat_request = CohereChatRequest(
apiFormat="COHERE",
message=self._extract_text_content(user_messages[-1]["content"]),
chatHistory=self.adapt_messages_to_cohere_standard(messages),
**self._get_optional_params(OCIVendors.COHERE, optional_params)
**optional_cohere_params
)
data = OCICompletionPayload(
+34 -11
View File
@@ -269,26 +269,27 @@ class OpenAIVideoConfig(BaseVideoConfig):
) -> Tuple[str, Dict]:
"""
Transform the video list request for OpenAI API.
OpenAI API expects the following request:
- GET /v1/videos
"""
# Use the api_base directly for video list
url = api_base
# Prepare query parameters
params = {}
if after is not None:
params["after"] = after
# Decode the wrapped video ID back to the original provider ID
params["after"] = extract_original_video_id(after)
if limit is not None:
params["limit"] = str(limit)
if order is not None:
params["order"] = order
# Add any extra query parameters
if extra_query:
params.update(extra_query)
return url, params
def transform_video_list_response(
@@ -296,18 +297,40 @@ class OpenAIVideoConfig(BaseVideoConfig):
raw_response: httpx.Response,
logging_obj: LiteLLMLoggingObj,
custom_llm_provider: Optional[str] = None,
) -> Dict[str,str]:
) -> Dict[str, str]:
response_data = raw_response.json()
if custom_llm_provider and "data" in response_data:
for video_obj in response_data.get("data", []):
if isinstance(video_obj, dict) and "id" in video_obj:
video_obj["id"] = encode_video_id_with_provider(
video_obj["id"],
custom_llm_provider,
video_obj.get("model")
video_obj["id"],
custom_llm_provider,
video_obj.get("model"),
)
# Encode pagination cursor IDs so they remain consistent
# with the wrapped data[].id format
data_list = response_data.get("data", [])
if response_data.get("first_id"):
first_model = None
if data_list and isinstance(data_list[0], dict):
first_model = data_list[0].get("model")
response_data["first_id"] = encode_video_id_with_provider(
response_data["first_id"],
custom_llm_provider,
first_model,
)
if response_data.get("last_id"):
last_model = None
if data_list and isinstance(data_list[-1], dict):
last_model = data_list[-1].get("model")
response_data["last_id"] = encode_video_id_with_provider(
response_data["last_id"],
custom_llm_provider,
last_model,
)
return response_data
def transform_video_delete_request(
@@ -56,34 +56,36 @@ class VertexAIAnthropicConfig(AnthropicConfig):
) -> None:
"""
Add context_management beta headers to the beta_set.
- If any edit has type "compact_20260112", add compact-2026-01-12 header
- For all other edits, add context-management-2025-06-27 header
Args:
beta_set: Set of beta headers to modify in-place
context_management: The context_management dict from optional_params
"""
from litellm.types.llms.anthropic import ANTHROPIC_BETA_HEADER_VALUES
edits = context_management.get("edits", [])
has_compact = False
has_other = False
for edit in edits:
edit_type = edit.get("type", "")
if edit_type == "compact_20260112":
has_compact = True
else:
has_other = True
# Add compact header if any compact edits exist
if has_compact:
beta_set.add(ANTHROPIC_BETA_HEADER_VALUES.COMPACT_2026_01_12.value)
# Add context management header if any other edits exist
if has_other:
beta_set.add(ANTHROPIC_BETA_HEADER_VALUES.CONTEXT_MANAGEMENT_2025_06_27.value)
beta_set.add(
ANTHROPIC_BETA_HEADER_VALUES.CONTEXT_MANAGEMENT_2025_06_27.value
)
def transform_request(
self,
@@ -102,10 +104,10 @@ class VertexAIAnthropicConfig(AnthropicConfig):
)
data.pop("model", None) # vertex anthropic doesn't accept 'model' parameter
# VertexAI doesn't support output_format parameter, remove it if present
data.pop("output_format", None)
tools = optional_params.get("tools")
tool_search_used = self.is_tool_search_used(tools)
auto_betas = self.get_anthropic_beta_list(
@@ -119,16 +121,30 @@ class VertexAIAnthropicConfig(AnthropicConfig):
beta_set = set(auto_betas)
if tool_search_used:
beta_set.add("tool-search-tool-2025-10-19") # Vertex requires this header for tool search
beta_set.add(
"tool-search-tool-2025-10-19"
) # Vertex requires this header for tool search
# Add context_management beta headers (compact and/or context-management)
context_management = optional_params.get("context_management")
if context_management:
self._add_context_management_beta_headers(beta_set, context_management)
extra_headers = optional_params.get("extra_headers") or {}
anthropic_beta_value = extra_headers.get("anthropic-beta", "")
if isinstance(anthropic_beta_value, str) and anthropic_beta_value:
for beta in anthropic_beta_value.split(","):
beta = beta.strip()
if beta:
beta_set.add(beta)
elif isinstance(anthropic_beta_value, list):
beta_set.update(anthropic_beta_value)
data.pop("extra_headers", None)
if beta_set:
data["anthropic_beta"] = list(beta_set)
return data
def map_openai_params(
@@ -148,7 +164,7 @@ class VertexAIAnthropicConfig(AnthropicConfig):
original_model = model
if "response_format" in non_default_params:
model = "claude-3-sonnet-20240229" # Use a model that will use tool-based approach
# Call parent method with potentially modified model name
optional_params = super().map_openai_params(
non_default_params=non_default_params,
@@ -156,10 +172,10 @@ class VertexAIAnthropicConfig(AnthropicConfig):
model=model,
drop_params=drop_params,
)
# Restore original model name for any other processing
model = original_model
return optional_params
def transform_response(
@@ -28540,6 +28540,193 @@
"supports_function_calling": true,
"supports_tool_choice": true
},
"vercel_ai_gateway/anthropic/claude-3-5-sonnet": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"max_tokens": 8192,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-3-5-sonnet-20241022": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"max_tokens": 8192,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-3-7-sonnet": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-haiku-4.5": {
"cache_creation_input_token_cost": 1.25e-06,
"cache_read_input_token_cost": 1e-07,
"input_cost_per_token": 1e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 5e-06,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4": {
"cache_creation_input_token_cost": 1.875e-05,
"cache_read_input_token_cost": 1.5e-06,
"input_cost_per_token": 1.5e-05,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 32000,
"max_tokens": 32000,
"mode": "chat",
"output_cost_per_token": 7.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4.1": {
"cache_creation_input_token_cost": 1.875e-05,
"cache_read_input_token_cost": 1.5e-06,
"input_cost_per_token": 1.5e-05,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 32000,
"max_tokens": 32000,
"mode": "chat",
"output_cost_per_token": 7.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4.5": {
"cache_creation_input_token_cost": 6.25e-06,
"cache_read_input_token_cost": 5e-07,
"input_cost_per_token": 5e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 2.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4.6": {
"cache_creation_input_token_cost": 6.25e-06,
"cache_read_input_token_cost": 5e-07,
"input_cost_per_token": 5e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 2.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-sonnet-4": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-sonnet-4.5": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 1000000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/cohere/command-a": {
"input_cost_per_token": 2.5e-06,
"litellm_provider": "vercel_ai_gateway",
@@ -28549,7 +28736,8 @@
"mode": "chat",
"output_cost_per_token": 1e-05,
"supports_function_calling": true,
"supports_tool_choice": true
"supports_tool_choice": true,
"supports_response_schema": true
},
"vercel_ai_gateway/cohere/command-r": {
"input_cost_per_token": 1.5e-07,
@@ -92,14 +92,34 @@ class ZscalerAIGuard(CustomGuardrail):
Raises:
Exception: If content is blocked by Zscaler AI Guard
"""
texts = inputs.get("texts", [])
try:
verbose_proxy_logger.debug(f"ZscalerAIGuard: Checking {len(texts)} text(s)")
metadata = request_data.get("metadata", {})
custom_policy_id = request_data.get("metadata", {}).get(
"zguard_policy_id", self.policy_id
user_api_key_metadata = metadata.get("user_api_key_metadata", {}) or {}
team_metadata = metadata.get("team_metadata", {}) or {}
# Precedence for policy_id:
# 1. metadata.zguard_policy_id # request level
# 2. user_api_key_metadata.zguard_policy_id # Key level
# 3. team_metadata.zguard_policy_id # Team level
# 4. self.policy_id (from environment) # Global
policy_id = (
metadata.get("zguard_policy_id")
if "zguard_policy_id" in metadata
else (
user_api_key_metadata.get("zguard_policy_id")
if "zguard_policy_id" in user_api_key_metadata
else (
team_metadata.get("zguard_policy_id")
if "zguard_policy_id" in team_metadata
else self.policy_id
)
)
)
verbose_proxy_logger.debug(f"custom_policy_id: {custom_policy_id}")
verbose_proxy_logger.info(f"policy_id applied: {policy_id}")
kwargs = {}
if self.send_user_api_key_alias:
@@ -116,27 +136,29 @@ class ZscalerAIGuard(CustomGuardrail):
)
verbose_proxy_logger.debug(f"inside apply_guardrail kwargs: {kwargs}")
# Check each text (Zscaler processes one at a time)
for text in texts:
zscaler_ai_guard_result = None
direction = "OUT" if input_type == "response" else "IN"
verbose_proxy_logger.debug(f"direction: {direction}")
# Concatenate all texts and send to Zscaler AI Guard
if texts:
concatenated_text = " ".join(texts)
zscaler_ai_guard_result = await self.make_zscaler_ai_guard_api_call(
zscaler_ai_guard_url=self.zscaler_ai_guard_url,
api_key=self.api_key,
policy_id=self.policy_id,
direction="IN",
content=text,
policy_id=policy_id,
direction=direction,
content=concatenated_text,
**kwargs,
)
if (
zscaler_ai_guard_result
and zscaler_ai_guard_result.get("action") == "BLOCK"
):
blocking_info = zscaler_ai_guard_result.get(
"zscaler_ai_guard_response"
)
error_message = f"Content blocked by Zscaler AI Guard: {self.extract_blocking_info(blocking_info)}"
raise Exception(error_message)
if (
zscaler_ai_guard_result
and zscaler_ai_guard_result.get("action") == "BLOCK"
):
blocking_info = zscaler_ai_guard_result.get(
"zscaler_ai_guard_response"
)
error_message = f"Content blocked by Zscaler AI Guard: {self.extract_blocking_info(blocking_info)}"
raise Exception(error_message)
except Exception as e:
verbose_proxy_logger.error(
"ZscalerAIGuard: Failed to apply guardrail: %s", str(e)
@@ -216,7 +216,14 @@ def _update_metadata_field(updated_kv: dict, field_name: str) -> None:
field_name: Name of the metadata field being updated
"""
if field_name in LiteLLM_ManagementEndpoint_MetadataFields_Premium:
_premium_user_check()
value = updated_kv.get(field_name)
# Skip the premium check for empty collections ([] or {}).
# The UI sends these as defaults even when the user hasn't configured
# any enterprise features (see issue #20304). However, we still
# proceed with the update so that users can intentionally clear a
# previously-set field by sending an empty list/dict.
if value is not None and value != [] and value != {}:
_premium_user_check()
if field_name in updated_kv and updated_kv[field_name] is not None:
# remove field from updated_kv
+10
View File
@@ -308,6 +308,16 @@ model LiteLLM_VerificationToken {
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
object_permission LiteLLM_ObjectPermissionTable? @relation(fields: [object_permission_id], references: [object_permission_id])
// SELECT COUNT(*) FROM (SELECT "public"."LiteLLM_VerificationToken"."token" FROM "public"."LiteLLM_VerificationToken" WHERE ("public"."LiteLLM_VerificationToken"."user_id" = $1 AND ("public"."LiteLLM_VerificationToken"."team_id" IS NULL OR "public"."LiteLLM_VerificationToken"."team_id" <> $2)) OFFSET $3 ) AS "sub"
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."user_id" = $1 OFFSET $2
@@index([user_id, team_id])
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."team_id" = $1 OFFSET $2
@@index([team_id])
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE (("public"."LiteLLM_VerificationToken"."expires" IS NULL OR "public"."LiteLLM_VerificationToken"."expires" > $1) AND "public"."LiteLLM_VerificationToken"."budget_reset_at" < $2) OFFSET $3
@@index([budget_reset_at, expires])
}
// Audit table for deleted keys - preserves spend and key information for historical tracking
@@ -1,4 +1,3 @@
import copy
import hashlib
import json
import secrets
@@ -642,6 +641,34 @@ def _sanitize_request_body_for_spend_logs_payload(
return {k: _sanitize_value(v) for k, v in request_body.items()}
def _convert_to_json_serializable_dict(obj: Any) -> Any:
    """
    Convert object to JSON-serializable dict, handling Pydantic models safely.

    This avoids pickle-based deepcopy which fails on Pydantic v2 models
    containing _thread.RLock objects. Containers are rebuilt rather than
    reused, so the result can be mutated (e.g. redacted) without touching
    the dicts/lists/tuples that were passed in.

    Args:
        obj: Object to convert (dict, list, tuple, Pydantic model, arbitrary
            object exposing ``__dict__``, or primitive)

    Returns:
        JSON-serializable version of the object. Tuples are returned as
        lists, which is also how ``json.dumps`` would encode them.
    """
    if isinstance(obj, BaseModel):
        # Use Pydantic's model_dump() instead of pickle
        return obj.model_dump()
    if isinstance(obj, dict):
        return {
            key: _convert_to_json_serializable_dict(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        # Tuples must be walked too: returning them by reference would leave
        # nested models unconverted and nested dicts shared with the caller.
        return [_convert_to_json_serializable_dict(item) for item in obj]
    if hasattr(obj, "__dict__"):
        # Handle objects with __dict__ attribute
        return _convert_to_json_serializable_dict(obj.__dict__)
    # Primitives (str, int, float, bool, None) pass through
    return obj
def _get_proxy_server_request_for_spend_logs_payload(
metadata: dict,
litellm_params: dict,
@@ -649,7 +676,7 @@ def _get_proxy_server_request_for_spend_logs_payload(
) -> str:
"""
Only store if _should_store_prompts_and_responses_in_spend_logs() is True
If turn_off_message_logging is enabled, redact messages in the request body.
"""
if _should_store_prompts_and_responses_in_spend_logs():
@@ -674,9 +701,9 @@ def _get_proxy_server_request_for_spend_logs_payload(
),
}
# If redaction is enabled, deep copy request body before redacting
# If redaction is enabled, convert to serializable dict before redacting
if should_redact_message_logging(model_call_details=model_call_details):
_request_body = copy.deepcopy(_request_body)
_request_body = _convert_to_json_serializable_dict(_request_body)
perform_redaction(model_call_details=_request_body, result=None)
_request_body = _sanitize_request_body_for_spend_logs_payload(_request_body)
@@ -736,9 +763,9 @@ def _get_response_for_spend_logs_payload(
),
}
# If redaction is enabled, deep copy response before redacting
# If redaction is enabled, convert to serializable dict before redacting
if should_redact_message_logging(model_call_details=model_call_details):
response_obj = copy.deepcopy(response_obj)
response_obj = _convert_to_json_serializable_dict(response_obj)
response_obj = perform_redaction(model_call_details={}, result=response_obj)
sanitized_wrapper = _sanitize_request_body_for_spend_logs_payload(
@@ -88,6 +88,8 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
self._pending_tool_events: List[BaseLiteLLMOpenAIResponseObject] = []
self._tool_output_index_by_call_id: dict[str, int] = {}
self._tool_args_by_call_id: dict[str, str] = {}
self._tool_call_id_by_index: dict[int, str] = {}
self._ambiguous_tool_call_indexes: set[int] = set()
self._next_tool_output_index: int = 1 # output_index=0 reserved for the message item
self._final_tool_events_queued: bool = False
self._sequence_number: int = 0
@@ -111,6 +113,19 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
self._tool_output_index_by_call_id[call_id] = idx
return idx
def _normalize_tool_call_index(self, tool_call: object) -> Optional[int]:
idx_raw = (
tool_call.get("index")
if isinstance(tool_call, dict)
else getattr(tool_call, "index", None)
)
if idx_raw is None:
return None
try:
return int(idx_raw)
except (TypeError, ValueError):
return None
def _is_reasoning_end(self, chunk):
delta = chunk.choices[0].delta
@@ -143,10 +158,28 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
return
for tc in tool_calls:
tc_index = self._normalize_tool_call_index(tc)
call_id_raw = tc.get("id") if isinstance(tc, dict) else getattr(tc, "id", None)
if not call_id_raw:
call_id = ""
if call_id_raw:
call_id = str(call_id_raw)
if tc_index is not None:
existing_call_id = self._tool_call_id_by_index.get(tc_index)
if existing_call_id is not None and existing_call_id != call_id:
# Reusing the same index for multiple call_ids is ambiguous for id-less deltas.
# Guard against silent misrouting by disabling index fallback for this index.
self._ambiguous_tool_call_indexes.add(tc_index)
self._tool_call_id_by_index[tc_index] = call_id
elif tc_index is not None:
if tc_index in self._ambiguous_tool_call_indexes:
continue
mapped_call_id = self._tool_call_id_by_index.get(tc_index)
if mapped_call_id:
call_id = mapped_call_id
if not call_id:
continue
call_id = str(call_id_raw)
fn = tc.get("function") if isinstance(tc, dict) else getattr(tc, "function", None)
fn_name = ""
@@ -61,9 +61,10 @@ class PromptCachingDeploymentCheck(CustomLogger):
if (
call_type != CallTypes.completion.value
and call_type != CallTypes.acompletion.value
and call_type != CallTypes.anthropic_messages.value
): # only use prompt caching for completion calls
verbose_logger.debug(
"litellm.router_utils.pre_call_checks.prompt_caching_deployment_check: skipping adding model id to prompt caching cache, CALL TYPE IS NOT COMPLETION"
"litellm.router_utils.pre_call_checks.prompt_caching_deployment_check: skipping adding model id to prompt caching cache, CALL TYPE IS NOT COMPLETION or ANTHROPIC MESSAGE"
)
return
+1
View File
@@ -362,6 +362,7 @@ class AnthropicMessagesRequestOptionalParams(TypedDict, total=False):
container: Optional[Dict[str, Any]] # Container config with skills for code execution
output_format: Optional[AnthropicOutputSchema] # Structured outputs support
speed: Optional[str] # Fast mode support for Opus models
output_config: Optional[AnthropicOutputConfig] # Configuration for Claude's output behavior
class AnthropicMessagesRequest(AnthropicMessagesRequestOptionalParams, total=False):
+32 -26
View File
@@ -102,6 +102,7 @@ class OCIChatRequestPayload(BaseModel):
seed: Optional[int] = None
frequencyPenalty: Optional[float] = None
presencePenalty: Optional[float] = None
responseFormat: Optional[Dict[str, Any]] = None
class OCIServingMode(BaseModel):
@@ -125,14 +126,14 @@ class OCICompletionPayload(BaseModel):
class OCICompletionTokenDetails(BaseModel):
"""Completion token details in the OCI response."""
acceptedPredictionTokens: int
reasoningTokens: int
acceptedPredictionTokens: Optional[int] = None
reasoningTokens: Optional[int] = None
class OCIPromptTokensDetails(BaseModel):
"""Prompt token details in the OCI response."""
cachedTokens: int
cachedTokens: Optional[int] = None
class OCIResponseUsage(BaseModel):
@@ -205,40 +206,40 @@ class CohereStreamChunk(BaseModel):
class CohereMessage(BaseModel):
"""Base model for Cohere messages."""
role: str
message: str
message: Optional[str] = None
toolCalls: Optional[List[CohereToolCall]] = None
class CohereUserMessage(CohereMessage):
"""User message in Cohere chat."""
role: Literal["USER"] = "USER"
class CohereChatBotMessage(CohereMessage):
"""Chatbot message in Cohere chat."""
role: Literal["CHATBOT"] = "CHATBOT"
class CohereSystemMessage(CohereMessage):
"""System message in Cohere chat."""
role: Literal["SYSTEM"] = "SYSTEM"
class CohereToolMessage(CohereMessage):
"""Tool message in Cohere chat."""
role: Literal["TOOL"] = "TOOL"
toolCallId: str
class CohereParameterDefinition(BaseModel):
"""Parameter definition for Cohere tools."""
description: str
type: str
isRequired: bool = False
@@ -246,7 +247,7 @@ class CohereParameterDefinition(BaseModel):
class CohereTool(BaseModel):
"""Tool definition for Cohere."""
name: str
description: str
parameterDefinitions: Dict[str, CohereParameterDefinition]
@@ -254,38 +255,44 @@ class CohereTool(BaseModel):
class CohereToolCall(BaseModel):
"""Tool call made by Cohere model."""
name: str
parameters: Dict[str, Any]
class CohereToolResult(BaseModel):
"""Result of a tool call."""
callId: str
result: str
class CohereResponseFormat(BaseModel):
"""Response format for Cohere."""
type: str
class CohereResponseTextFormat(CohereResponseFormat):
"""Text response format for Cohere."""
type: Literal["text"] = "text"
class CohereResponseJSONSchemaFormat(CohereResponseFormat):
"""JSON schema response format for Cohere."""
type: Literal["json_schema"] = "json_schema"
jsonSchema: Dict[str, Any]
class CohereChatRequest(BaseModel):
"""Cohere chat request model."""
# Required fields
message: str
apiFormat: Literal["COHERE"] = "COHERE"
# Optional fields
chatHistory: Optional[List[CohereMessage]] = None
maxTokens: Optional[int] = None
@@ -298,7 +305,7 @@ class CohereChatRequest(BaseModel):
seed: Optional[int] = None
tools: Optional[List[CohereTool]] = None
toolChoice: Optional[Union[str, Dict[str, Any]]] = None
responseFormat: Optional[CohereResponseFormat] = None
responseFormat: Optional[Union[CohereResponseTextFormat, CohereResponseJSONSchemaFormat, CohereResponseFormat]] = None
preambleOverride: Optional[str] = None
documents: Optional[List[Dict[str, Any]]] = None
searchQueriesOnly: Optional[bool] = None
@@ -318,7 +325,7 @@ class CohereChatRequest(BaseModel):
class CohereUsage(BaseModel):
"""Usage information for Cohere response."""
promptTokens: int
completionTokens: int
totalTokens: int
@@ -328,7 +335,7 @@ class CohereUsage(BaseModel):
class CohereCitation(BaseModel):
"""Citation in Cohere response."""
start: int
end: int
text: str
@@ -337,19 +344,19 @@ class CohereCitation(BaseModel):
class CohereSearchQuery(BaseModel):
"""Search query generated by Cohere."""
text: str
generation_id: str
class CohereChatResponse(BaseModel):
"""Cohere chat response model."""
# Required fields
text: str
apiFormat: Literal["COHERE"] = "COHERE"
finishReason: Literal["COMPLETE", "ERROR_TOXIC", "ERROR_LIMIT", "ERROR", "USER_CANCEL", "MAX_TOKENS"]
# Optional fields
chatHistory: Optional[List[CohereMessage]] = None
citations: Optional[List[CohereCitation]] = None
@@ -364,7 +371,7 @@ class CohereChatResponse(BaseModel):
class CohereChatDetails(BaseModel):
"""Chat details for Cohere request."""
compartmentId: str
servingMode: OCIServingMode
chatRequest: CohereChatRequest
@@ -372,8 +379,7 @@ class CohereChatDetails(BaseModel):
class CohereChatResult(BaseModel):
"""Complete Cohere chat result."""
modelId: str
modelVersion: str
chatResponse: CohereChatResponse
+189 -1
View File
@@ -28540,6 +28540,193 @@
"supports_function_calling": true,
"supports_tool_choice": true
},
"vercel_ai_gateway/anthropic/claude-3-5-sonnet": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"max_tokens": 8192,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-3-5-sonnet-20241022": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"max_tokens": 8192,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-3-7-sonnet": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-haiku-4.5": {
"cache_creation_input_token_cost": 1.25e-06,
"cache_read_input_token_cost": 1e-07,
"input_cost_per_token": 1e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 5e-06,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4": {
"cache_creation_input_token_cost": 1.875e-05,
"cache_read_input_token_cost": 1.5e-06,
"input_cost_per_token": 1.5e-05,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 32000,
"max_tokens": 32000,
"mode": "chat",
"output_cost_per_token": 7.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4.1": {
"cache_creation_input_token_cost": 1.875e-05,
"cache_read_input_token_cost": 1.5e-06,
"input_cost_per_token": 1.5e-05,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 32000,
"max_tokens": 32000,
"mode": "chat",
"output_cost_per_token": 7.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4.5": {
"cache_creation_input_token_cost": 6.25e-06,
"cache_read_input_token_cost": 5e-07,
"input_cost_per_token": 5e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 2.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-opus-4.6": {
"cache_creation_input_token_cost": 6.25e-06,
"cache_read_input_token_cost": 5e-07,
"input_cost_per_token": 5e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 2.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-sonnet-4": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 200000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/anthropic/claude-sonnet-4.5": {
"cache_creation_input_token_cost": 3.75e-06,
"cache_read_input_token_cost": 3e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "vercel_ai_gateway",
"max_input_tokens": 1000000,
"max_output_tokens": 64000,
"max_tokens": 64000,
"mode": "chat",
"output_cost_per_token": 1.5e-05,
"supports_assistant_prefill": true,
"supports_computer_use": true,
"supports_function_calling": true,
"supports_prompt_caching": true,
"supports_reasoning": true,
"supports_tool_choice": true,
"supports_vision": true
},
"vercel_ai_gateway/cohere/command-a": {
"input_cost_per_token": 2.5e-06,
"litellm_provider": "vercel_ai_gateway",
@@ -28549,7 +28736,8 @@
"mode": "chat",
"output_cost_per_token": 1e-05,
"supports_function_calling": true,
"supports_tool_choice": true
"supports_tool_choice": true,
"supports_response_schema": true
},
"vercel_ai_gateway/cohere/command-r": {
"input_cost_per_token": 1.5e-07,
+3 -1
View File
@@ -11,6 +11,8 @@
"jest": "^29.7.0"
},
"overrides": {
"glob": ">=11.1.0"
"glob": ">=11.1.0",
"tar": ">=7.5.7",
"@isaacs/brace-expansion": ">=5.0.1"
}
}
+5
View File
@@ -1,4 +1,9 @@
# LITELLM PROXY DEPENDENCIES #
# Security: explicit minimum versions for transitive deps (CVE fixes)
urllib3>=2.6.0 # CVE-2025-66471, CVE-2025-66418, CVE-2026-21441
tornado>=6.5.3 # CVE-2025-67725, CVE-2025-67726, CVE-2025-67724
filelock>=3.20.1 # CVE-2025-68146
anyio==4.8.0 # openai + http req.
httpx==0.28.1
openai==2.9.0 # openai req.
+10
View File
@@ -310,6 +310,16 @@ model LiteLLM_VerificationToken {
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
object_permission LiteLLM_ObjectPermissionTable? @relation(fields: [object_permission_id], references: [object_permission_id])
// SELECT COUNT(*) FROM (SELECT "public"."LiteLLM_VerificationToken"."token" FROM "public"."LiteLLM_VerificationToken" WHERE ("public"."LiteLLM_VerificationToken"."user_id" = $1 AND ("public"."LiteLLM_VerificationToken"."team_id" IS NULL OR "public"."LiteLLM_VerificationToken"."team_id" <> $2)) OFFSET $3 ) AS "sub"
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."user_id" = $1 OFFSET $2
@@index([user_id, team_id])
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."team_id" = $1 OFFSET $2
@@index([team_id])
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE (("public"."LiteLLM_VerificationToken"."expires" IS NULL OR "public"."LiteLLM_VerificationToken"."expires" > $1) AND "public"."LiteLLM_VerificationToken"."budget_reset_at" < $2) OFFSET $3
@@index([budget_reset_at, expires])
}
// Audit table for deleted keys - preserves spend and key information for historical tracking
+128 -1
View File
@@ -116,4 +116,131 @@ def test_extract_blocking_info():
blocking_info = guardrail.extract_blocking_info(response)
assert blocking_info["transactionId"] == "12345"
assert blocking_info["blockingDetectors"] == ["detector1"]
assert blocking_info["blockingDetectors"] == ["detector1"]
@pytest.mark.asyncio
@patch(
    "litellm.proxy.guardrails.guardrail_hooks.zscaler_ai_guard.ZscalerAIGuard.make_zscaler_ai_guard_api_call",
    new_callable=AsyncMock,
)
async def test_apply_guardrail_text_concatenation(mock_api_call):
    """Several input texts must reach the Zscaler API call as one space-joined string."""
    guard = ZscalerAIGuard(policy_id=100)

    await guard.apply_guardrail({"texts": ["Hello", "world"]}, {}, "request")

    # Exactly one outbound call, carrying both texts in a single `content` kwarg.
    mock_api_call.assert_called_once()
    assert mock_api_call.call_args.kwargs["content"] == "Hello world"
@pytest.mark.asyncio
@patch(
    "litellm.proxy.guardrails.guardrail_hooks.zscaler_ai_guard.ZscalerAIGuard.make_zscaler_ai_guard_api_call",
    new_callable=AsyncMock,
)
async def test_policy_id_from_request_metadata(mock_api_call):
    """A request-level `zguard_policy_id` takes precedence over key, team and init values."""
    guard = ZscalerAIGuard(policy_id=100)
    # Every source of a policy id is populated; the top-level metadata entry must win.
    metadata = {
        "zguard_policy_id": 1,
        "user_api_key_metadata": {"zguard_policy_id": 2},
        "team_metadata": {"zguard_policy_id": 3},
    }

    await guard.apply_guardrail({"texts": ["test"]}, {"metadata": metadata}, "request")

    mock_api_call.assert_called_once()
    forwarded_kwargs = mock_api_call.call_args.kwargs
    assert forwarded_kwargs["policy_id"] == 1
@pytest.mark.asyncio
@patch(
    "litellm.proxy.guardrails.guardrail_hooks.zscaler_ai_guard.ZscalerAIGuard.make_zscaler_ai_guard_api_call",
    new_callable=AsyncMock,
)
async def test_policy_id_from_user_api_key_metadata(mock_api_call):
    """Without a request-level id, `user_api_key_metadata` beats team and init values."""
    guard = ZscalerAIGuard(policy_id=100)
    # No top-level "zguard_policy_id" here, so the API-key metadata is the best source left.
    metadata = {
        "user_api_key_metadata": {"zguard_policy_id": 2},
        "team_metadata": {"zguard_policy_id": 3},
    }

    await guard.apply_guardrail({"texts": ["test"]}, {"metadata": metadata}, "request")

    mock_api_call.assert_called_once()
    forwarded_kwargs = mock_api_call.call_args.kwargs
    assert forwarded_kwargs["policy_id"] == 2
@pytest.mark.asyncio
@patch(
    "litellm.proxy.guardrails.guardrail_hooks.zscaler_ai_guard.ZscalerAIGuard.make_zscaler_ai_guard_api_call",
    new_callable=AsyncMock,
)
async def test_policy_id_from_team_metadata(mock_api_call):
    """With only `team_metadata` carrying an id, that id beats the init-time default."""
    guard = ZscalerAIGuard(policy_id=100)
    team_only_request = {"metadata": {"team_metadata": {"zguard_policy_id": 3}}}

    await guard.apply_guardrail({"texts": ["test"]}, team_only_request, "request")

    mock_api_call.assert_called_once()
    forwarded_kwargs = mock_api_call.call_args.kwargs
    assert forwarded_kwargs["policy_id"] == 3
@pytest.mark.asyncio
@patch(
    "litellm.proxy.guardrails.guardrail_hooks.zscaler_ai_guard.ZscalerAIGuard.make_zscaler_ai_guard_api_call",
    new_callable=AsyncMock,
)
async def test_policy_id_from_init(mock_api_call):
    """With empty request metadata, the id given to the constructor is used."""
    guard = ZscalerAIGuard(policy_id=100)

    await guard.apply_guardrail({"texts": ["test"]}, {"metadata": {}}, "request")

    mock_api_call.assert_called_once()
    forwarded_kwargs = mock_api_call.call_args.kwargs
    assert forwarded_kwargs["policy_id"] == 100
@pytest.mark.asyncio
@patch(
    "litellm.proxy.guardrails.guardrail_hooks.zscaler_ai_guard.ZscalerAIGuard.make_zscaler_ai_guard_api_call",
    new_callable=AsyncMock,
)
async def test_policy_id_zero_from_request_metadata(mock_api_call):
    """A request-level policy id of 0 is honoured exactly as the user set it.

    The guardrail is built with policy_id=100, so this fails if the falsy value 0
    is treated as "not provided" and replaced by the init-time id.
    """
    guard = ZscalerAIGuard(policy_id=100)
    zero_policy_request = {"metadata": {"zguard_policy_id": 0}}

    await guard.apply_guardrail({"texts": ["test"]}, zero_policy_request, "request")

    mock_api_call.assert_called_once()
    forwarded_kwargs = mock_api_call.call_args.kwargs
    assert forwarded_kwargs["policy_id"] == 0
+3 -1
View File
@@ -12,6 +12,8 @@
"@types/node": "^22.5.5"
},
"overrides": {
"glob": ">=11.1.0"
"glob": ">=11.1.0",
"tar": ">=7.5.7",
"@isaacs/brace-expansion": ">=5.0.1"
}
}
@@ -24,6 +24,8 @@
"react-dom": "^18.2.0"
},
"overrides": {
"glob": ">=11.1.0"
"glob": ">=11.1.0",
"tar": ">=7.5.7",
"@isaacs/brace-expansion": ">=5.0.1"
}
}
@@ -158,6 +158,76 @@ def test_get_combined_tool_content():
]
def test_get_combined_thinking_content_preserves_interleaved_blocks():
    """Streamed thinking / redacted_thinking deltas are combined in order, one entry per block.

    The stream carries two `thinking` blocks (each sent as a text delta followed by a
    signature delta) with a `redacted_thinking` block between them. The combined
    result must hold three entries in that same order.
    """
    shared_fields = {
        "id": "chatcmpl-123",
        "object": "chat.completion.chunk",
        "created": 1234567890,
        "model": "claude-sonnet-4-20250514",
    }

    def build_chunk(delta_fields):
        # One streaming chunk with a single choice whose delta carries `delta_fields`.
        choice = StreamingChoices(
            index=0,
            delta=Delta(**delta_fields),
            finish_reason=None,
        )
        return ModelResponseStream(**shared_fields, choices=[choice])

    # Thinking-block deltas in the order they arrive on the wire.
    streamed_blocks = [
        {"type": "thinking", "thinking": "Step 1 analysis...", "signature": None},
        {"type": "thinking", "thinking": None, "signature": "sig_block1"},
        {"type": "redacted_thinking", "data": "EuoBCoYBGAIi...encrypted..."},
        {"type": "thinking", "thinking": "Step 2 analysis...", "signature": None},
        {"type": "thinking", "thinking": None, "signature": "sig_block2"},
    ]

    chunks = [build_chunk({"role": "assistant", "content": None})]
    for block in streamed_blocks:
        chunks.append(build_chunk({"thinking_blocks": [block]}))

    # Keep only the chunks whose delta actually carries thinking blocks.
    thinking_chunks = []
    for chunk in chunks:
        if chunk["choices"][0]["delta"].get("thinking_blocks"):
            thinking_chunks.append(chunk)

    result = ChunkProcessor(chunks=chunks).get_combined_thinking_content(thinking_chunks)

    expected_blocks = [
        {"type": "thinking", "thinking": "Step 1 analysis...", "signature": "sig_block1"},
        {"type": "redacted_thinking", "data": "EuoBCoYBGAIi...encrypted..."},
        {"type": "thinking", "thinking": "Step 2 analysis...", "signature": "sig_block2"},
    ]

    assert result is not None
    assert len(result) == 3
    for position, wanted in enumerate(expected_blocks):
        for field, value in wanted.items():
            assert result[position][field] == value
def test_cache_read_input_tokens_retained():
chunk1 = ModelResponseStream(
id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
@@ -441,4 +511,4 @@ def test_stream_chunk_builder_anthropic_web_search():
assert usage.prompt_tokens == 50
assert usage.completion_tokens == 27
assert usage.total_tokens == 77
assert usage.server_tool_use['web_search_requests'] == 2
assert usage.server_tool_use['web_search_requests'] == 2
@@ -287,6 +287,114 @@ class TestOCIChatConfig:
# Verify the message content
assert transformed_request["chatRequest"]["message"] == "What is quantum computing?"
def test_transform_request_response_format_json_object(self):
    """response_format type 'json_object' is sent as 'JSON_OBJECT' for generic OCI models."""
    request = OCIChatConfig().transform_request(
        model=TEST_MODEL_NAME,
        messages=TEST_MESSAGES,  # type: ignore
        optional_params={
            "oci_compartment_id": TEST_COMPARTMENT_ID,
            "response_format": {"type": "json_object"},
        },
        litellm_params={},
        headers={},
    )

    sent_format = request["chatRequest"]["responseFormat"]
    assert sent_format["type"] == "JSON_OBJECT"
def test_transform_request_response_format_text(self):
    """response_format type 'text' is sent as 'TEXT' for generic OCI models."""
    request = OCIChatConfig().transform_request(
        model=TEST_MODEL_NAME,
        messages=TEST_MESSAGES,  # type: ignore
        optional_params={
            "oci_compartment_id": TEST_COMPARTMENT_ID,
            "response_format": {"type": "text"},
        },
        litellm_params={},
        headers={},
    )

    sent_format = request["chatRequest"]["responseFormat"]
    assert sent_format["type"] == "TEXT"
def test_transform_request_response_format_json_shorthand(self):
    """The shorthand response_format type 'json' is mapped to 'JSON_OBJECT' for generic OCI models."""
    request = OCIChatConfig().transform_request(
        model=TEST_MODEL_NAME,
        messages=TEST_MESSAGES,  # type: ignore
        optional_params={
            "oci_compartment_id": TEST_COMPARTMENT_ID,
            "response_format": {"type": "json"},
        },
        litellm_params={},
        headers={},
    )

    sent_format = request["chatRequest"]["responseFormat"]
    assert sent_format["type"] == "JSON_OBJECT"
def test_transform_response_without_token_details(self):
    """A response whose usage omits the token-detail fields is still transformed.

    The mocked `usage` carries only the three token counts; neither
    completionTokensDetails nor promptTokensDetails is present.
    """
    timestamp = (
        datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")
    )
    assistant_choice = {
        "index": 0,
        "message": {
            "role": "ASSISTANT",
            "content": [{"type": "TEXT", "text": "Hello!"}],
        },
        "finishReason": "STOP",
    }
    # Usage deliberately limited to the bare counts.
    bare_usage = {
        "promptTokens": 5,
        "completionTokens": 10,
        "totalTokens": 15,
    }
    oci_payload = {
        "modelId": TEST_MODEL_NAME,
        "modelVersion": "1.0",
        "chatResponse": {
            "apiFormat": "GENERIC",
            "choices": [assistant_choice],
            "timeCreated": timestamp,
            "usage": bare_usage,
        },
    }
    raw = httpx.Response(
        status_code=200,
        json=oci_payload,
        headers={"Content-Type": "application/json"},
    )

    result = OCIChatConfig().transform_response(
        model=TEST_MODEL_NAME,
        raw_response=raw,
        model_response=ModelResponse(),
        logging_obj={},  # type: ignore
        request_data={},
        messages=[],
        optional_params={},
        litellm_params={},
        encoding={},
    )

    assert isinstance(result, ModelResponse)
    assert result.choices[0].message.content == "Hello!"
    assert result.usage.prompt_tokens == 5  # type: ignore
    assert result.usage.completion_tokens == 10  # type: ignore
    assert result.usage.total_tokens == 15  # type: ignore
def test_transform_response_simple_text(self):
"""
Tests if a simple text response is transformed correctly.
@@ -239,6 +239,110 @@ class TestOCICohereToolCalls:
assert result.usage.completion_tokens == 22
assert result.usage.total_tokens == 48
def test_cohere_request_preserves_json_schema_response_format(self):
    """A json_schema response_format survives the Cohere transform, re-keyed as `jsonSchema`."""
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "test_schema",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {"foo": {"type": "string"}},
                "required": ["foo"],
            },
        },
    }

    request = OCIChatConfig().transform_request(
        model="cohere.command-rplus",
        messages=[{"role": "user", "content": "Return structured info"}],  # type: ignore[arg-type]
        optional_params={
            "oci_compartment_id": TEST_COMPARTMENT_ID,
            "response_format": response_format,
        },
        litellm_params={},
        headers={},
    )

    chat_request = request["chatRequest"]
    assert chat_request["apiFormat"] == "COHERE"
    assert "responseFormat" in chat_request

    # The type stays lowercase, and the schema payload moves from the snake_case
    # key to the camelCase key without being altered.
    sent_format = chat_request["responseFormat"]
    assert sent_format["type"] == "json_schema"
    assert "json_schema" not in sent_format
    assert "jsonSchema" in sent_format
    assert sent_format["jsonSchema"] == response_format["json_schema"]
def test_cohere_request_response_format_text_stays_lowercase(self):
    """For Cohere models the response_format type is left lowercase ('text', not 'TEXT')."""
    request = OCIChatConfig().transform_request(
        model="cohere.command-latest",
        messages=[{"role": "user", "content": "Hello"}],  # type: ignore
        optional_params={
            "oci_compartment_id": TEST_COMPARTMENT_ID,
            "response_format": {"type": "text"},
        },
        litellm_params={},
        headers={},
    )

    chat_request = request["chatRequest"]
    assert chat_request["apiFormat"] == "COHERE"
    assert "responseFormat" in chat_request
    assert chat_request["responseFormat"]["type"] == "text"
def test_cohere_tool_call_only_message_no_text(self):
    """An assistant turn with tool calls and `content=None` converts to a CHATBOT entry.

    The converted entry must keep its tool call and have a message that is
    either None or the empty string.
    """
    weather_call = {
        "id": "call_1",
        "type": "function",
        "function": {
            "name": "get_weather",
            "arguments": '{"location": "Paris"}',
        },
    }
    conversation = [
        {"role": "user", "content": "What's the weather?"},
        {"role": "assistant", "content": None, "tool_calls": [weather_call]},
        {"role": "tool", "content": "Sunny, 25C", "tool_call_id": "call_1"},
    ]

    chat_history = OCIChatConfig().adapt_messages_to_cohere_standard(conversation)

    # Entry 0: the user's question, unchanged.
    user_entry = chat_history[0]
    assert user_entry.role == "USER"
    assert user_entry.message == "What's the weather?"

    # Entry 1: the assistant turn, with its tool call and no text.
    assistant_entry = chat_history[1]
    assert assistant_entry.role == "CHATBOT"
    assert assistant_entry.message is None or assistant_entry.message == ""
    assert assistant_entry.toolCalls is not None
    assert len(assistant_entry.toolCalls) == 1
    assert assistant_entry.toolCalls[0].name == "get_weather"
def test_cohere_chat_history_with_tool_calls(self):
"""Test chat history transformation with tool calls"""
config = OCIChatConfig()
@@ -45,68 +45,65 @@ def test_vertex_ai_anthropic_web_search_header_in_completion():
# Create the config instance
model_info = AnthropicModelInfo()
# Test the header generation directly
tools = [{"type": "web_search_20250305", "name": "web_search", "max_uses": 5}]
# Check if web search tool is detected
web_search_detected = model_info.is_web_search_tool_used(tools=tools)
assert web_search_detected is True, "Web search tool should be detected"
# Generate headers with is_vertex_request=True
headers = model_info.get_anthropic_headers(
api_key="test-key",
web_search_tool_used=web_search_detected,
is_vertex_request=True,
)
# Assert that the anthropic-beta header with web-search is present
assert "anthropic-beta" in headers, "anthropic-beta header should be present"
assert headers["anthropic-beta"] == "web-search-2025-03-05", \
f"anthropic-beta should be 'web-search-2025-03-05', got: {headers['anthropic-beta']}"
assert (
headers["anthropic-beta"] == "web-search-2025-03-05"
), f"anthropic-beta should be 'web-search-2025-03-05', got: {headers['anthropic-beta']}"
# Test that header is NOT added for non-Vertex requests
headers_non_vertex = model_info.get_anthropic_headers(
api_key="test-key",
web_search_tool_used=web_search_detected,
is_vertex_request=False,
)
# For non-Vertex (Anthropic-hosted), the web search header should NOT be in anthropic-beta
# because Anthropic doesn't require it
assert "anthropic-beta" not in headers_non_vertex or "web-search" not in headers_non_vertex.get("anthropic-beta", ""), \
"anthropic-beta with web-search should not be present for non-Vertex requests"
assert (
"anthropic-beta" not in headers_non_vertex
or "web-search" not in headers_non_vertex.get("anthropic-beta", "")
), "anthropic-beta with web-search should not be present for non-Vertex requests"
def test_vertex_ai_anthropic_context_management_compact_beta_header():
"""Test that context_management with compact adds the correct beta header for Vertex AI"""
config = VertexAIAnthropicConfig()
messages = [{"role": "user", "content": "Hello"}]
optional_params = {
"context_management": {
"edits": [
{
"type": "compact_20260112"
}
]
},
"context_management": {"edits": [{"type": "compact_20260112"}]},
"max_tokens": 100,
"is_vertex_request": True
"is_vertex_request": True,
}
result = config.transform_request(
model="claude-opus-4-6",
messages=messages,
optional_params=optional_params,
litellm_params={},
headers={}
headers={},
)
# Verify context_management is included
assert "context_management" in result
assert result["context_management"]["edits"][0]["type"] == "compact_20260112"
# Verify compact beta header is in anthropic_beta field
assert "anthropic_beta" in result
assert "compact-2026-01-12" in result["anthropic_beta"]
@@ -115,33 +112,27 @@ def test_vertex_ai_anthropic_context_management_compact_beta_header():
def test_vertex_ai_anthropic_context_management_mixed_edits():
"""Test that context_management with both compact and other edits adds both beta headers"""
config = VertexAIAnthropicConfig()
messages = [{"role": "user", "content": "Hello"}]
optional_params = {
"context_management": {
"edits": [
{
"type": "compact_20260112"
},
{
"type": "replace",
"message_id": "msg_123",
"content": "new content"
}
{"type": "compact_20260112"},
{"type": "replace", "message_id": "msg_123", "content": "new content"},
]
},
"max_tokens": 100,
"is_vertex_request": True
"is_vertex_request": True,
}
result = config.transform_request(
model="claude-opus-4-6",
messages=messages,
optional_params=optional_params,
litellm_params={},
headers={}
headers={},
)
# Verify both beta headers are present
assert "anthropic_beta" in result
assert "compact-2026-01-12" in result["anthropic_beta"]
@@ -151,58 +142,65 @@ def test_vertex_ai_anthropic_context_management_mixed_edits():
def test_vertex_ai_anthropic_structured_output_header_not_added():
"""Test that structured output beta headers are NOT added for Vertex AI requests"""
from litellm.llms.anthropic.chat.transformation import AnthropicConfig
config = AnthropicConfig()
# Test case 1: Vertex request with output_format should NOT add beta header
headers_vertex = {}
optional_params_vertex = {
'output_format': {
'type': 'json_schema',
'json_schema': {
'name': 'MathResult',
'schema': {'properties': {'result': {'type': 'integer'}}}
}
"output_format": {
"type": "json_schema",
"json_schema": {
"name": "MathResult",
"schema": {"properties": {"result": {"type": "integer"}}},
},
},
'is_vertex_request': True
"is_vertex_request": True,
}
result_vertex = config.update_headers_with_optional_anthropic_beta(headers_vertex, optional_params_vertex)
assert "anthropic-beta" not in result_vertex, \
f"Vertex request should NOT have anthropic-beta header for structured output, got: {result_vertex.get('anthropic-beta')}"
result_vertex = config.update_headers_with_optional_anthropic_beta(
headers_vertex, optional_params_vertex
)
assert (
"anthropic-beta" not in result_vertex
), f"Vertex request should NOT have anthropic-beta header for structured output, got: {result_vertex.get('anthropic-beta')}"
# Test case 2: Non-Vertex request with output_format SHOULD add beta header
headers_non_vertex = {}
optional_params_non_vertex = {
'output_format': {
'type': 'json_schema',
'json_schema': {
'name': 'MathResult',
'schema': {'properties': {'result': {'type': 'integer'}}}
}
"output_format": {
"type": "json_schema",
"json_schema": {
"name": "MathResult",
"schema": {"properties": {"result": {"type": "integer"}}},
},
},
'is_vertex_request': False
"is_vertex_request": False,
}
result_non_vertex = config.update_headers_with_optional_anthropic_beta(headers_non_vertex, optional_params_non_vertex)
assert "anthropic-beta" in result_non_vertex, \
"Non-Vertex request SHOULD have anthropic-beta header for structured output"
assert result_non_vertex["anthropic-beta"] == "structured-outputs-2025-11-13", \
f"Expected 'structured-outputs-2025-11-13', got: {result_non_vertex.get('anthropic-beta')}"
result_non_vertex = config.update_headers_with_optional_anthropic_beta(
headers_non_vertex, optional_params_non_vertex
)
assert (
"anthropic-beta" in result_non_vertex
), "Non-Vertex request SHOULD have anthropic-beta header for structured output"
assert (
result_non_vertex["anthropic-beta"] == "structured-outputs-2025-11-13"
), f"Expected 'structured-outputs-2025-11-13', got: {result_non_vertex.get('anthropic-beta')}"
def test_vertex_ai_claude_sonnet_4_5_structured_output_fix():
"""
Test fix for issue #18625: Claude Sonnet 4.5 on VertexAI should use tool-based
Test fix for issue #18625: Claude Sonnet 4.5 on VertexAI should use tool-based
structured outputs instead of output_format parameter.
This test verifies that:
1. Claude Sonnet 4.5 uses tool-based structured outputs on VertexAI
2. output_format parameter is removed from the final request
3. The fix prevents "Extra inputs are not permitted" error
"""
config = VertexAIAnthropicConfig()
# Test data matching the issue report
response_format = {
"type": "json_schema",
@@ -212,29 +210,23 @@ def test_vertex_ai_claude_sonnet_4_5_structured_output_fix():
"schema": {
"type": "object",
"properties": {
"question": {
"type": "string"
},
"response": {
"type": "string"
}
"question": {"type": "string"},
"response": {"type": "string"},
},
"required": ["question", "response"],
"additionalProperties": False
}
}
"additionalProperties": False,
},
},
}
messages = [
{"role": "user", "content": "Generate a question and answer about AI."}
]
messages = [{"role": "user", "content": "Generate a question and answer about AI."}]
# Test parameters that would trigger the issue
non_default_params = {
"response_format": response_format,
"max_tokens": 1000,
}
# Test 1: Verify map_openai_params forces tool-based approach for Claude Sonnet 4.5
optional_params = {}
result_params = config.map_openai_params(
@@ -243,17 +235,19 @@ def test_vertex_ai_claude_sonnet_4_5_structured_output_fix():
model="claude-3-5-sonnet-20241022", # Claude Sonnet 4.5 model
drop_params=False,
)
# Should have tools and tool_choice (tool-based approach)
assert "tools" in result_params, "Tools should be present for structured output"
assert "tool_choice" in result_params, "Tool choice should be present for structured output"
assert (
"tool_choice" in result_params
), "Tool choice should be present for structured output"
assert "json_mode" in result_params, "JSON mode should be enabled"
# Verify the tool is the response format tool
tools = result_params["tools"]
assert len(tools) == 1, "Should have exactly one tool for response format"
assert tools[0]["name"] == "json_tool_call", "Tool should be named json_tool_call"
# Test 2: Verify transform_request removes output_format parameter
# Simulate what would happen if parent class added output_format
test_data = {
@@ -264,20 +258,22 @@ def test_vertex_ai_claude_sonnet_4_5_structured_output_fix():
"tool_choice": result_params["tool_choice"],
"output_format": { # This would be added by parent class for Sonnet 4.5
"type": "json_schema",
"schema": response_format["json_schema"]["schema"]
}
"schema": response_format["json_schema"]["schema"],
},
}
# Mock the parent transform_request to return data with output_format
original_transform = config.__class__.__bases__[0].transform_request
def mock_transform_request(self, model, messages, optional_params, litellm_params, headers):
def mock_transform_request(
self, model, messages, optional_params, litellm_params, headers
):
# Return test data that includes output_format
return test_data.copy()
# Temporarily replace parent method
config.__class__.__bases__[0].transform_request = mock_transform_request
try:
final_data = config.transform_request(
model="claude-3-5-sonnet-20241022",
@@ -286,13 +282,15 @@ def test_vertex_ai_claude_sonnet_4_5_structured_output_fix():
litellm_params={},
headers={},
)
# Verify that output_format was removed (fixes the "Extra inputs are not permitted" error)
assert "output_format" not in final_data, "output_format should be removed for VertexAI"
assert (
"output_format" not in final_data
), "output_format should be removed for VertexAI"
assert "model" not in final_data, "model should be removed for VertexAI"
assert "tools" in final_data, "tools should still be present"
assert "tool_choice" in final_data, "tool_choice should still be present"
finally:
# Restore original method
config.__class__.__bases__[0].transform_request = original_transform
@@ -300,43 +298,149 @@ def test_vertex_ai_claude_sonnet_4_5_structured_output_fix():
def test_vertex_ai_anthropic_other_models_still_use_tools():
"""
Test that other Anthropic models (non-Sonnet 4.5) on VertexAI also use tool-based
Test that other Anthropic models (non-Sonnet 4.5) on VertexAI also use tool-based
structured outputs, ensuring consistency across all models.
"""
config = VertexAIAnthropicConfig()
response_format = {
"type": "json_schema",
"json_schema": {
"name": "test_schema",
"schema": {
"type": "object",
"properties": {
"result": {"type": "string"}
}
}
}
"schema": {"type": "object", "properties": {"result": {"type": "string"}}},
},
}
# Test with Claude 3 Sonnet (not 4.5)
non_default_params = {"response_format": response_format}
optional_params = {}
result_params = config.map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
model="claude-3-sonnet-20240229",
drop_params=False,
)
# Should still use tool-based approach
assert "tools" in result_params, "Claude 3 Sonnet should also use tool-based structured output"
assert (
"tools" in result_params
), "Claude 3 Sonnet should also use tool-based structured output"
assert "tool_choice" in result_params, "Tool choice should be present"
assert "json_mode" in result_params, "JSON mode should be enabled"
def test_vertex_ai_anthropic_extra_headers_beta_propagation():
    """A beta flag given via ``extra_headers`` must land in ``anthropic_beta``.

    Vertex AI requires beta flags in the request body (``anthropic_beta``
    array) rather than as HTTP headers, mirroring how the Bedrock handler
    extracts user-specified beta headers. The transformed request must carry
    the flag and must not contain an ``extra_headers`` entry.
    """
    request_params = {
        "max_tokens": 100,
        "is_vertex_request": True,
        "extra_headers": {
            "anthropic-beta": "interleaved-thinking-2025-05-14",
        },
    }

    request_body = VertexAIAnthropicConfig().transform_request(
        model="claude-sonnet-4-20250514",
        messages=[{"role": "user", "content": "Hello"}],
        optional_params=request_params,
        litellm_params={},
        headers={},
    )

    assert "anthropic_beta" in request_body
    assert "interleaved-thinking-2025-05-14" in request_body["anthropic_beta"]
    assert "extra_headers" not in request_body
def test_vertex_ai_anthropic_extra_headers_beta_merged_with_auto_betas():
    """User-specified betas are merged with auto-detected ones, not substituted.

    The request carries an explicit ``anthropic-beta`` header and a
    ``context_management`` edit of type ``compact_20260112``; the resulting
    ``anthropic_beta`` value must contain both the user beta and
    ``compact-2026-01-12``.
    """
    request_params = {
        "max_tokens": 100,
        "is_vertex_request": True,
        "extra_headers": {
            "anthropic-beta": "interleaved-thinking-2025-05-14",
        },
        "context_management": {"edits": [{"type": "compact_20260112"}]},
    }

    request_body = VertexAIAnthropicConfig().transform_request(
        model="claude-opus-4-6",
        messages=[{"role": "user", "content": "Hello"}],
        optional_params=request_params,
        litellm_params={},
        headers={},
    )

    assert "anthropic_beta" in request_body
    merged_betas = request_body["anthropic_beta"]
    assert "interleaved-thinking-2025-05-14" in merged_betas
    assert "compact-2026-01-12" in merged_betas
def test_vertex_ai_anthropic_extra_headers_comma_separated_betas():
    """Every value of a comma-separated ``anthropic-beta`` header is extracted."""
    request_params = {
        "max_tokens": 100,
        "is_vertex_request": True,
        "extra_headers": {
            "anthropic-beta": "interleaved-thinking-2025-05-14,dev-full-thinking-2025-05-14",
        },
    }

    request_body = VertexAIAnthropicConfig().transform_request(
        model="claude-sonnet-4-20250514",
        messages=[{"role": "user", "content": "Hello"}],
        optional_params=request_params,
        litellm_params={},
        headers={},
    )

    assert "anthropic_beta" in request_body
    extracted_betas = request_body["anthropic_beta"]
    assert "interleaved-thinking-2025-05-14" in extracted_betas
    assert "dev-full-thinking-2025-05-14" in extracted_betas
def test_vertex_ai_anthropic_no_extra_headers_unchanged():
    """Without ``extra_headers`` the transformed request gains no beta field."""
    request_body = VertexAIAnthropicConfig().transform_request(
        model="claude-sonnet-4-20250514",
        messages=[{"role": "user", "content": "Hello"}],
        optional_params={
            "max_tokens": 100,
            "is_vertex_request": True,
        },
        litellm_params={},
        headers={},
    )

    # Neither key should be present when no beta header was supplied.
    assert "anthropic_beta" not in request_body
    assert "extra_headers" not in request_body
def test_vertex_ai_partner_models_anthropic_remove_prompt_caching_scope_beta_header():
"""
Test that remove_unsupported_beta correctly filters out prompt-caching-scope-2026-01-05
Test that remove_unsupported_beta correctly filters out prompt-caching-scope-2026-01-05
from the anthropic-beta headers.
"""
from litellm.llms.vertex_ai.vertex_ai_partner_models.anthropic.experimental_pass_through.transformation import (
@@ -352,13 +456,18 @@ def test_vertex_ai_partner_models_anthropic_remove_prompt_caching_scope_beta_hea
headers = update_headers_with_filtered_beta(headers, "vertex_ai")
beta_header = headers.get("anthropic-beta")
assert PROMPT_CACHING_BETA_HEADER not in (beta_header or ""), \
f"{PROMPT_CACHING_BETA_HEADER} should be filtered out"
assert "other-feature" in (beta_header or ""), \
"Other non-excluded beta headers should remain"
assert "web-search-2025-03-05" in (beta_header or ""), \
"Other non-excluded beta headers should remain"
assert PROMPT_CACHING_BETA_HEADER not in (
beta_header or ""
), f"{PROMPT_CACHING_BETA_HEADER} should be filtered out"
assert "other-feature" in (
beta_header or ""
), "Other non-excluded beta headers should remain"
assert "web-search-2025-03-05" in (
beta_header or ""
), "Other non-excluded beta headers should remain"
# If prompt-caching was the only value, header should be removed completely
headers2 = {"anthropic-beta": PROMPT_CACHING_BETA_HEADER}
headers2 = update_headers_with_filtered_beta(headers2, "vertex_ai")
assert "anthropic-beta" not in headers2, "Header should be removed if no supported values remain"
assert (
"anthropic-beta" not in headers2
), "Header should be removed if no supported values remain"
@@ -0,0 +1,162 @@
"""
Tests for litellm/proxy/management_endpoints/common_utils.py
Covers the fix for GitHub issue #20304:
Empty guardrails/policies arrays sent by the UI should NOT trigger the
enterprise (premium) license check, but should still be applied so that
users can intentionally clear previously-set fields.
"""
from unittest.mock import patch
from litellm.proxy.management_endpoints.common_utils import (
_update_metadata_fields,
)
class TestUpdateMetadataFieldsEmptyCollections:
    """
    Regression suite for issue #20304.

    The UI submits empty collections (``[]`` / ``{}``) for enterprise-only
    fields such as guardrails, policies and logging even when the user never
    configured them. These tests pin the expectations for
    ``_update_metadata_fields``:

    * empty or ``None`` values never invoke ``_premium_user_check``;
    * empty values are still moved under ``metadata`` so a user can
      deliberately clear a previously-set field (e.g. ``guardrails: []``);
    * non-empty values keep invoking the premium check.
    """

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_empty_list_does_not_trigger_premium_check(self, mock_premium_check):
        """Empty lists on premium fields leave the premium check uncalled."""
        payload = {
            "team_id": "test-team",
            "guardrails": [],
            "policies": [],
            "logging": [],
        }

        _update_metadata_fields(updated_kv=payload)

        mock_premium_check.assert_not_called()

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_empty_list_still_updates_metadata(self, mock_premium_check):
        """
        Empty lists are still relocated from the top level into ``metadata``,
        which is what lets a user remove all guardrails/policies.
        """
        payload = {"team_id": "test-team", "guardrails": [], "policies": []}

        _update_metadata_fields(updated_kv=payload)

        # Both fields must have left the top level ...
        assert "guardrails" not in payload, (
            "guardrails should be popped from top-level"
        )
        assert "policies" not in payload, (
            "policies should be popped from top-level"
        )
        # ... and reappear, still empty, under metadata.
        metadata = payload["metadata"]
        assert metadata["guardrails"] == []
        assert metadata["policies"] == []

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_empty_dict_does_not_trigger_premium_check(self, mock_premium_check):
        """An empty dict on a premium field leaves the premium check uncalled."""
        payload = {"team_id": "test-team", "secret_manager_settings": {}}

        _update_metadata_fields(updated_kv=payload)

        mock_premium_check.assert_not_called()

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_empty_dict_still_updates_metadata(self, mock_premium_check):
        """
        An empty dict is still relocated into ``metadata`` so the field can be
        cleared.
        """
        payload = {"team_id": "test-team", "secret_manager_settings": {}}

        _update_metadata_fields(updated_kv=payload)

        assert "secret_manager_settings" not in payload, (
            "secret_manager_settings should be popped from top-level"
        )
        assert payload["metadata"]["secret_manager_settings"] == {}

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_none_value_does_not_trigger_premium_check(self, mock_premium_check):
        """``None`` on premium fields does not invoke the premium check."""
        payload = {"team_id": "test-team", "guardrails": None, "policies": None}

        _update_metadata_fields(updated_kv=payload)

        mock_premium_check.assert_not_called()

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_absent_fields_do_not_trigger_premium_check(self, mock_premium_check):
        """A payload without any premium field does not invoke the premium check."""
        payload = {"team_id": "test-team", "team_alias": "example-team"}

        _update_metadata_fields(updated_kv=payload)

        mock_premium_check.assert_not_called()

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_non_empty_list_triggers_premium_check(self, mock_premium_check):
        """A non-empty ``guardrails`` list invokes the premium check."""
        payload = {"team_id": "test-team", "guardrails": ["my-guardrail"]}

        _update_metadata_fields(updated_kv=payload)

        mock_premium_check.assert_called()

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_non_empty_value_triggers_premium_check(self, mock_premium_check):
        """A non-empty ``tags`` value invokes the premium check."""
        payload = {"team_id": "test-team", "tags": ["production"]}

        _update_metadata_fields(updated_kv=payload)

        mock_premium_check.assert_called()

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_non_empty_list_updates_metadata(self, mock_premium_check):
        """A non-empty list is relocated from the top level into ``metadata``."""
        payload = {"team_id": "test-team", "guardrails": ["my-guardrail"]}

        _update_metadata_fields(updated_kv=payload)

        assert "guardrails" not in payload
        assert payload["metadata"]["guardrails"] == ["my-guardrail"]

    @patch("litellm.proxy.management_endpoints.common_utils._premium_user_check")
    def test_ui_typical_payload_does_not_trigger_premium_check(self, mock_premium_check):
        """
        Replays the payload shape the UI sends when no enterprise feature is
        configured (taken from issue #20304); the premium check must stay
        uncalled.
        """
        # Empty collections appear both nested in metadata and at the top level.
        ui_payload = {
            "team_id": "67848772-1a8b-4343-938c-17e60f1db860",
            "team_alias": "example-team",
            "models": ["gpt-4"],
            "metadata": {
                "guardrails": [],
                "logging": [],
            },
            "policies": [],
        }

        _update_metadata_fields(updated_kv=ui_payload)

        mock_premium_check.assert_not_called()
@@ -229,3 +229,164 @@ def test_tool_call_arguments_are_chunked_to_match_openai_behavior():
assert sequence_numbers == sorted(sequence_numbers)
assert len(set(sequence_numbers)) == len(sequence_numbers) # All unique
def test_tool_call_delta_without_id_uses_index_mapping():
    """Follow-up deltas that omit ``id`` are routed by their ``index``.

    Only the first chunk carries ``id``/``name``; the remaining chunks carry
    just ``index`` and an argument fragment. The queued events must contain
    exactly one OUTPUT_ITEM_ADDED event (for ``call_abc123``) and argument
    deltas that concatenate to the full JSON arguments.
    """
    stream_iterator = LiteLLMCompletionStreamingIterator(
        model="test-model",
        litellm_custom_stream_wrapper=AsyncMock(),
        request_input="Test input",
        responses_api_request={},
    )

    opening_chunk = [
        {
            "index": 0,
            "id": "call_abc123",
            "type": "function",
            "function": {"name": "get_weather", "arguments": '{"lo'},
        }
    ]
    # Subsequent chunks have no "id"; only the index ties them to the call.
    id_less_chunks = [
        [{"index": 0, "type": "function", "function": {"arguments": fragment}}]
        for fragment in ('cation":', ' "New', ' York"}')
    ]

    for tool_calls in [opening_chunk, *id_less_chunks]:
        stream_iterator._queue_tool_call_delta_events(tool_calls)

    # Drain the pending queue in FIFO order.
    drained_events = []
    while stream_iterator._pending_tool_events:
        drained_events.append(stream_iterator._pending_tool_events.pop(0))

    argument_deltas = []
    added_items = []
    for event in drained_events:
        if event.type == ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DELTA:
            argument_deltas.append(event)
        if event.type == ResponsesAPIStreamEvents.OUTPUT_ITEM_ADDED:
            added_items.append(event)

    streamed_arguments = "".join(event.delta for event in argument_deltas)
    assert streamed_arguments == '{"location": "New York"}'

    assert len(added_items) == 1
    assert added_items[0].item.id == "call_abc123"
def test_parallel_tool_calls_without_ids_use_index_mapping():
    """Two parallel tool calls keep their argument streams separate by index.

    The first batch announces ``call_a`` (index 0) and ``call_b`` (index 1);
    the second batch carries id-less fragments for both indices. Each call
    must end up with its own OUTPUT_ITEM_ADDED event and its own concatenated
    arguments.
    """
    stream_iterator = LiteLLMCompletionStreamingIterator(
        model="test-model",
        litellm_custom_stream_wrapper=AsyncMock(),
        request_input="Test input",
        responses_api_request={},
    )

    announced_calls = [
        {
            "index": 0,
            "id": "call_a",
            "type": "function",
            "function": {"name": "tool_a", "arguments": '{"x":'},
        },
        {
            "index": 1,
            "id": "call_b",
            "type": "function",
            "function": {"name": "tool_b", "arguments": '{"y":'},
        },
    ]
    id_less_fragments = [
        {"index": 0, "type": "function", "function": {"arguments": "1}"}},
        {"index": 1, "type": "function", "function": {"arguments": "2}"}},
    ]
    stream_iterator._queue_tool_call_delta_events(announced_calls)
    stream_iterator._queue_tool_call_delta_events(id_less_fragments)

    # Drain the pending queue in FIFO order.
    drained_events = []
    while stream_iterator._pending_tool_events:
        drained_events.append(stream_iterator._pending_tool_events.pop(0))

    added_items = [
        event
        for event in drained_events
        if event.type == ResponsesAPIStreamEvents.OUTPUT_ITEM_ADDED
    ]
    assert len(added_items) == 2

    # Concatenate argument fragments per call id.
    arguments_by_call_id = {}
    for event in drained_events:
        if event.type != ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DELTA:
            continue
        arguments_by_call_id[event.item_id] = (
            arguments_by_call_id.get(event.item_id, "") + event.delta
        )

    assert arguments_by_call_id["call_a"] == '{"x":1}'
    assert arguments_by_call_id["call_b"] == '{"y":2}'
def test_reused_index_with_new_call_id_marks_fallback_ambiguous():
    """An id-less delta on a reused index must not be attributed to either call.

    Index 0 is first used by ``call_a`` and then reused by ``call_b``. A third
    chunk on index 0 without an ``id`` is ambiguous, so its fragment must not
    be appended to the arguments of either call.
    """
    stream_iterator = LiteLLMCompletionStreamingIterator(
        model="test-model",
        litellm_custom_stream_wrapper=AsyncMock(),
        request_input="Test input",
        responses_api_request={},
    )

    chunk_sequence = [
        [
            {
                "index": 0,
                "id": "call_a",
                "type": "function",
                "function": {"name": "tool_a", "arguments": '{"a":'},
            }
        ],
        [
            {
                "index": 0,
                "id": "call_b",
                "type": "function",
                "function": {"name": "tool_b", "arguments": '{"b":'},
            }
        ],
        # Ambiguous chunk: index reused and id missing. We should skip fallback rather than misroute.
        [
            {
                "index": 0,
                "type": "function",
                "function": {"arguments": "1}"},
            }
        ],
    ]
    for tool_calls in chunk_sequence:
        stream_iterator._queue_tool_call_delta_events(tool_calls)

    # Drain the pending queue in FIFO order.
    drained_events = []
    while stream_iterator._pending_tool_events:
        drained_events.append(stream_iterator._pending_tool_events.pop(0))

    # Concatenate argument fragments per call id.
    arguments_by_call_id = {}
    for event in drained_events:
        if event.type != ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DELTA:
            continue
        arguments_by_call_id[event.item_id] = (
            arguments_by_call_id.get(event.item_id, "") + event.delta
        )

    assert arguments_by_call_id["call_a"] == '{"a":'
    assert arguments_by_call_id["call_b"] == '{"b":'
    assert arguments_by_call_id["call_a"] != '{"a":1}'
    assert arguments_by_call_id["call_b"] != '{"b":1}'
+121
View File
@@ -1869,3 +1869,124 @@ async def test_aguardrail():
assert result["result"] == "success"
assert result["selected_guardrail"]["id"] == "guardrail-1"
@pytest.mark.asyncio
async def test_anthropic_messages_call_type_is_cached():
    """
    Regression test: a success event whose call type is ``anthropic_messages``
    must result in the deployment's model id being stored by
    ``PromptCachingDeploymentCheck.async_log_success_event`` and retrievable
    through ``PromptCachingCache.async_get_model_id``.
    """
    import asyncio

    from litellm.caching.dual_cache import DualCache
    from litellm.router_utils.pre_call_checks.prompt_caching_deployment_check import (
        PromptCachingDeploymentCheck,
    )
    from litellm.router_utils.prompt_caching_cache import PromptCachingCache
    from litellm.types.utils import (
        CallTypes,
        StandardLoggingHiddenParams,
        StandardLoggingMetadata,
        StandardLoggingModelInformation,
        StandardLoggingPayload,
    )

    shared_cache = DualCache()
    deployment_check = PromptCachingDeploymentCheck(cache=shared_cache)
    prompt_cache = PromptCachingCache(cache=shared_cache)

    # Create messages with enough tokens to pass the caching threshold
    test_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "test long message here" * 1024,
                    "cache_control": {"type": "ephemeral", "ttl": "5m"},
                }
            ],
        }
    ]
    test_model_id = "test-model-id-123"

    # Baseline logging payload, built inline; request-specific fields are
    # overridden right below.
    payload = StandardLoggingPayload(
        id="test_id",
        call_type="completion",
        response_cost=0.1,
        response_cost_failure_debug_info=None,
        status="success",
        total_tokens=30,
        prompt_tokens=20,
        completion_tokens=10,
        startTime=1234567890.0,
        endTime=1234567891.0,
        completionStartTime=1234567890.5,
        model_map_information=StandardLoggingModelInformation(
            model_map_key="gpt-3.5-turbo", model_map_value=None
        ),
        model="gpt-3.5-turbo",
        model_id="model-123",
        model_group="openai-gpt",
        api_base="https://api.openai.com",
        metadata=StandardLoggingMetadata(
            user_api_key_hash="test_hash",
            user_api_key_org_id=None,
            user_api_key_alias="test_alias",
            user_api_key_team_id="test_team",
            user_api_key_user_id="test_user",
            user_api_key_team_alias="test_team_alias",
            spend_logs_metadata=None,
            requester_ip_address="127.0.0.1",
            requester_metadata=None,
        ),
        cache_hit=False,
        cache_key=None,
        saved_cache_cost=0.0,
        request_tags=[],
        end_user=None,
        requester_ip_address="127.0.0.1",
        messages=[{"role": "user", "content": "Hello, world!"}],
        response={"choices": [{"message": {"content": "Hi there!"}}]},
        error_str=None,
        model_parameters={"stream": True},
        hidden_params=StandardLoggingHiddenParams(
            model_id="model-123",
            cache_key=None,
            api_base="https://api.openai.com",
            response_cost="0.1",
            additional_headers=None,
        ),
    )

    # Turn the baseline into an anthropic_messages request for the test model.
    payload["call_type"] = CallTypes.anthropic_messages.value
    payload["messages"] = test_messages
    payload["model"] = "anthropic/claude-3-5-sonnet-20240620"
    payload["model_id"] = test_model_id

    # Log the success event (should cache the model_id)
    await deployment_check.async_log_success_event(
        kwargs={"standard_logging_object": payload},
        response_obj={},
        start_time=1234567890.0,
        end_time=1234567891.0,
    )

    # Small delay to ensure cache write completes
    await asyncio.sleep(0.1)

    # Look the model id up again through the same underlying cache.
    cached_result = await prompt_cache.async_get_model_id(
        messages=test_messages,
        tools=None,
    )

    # This assertion will FAIL if anthropic_messages is filtered out
    assert (
        cached_result is not None
    ), "Model ID should be cached for anthropic_messages call type"
    assert (
        cached_result["model_id"] == test_model_id
    ), f"Expected {test_model_id}, got {cached_result['model_id']}"
+175
View File
@@ -916,6 +916,181 @@ def test_encode_video_id_with_provider_handles_azure_video_prefix():
)
assert encoded_twice == encoded_id # Should return the same encoded ID
class TestVideoListTransformation:
    """Video list request/response transformation with provider-encoded IDs."""

    @staticmethod
    def _mock_list_response(payload):
        """Return a mock HTTP response whose ``json()`` yields ``payload``."""
        http_response = MagicMock()
        http_response.json.return_value = payload
        return http_response

    def test_transform_video_list_response_encodes_first_id_and_last_id(self):
        """``data[].id``, ``first_id`` and ``last_id`` all carry provider metadata."""
        from litellm.types.videos.utils import decode_video_id_with_provider

        upstream_payload = {
            "object": "list",
            "data": [
                {
                    "id": "video_aaa",
                    "object": "video",
                    "model": "sora-2",
                    "status": "completed",
                },
                {
                    "id": "video_bbb",
                    "object": "video",
                    "model": "sora-2",
                    "status": "completed",
                },
            ],
            "first_id": "video_aaa",
            "last_id": "video_bbb",
            "has_more": False,
        }

        result = OpenAIVideoConfig().transform_video_list_response(
            raw_response=self._mock_list_response(upstream_payload),
            logging_obj=MagicMock(),
            custom_llm_provider="azure",
        )

        # data[].id should be encoded
        for entry in result["data"]:
            assert (
                decode_video_id_with_provider(entry["id"])["custom_llm_provider"]
                == "azure"
            )

        # first_id and last_id should also be encoded
        for cursor_key, raw_video_id in (
            ("first_id", "video_aaa"),
            ("last_id", "video_bbb"),
        ):
            decoded_cursor = decode_video_id_with_provider(result[cursor_key])
            assert decoded_cursor["custom_llm_provider"] == "azure"
            assert decoded_cursor["video_id"] == raw_video_id
            assert decoded_cursor["model_id"] == "sora-2"

    def test_transform_video_list_response_no_provider_leaves_ids_unchanged(self):
        """With ``custom_llm_provider=None`` every ID is returned untouched."""
        upstream_payload = {
            "object": "list",
            "data": [
                {"id": "video_aaa", "object": "video", "model": "sora-2", "status": "completed"},
            ],
            "first_id": "video_aaa",
            "last_id": "video_aaa",
            "has_more": False,
        }

        result = OpenAIVideoConfig().transform_video_list_response(
            raw_response=self._mock_list_response(upstream_payload),
            logging_obj=MagicMock(),
            custom_llm_provider=None,
        )

        assert result["data"][0]["id"] == "video_aaa"
        assert result["first_id"] == "video_aaa"
        assert result["last_id"] == "video_aaa"

    def test_transform_video_list_response_missing_pagination_fields(self):
        """A response without ``first_id`` / ``last_id`` is handled without raising."""
        from litellm.types.videos.utils import decode_video_id_with_provider

        upstream_payload = {
            "object": "list",
            "data": [
                {"id": "video_aaa", "object": "video", "model": "sora-2", "status": "completed"},
            ],
            "has_more": False,
        }

        result = OpenAIVideoConfig().transform_video_list_response(
            raw_response=self._mock_list_response(upstream_payload),
            logging_obj=MagicMock(),
            custom_llm_provider="azure",
        )

        # data[].id should still be encoded
        decoded_entry = decode_video_id_with_provider(result["data"][0]["id"])
        assert decoded_entry["custom_llm_provider"] == "azure"

        # first_id / last_id should not be present
        assert "first_id" not in result
        assert "last_id" not in result

    def test_transform_video_list_request_decodes_after_parameter(self):
        """An encoded ``after`` cursor is decoded back to the raw provider ID."""
        from litellm.types.videos.utils import encode_video_id_with_provider

        raw_id = "video_69888baee890819086dd3366bfc372fe"

        # Only the query params are inspected; the returned URL is ignored.
        _, params = OpenAIVideoConfig().transform_video_list_request(
            api_base="https://my-resource.openai.azure.com/openai/v1/videos",
            litellm_params=MagicMock(),
            headers={},
            after=encode_video_id_with_provider(raw_id, "azure", "sora-2"),
            limit=10,
        )

        assert params["after"] == raw_id
        assert params["limit"] == "10"

    def test_transform_video_list_request_passes_through_plain_after(self):
        """A plain (non-encoded) ``after`` value is forwarded unchanged."""
        _, params = OpenAIVideoConfig().transform_video_list_request(
            api_base="https://api.openai.com/v1/videos",
            litellm_params=MagicMock(),
            headers={},
            after="video_plain_id",
        )

        assert params["after"] == "video_plain_id"

    def test_transform_video_list_roundtrip(self):
        """``last_id`` from a list response decodes correctly when reused as ``after``."""
        config = OpenAIVideoConfig()

        # Simulate a list response
        upstream_payload = {
            "object": "list",
            "data": [
                {"id": "video_aaa", "object": "video", "model": "sora-2", "status": "completed"},
                {"id": "video_bbb", "object": "video", "model": "sora-2", "status": "completed"},
            ],
            "first_id": "video_aaa",
            "last_id": "video_bbb",
            "has_more": True,
        }
        first_page = config.transform_video_list_response(
            raw_response=self._mock_list_response(upstream_payload),
            logging_obj=MagicMock(),
            custom_llm_provider="azure",
        )

        # Use the encoded last_id as the 'after' cursor for the next page
        _, params = config.transform_video_list_request(
            api_base="https://my-resource.openai.azure.com/openai/v1/videos",
            litellm_params=MagicMock(),
            headers={},
            after=first_page["last_id"],
        )

        # The after param sent to the upstream API should be the raw video ID
        assert params["after"] == "video_bbb"
class TestVideoEndpointsProxyLitellmParams:
"""Test that video proxy endpoints (status, content, remix) respect litellm_params from proxy config."""
+2
View File
@@ -84,6 +84,8 @@
"mermaid": ">=11.10.0",
"js-yaml": ">=4.1.1",
"glob": ">=11.1.0",
"tar": ">=7.5.7",
"@isaacs/brace-expansion": ">=5.0.1",
"node-forge": ">=1.3.2",
"lodash-es": ">=4.17.23",
"lodash": ">=4.17.23"
@@ -542,3 +542,86 @@ it("should display 'Default Proxy Admin' for created_by when value is 'default_u
expect(defaultProxyAdminElements.length).toBeGreaterThan(0);
});
});
it("should render table without crashing when models is null", async () => {
  // Force `models` to null even though the key type declares string[].
  const nullModelsKey = { ...mockKey, models: null as unknown as string[] };

  const filterState = {
    "Team ID": "",
    "Organization ID": "",
    "Key Alias": "",
    "User ID": "",
    "Sort By": "created_at",
    "Sort Order": "desc",
  };

  mockUseFilterLogic.mockReturnValue({
    filters: filterState,
    filteredKeys: [nullModelsKey],
    allKeyAliases: ["test-key-alias"],
    allTeams: [mockTeam],
    allOrganizations: [mockOrganization],
    handleFilterChange: vi.fn(),
    handleFilterReset: vi.fn(),
  });

  // Rendering itself is the assertion here: it must not throw.
  renderWithProviders(
    <VirtualKeysTable
      teams={[mockTeam]}
      organizations={[mockOrganization]}
      onSortChange={vi.fn()}
      currentSort={{ sortBy: "created_at", sortOrder: "desc" as const }}
    />,
  );

  await waitFor(() => {
    expect(screen.getByText("Test Key Alias")).toBeInTheDocument();
  });
});
it("should render table without crashing when models is undefined", async () => {
  // Force `models` to undefined even though the key type declares string[].
  const undefinedModelsKey = { ...mockKey, models: undefined as unknown as string[] };

  const filterState = {
    "Team ID": "",
    "Organization ID": "",
    "Key Alias": "",
    "User ID": "",
    "Sort By": "created_at",
    "Sort Order": "desc",
  };

  mockUseFilterLogic.mockReturnValue({
    filters: filterState,
    filteredKeys: [undefinedModelsKey],
    allKeyAliases: ["test-key-alias"],
    allTeams: [mockTeam],
    allOrganizations: [mockOrganization],
    handleFilterChange: vi.fn(),
    handleFilterReset: vi.fn(),
  });

  // Rendering itself is the assertion here: it must not throw.
  renderWithProviders(
    <VirtualKeysTable
      teams={[mockTeam]}
      organizations={[mockOrganization]}
      onSortChange={vi.fn()}
      currentSort={{ sortBy: "created_at", sortOrder: "desc" as const }}
    />,
  );

  await waitFor(() => {
    expect(screen.getByText("Test Key Alias")).toBeInTheDocument();
  });
});
@@ -727,7 +727,7 @@ export function VirtualKeysTable({ teams, organizations, onSortChange, currentSo
whiteSpace: "pre-wrap",
overflow: "hidden",
}}
className={`py-0.5 max-h-8 overflow-hidden text-ellipsis whitespace-nowrap ${cell.column.id === "models" && (cell.getValue() as string[]).length > 3 ? "px-0" : ""}`}
className={`py-0.5 max-h-8 overflow-hidden text-ellipsis whitespace-nowrap ${cell.column.id === "models" && Array.isArray(cell.getValue()) && (cell.getValue() as string[]).length > 3 ? "px-0" : ""}`}
>
{flexRender(cell.column.columnDef.cell, cell.getContext())}
</TableCell>
@@ -465,8 +465,8 @@ const TeamInfoView: React.FC<TeamInfoProps> = ({
budget_duration: values.budget_duration,
metadata: {
...parsedMetadata,
guardrails: values.guardrails || [],
logging: values.logging_settings || [],
...(values.guardrails?.length > 0 ? { guardrails: values.guardrails } : {}),
...(values.logging_settings?.length > 0 ? { logging: values.logging_settings } : {}),
disable_global_guardrails: values.disable_global_guardrails || false,
soft_budget_alerting_emails:
typeof values.soft_budget_alerting_emails === "string"
@@ -477,7 +477,7 @@ const TeamInfoView: React.FC<TeamInfoProps> = ({
: values.soft_budget_alerting_emails || [],
...(secretManagerSettings !== undefined ? { secret_manager_settings: secretManagerSettings } : {}),
},
policies: values.policies || [],
...(values.policies?.length > 0 ? { policies: values.policies } : {}),
organization_id: values.organization_id,
};
@@ -85,7 +85,7 @@ export function LogDetailsDrawer({
// Check if request/response data is present
const hasMessages = checkHasMessages(logEntry.messages);
const hasResponse = checkHasResponse(logEntry.response);
const missingData = !hasMessages && !hasResponse;
const missingData = !hasMessages && !hasResponse && !hasError;
// Guardrail data
const guardrailInfo = metadata?.guardrail_information;
@@ -206,6 +206,7 @@ export function LogDetailsDrawer({
{/* Request/Response JSON - Collapsible */}
<RequestResponseSection
hasResponse={hasResponse}
hasError={hasError}
getRawRequest={getRawRequest}
getFormattedResponse={getFormattedResponse}
logEntry={logEntry}
@@ -339,6 +340,7 @@ function MetricsSection({ logEntry, metadata }: { logEntry: LogEntry; metadata:
interface RequestResponseSectionProps {
hasResponse: boolean;
hasError: boolean;
getRawRequest: () => any;
getFormattedResponse: () => any;
logEntry: LogEntry;
@@ -346,6 +348,7 @@ interface RequestResponseSectionProps {
function RequestResponseSection({
hasResponse,
hasError,
getRawRequest,
getFormattedResponse,
logEntry,
@@ -423,7 +426,7 @@ function RequestResponseSection({
text: getCopyText(),
tooltips: ["Copy JSON", "Copied!"]
}}
disabled={activeTab === TAB_RESPONSE && !hasResponse}
disabled={activeTab === TAB_RESPONSE && !hasResponse && !hasError}
/>
}
items={[
@@ -441,7 +444,7 @@ function RequestResponseSection({
label: "Response",
children: (
<div style={{ paddingTop: SPACING_XLARGE, paddingBottom: SPACING_XLARGE }}>
{hasResponse ? (
{hasResponse || hasError ? (
<JsonViewer data={getFormattedResponse()} mode="formatted" />
) : (
<div style={{ textAlign: "center", padding: 20, color: "#999", fontStyle: "italic" }}>
@@ -188,4 +188,78 @@ describe("RequestResponsePanel", () => {
expect(responseData).toEqual({ responseData: "this should appear in response" });
expect(responseData).not.toEqual({ requestData: "this should not appear in response" });
});
it("should show error response data when hasError is true and hasResponse is false", () => {
  // Failed request: no messages, empty response, error details in metadata.
  const failedLogEntry: LogEntry = {
    ...baseLogEntry,
    messages: [],
    response: {},
    metadata: {
      status: "failure",
      error_information: {
        error_message: "Model not found",
        error_class: "NotFoundError",
        error_code: 404,
      },
      additional_usage_values: {
        cache_read_input_tokens: 0,
        cache_creation_input_tokens: 0,
      },
    },
  };

  const getRawRequestMock = vi.fn().mockReturnValue({ messages: [] });
  const formattedResponseMock = vi.fn().mockReturnValue({
    error: { message: "Model not found", type: "NotFoundError", code: 404, param: null },
  });

  render(
    <RequestResponsePanel
      row={{ original: failedLogEntry }}
      hasMessages={false}
      hasResponse={false}
      hasError={true}
      errorInfo={failedLogEntry.metadata.error_information}
      getRawRequest={getRawRequestMock}
      formattedResponse={formattedResponseMock}
    />,
  );

  // The error payload is rendered instead of the placeholder text.
  expect(screen.queryByText("Response data not available")).not.toBeInTheDocument();
  expect(formattedResponseMock).toHaveBeenCalled();

  // The copy-response button stays usable for error responses.
  const copyResponseButton = screen
    .getAllByRole("button")
    .find((candidate) => candidate.getAttribute("title") === "Copy response");
  expect(copyResponseButton).not.toBeDisabled();
});
it("should show Response data not available when hasResponse and hasError are both false", () => {
  const getRawRequestMock = vi.fn().mockReturnValue({ messages: [] });
  const formattedResponseMock = vi.fn().mockReturnValue({});

  // Neither a response nor an error: the placeholder text is expected.
  render(
    <RequestResponsePanel
      row={{ original: baseLogEntry }}
      hasMessages={false}
      hasResponse={false}
      hasError={false}
      errorInfo={null}
      getRawRequest={getRawRequestMock}
      formattedResponse={formattedResponseMock}
    />,
  );

  expect(screen.getByText("Response data not available")).toBeInTheDocument();
});
it("should show error code in response header when hasError is true", () => {
  const rateLimitError = {
    error_message: "Rate limit exceeded",
    error_class: "RateLimitError",
    error_code: 429,
  };
  const getRawRequestMock = vi.fn().mockReturnValue({ messages: [] });
  const formattedResponseMock = vi.fn().mockReturnValue({
    error: { message: "Rate limit exceeded", type: "RateLimitError", code: 429, param: null },
  });

  render(
    <RequestResponsePanel
      row={{ original: baseLogEntry }}
      hasMessages={false}
      hasResponse={false}
      hasError={true}
      errorInfo={rateLimitError}
      getRawRequest={getRawRequestMock}
      formattedResponse={formattedResponseMock}
    />,
  );

  // The numeric error code must be visible in the rendered output.
  expect(screen.getByText(/HTTP code 429/)).toBeInTheDocument();
});
});
@@ -113,7 +113,7 @@ export function RequestResponsePanel({
onClick={handleCopyResponse}
className="p-1 hover:bg-gray-200 rounded"
title="Copy response"
disabled={!hasResponse}
disabled={!hasResponse && !hasError}
>
<svg
xmlns="http://www.w3.org/2000/svg"
@@ -132,7 +132,7 @@ export function RequestResponsePanel({
</button>
</div>
<div className="p-4 overflow-auto max-h-96 w-full max-w-full box-border">
{hasResponse ? (
{hasResponse || hasError ? (
<div className="[&_[role='tree']]:bg-white [&_[role='tree']]:text-slate-900">
<JsonView data={formattedResponse()} style={defaultStyles} clickToExpandNode />
</div>
@@ -822,7 +822,7 @@ export function RequestViewer({ row, onOpenSettings }: { row: Row<LogEntry>; onO
? row.original.messages.length > 0
: Object.keys(row.original.messages).length > 0);
const hasResponse = row.original.response && Object.keys(formatData(row.original.response)).length > 0;
const missingData = !hasMessages && !hasResponse;
const missingData = !hasMessages && !hasResponse && !hasError;
// Format the response with error details if present
const formattedResponse = () => {