From f028a622e2f99d83a67410166532d975bb075ccf Mon Sep 17 00:00:00 2001 From: Mateo Wang <277851410+mateo-berri@users.noreply.github.com> Date: Wed, 13 May 2026 17:40:59 -0700 Subject: [PATCH] fix(prometheus): emit `litellm_remaining_tokens_metric` for Bedrock and Vertex (#27705) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(prometheus): emit remaining_tokens/requests gauges for bedrock + vertex (LIT-2719) Bedrock and Vertex AI never return x-ratelimit-remaining-* response headers, so litellm_remaining_tokens_metric / litellm_remaining_requests_metric only fired for OpenAI / Azure / Anthropic deployments even when tpm/rpm was configured on the router. Add a provider-agnostic fallback in PrometheusLogger.async_log_success_event that asks Router.get_remaining_model_group_usage() for the same model_group and emits the gauges with configured_limit - current_usage when the upstream provider didn't populate the headers itself. Existing OpenAI / Azure / Anthropic flows are unchanged because the fallback short-circuits when both header values are already present. Tests: 8 new tests covering bedrock + vertex emission, header short-circuit, partial-header fill, llm_router=None, missing model_group, empty router result, and router exception swallowing. Co-authored-by: Mateo Wang * fix(prometheus): narrow except to ImportError, log router lookup failures via verbose_logger.exception Address greptile review: - The optional 'from litellm.proxy.proxy_server import llm_router' should guard against ImportError specifically, not all exceptions, so that unexpected errors (e.g. AttributeError from partially-initialized state) stay visible. - get_remaining_model_group_usage failures are now logged via verbose_logger.exception (with traceback) instead of debug, matching the PR description's intent and avoiding silent loss of router-cache errors in production. Co-authored-by: Mateo Wang * fix(prometheus): subtract in-flight delta in router-remaining fallback The router's TPM/RPM counter is incremented by Router.deployment_callback_on_success, which fires alongside this prometheus callback in the success-log fan-out. Prometheus wins the race, so get_remaining_model_group_usage returns the pre-decrement counter for the current request — while vendor headers (OpenAI/Anthropic/Azure) are already post-decrement. That broke parity between providers on the same gauge: dashboards plotting litellm_remaining_requests_metric showed Bedrock/Vertex perpetually one request behind Anthropic for the same throughput. Replay the in-flight increment before emit: subtract total_tokens from remaining_tokens and 1 from remaining_requests. * Revert "fix(prometheus): subtract in-flight delta in router-remaining fallback" This reverts commit 001ce95ecdd952b4b5a23dd2b1e62c4562c932bc. * fix(router): post-decrement router-derived ratelimit headers Router.set_response_headers injects x-ratelimit-remaining-{tokens, requests} for providers that don't return them natively (Bedrock, Vertex). The values come from get_remaining_model_group_usage, which reads the router's TPM/RPM counter — incremented post-response by deployment_callback_on_success. So the headers reflected the counter state before the current request was counted: pre-decrement. Vendor headers from OpenAI/Anthropic/Azure are post-decrement (the vendor counted the request before responding). Same metric name, two semantics — dashboards plotting litellm_remaining_requests_metric showed Bedrock/Vertex perpetually one request behind for the same throughput, and the HTTP response headers exposed the same skew to clients. Subtract the in-flight delta before writing: 1 from remaining-requests, response.usage.total_tokens from remaining-tokens. Fixes both the response headers and (transitively) the prometheus gauges that read from standard_logging_payload.additional_headers. --------- Co-authored-by: cursor Co-authored-by: Mateo Wang --- litellm/integrations/prometheus.py | 104 ++++++ litellm/router.py | 22 +- .../test_router_helper_utils.py | 73 +++++ ...etheus_remaining_tokens_router_fallback.py | 298 ++++++++++++++++++ 4 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 tests/test_litellm/integrations/test_prometheus_remaining_tokens_router_fallback.py diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index f9b1c66643..30af0dcb8e 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -1226,6 +1226,17 @@ class PrometheusLogger(CustomLogger): label_context=label_context, ) + # Provider-agnostic fallback: providers like Bedrock and Vertex don't return + # x-ratelimit-remaining-* headers, so the gauges above only fire for OpenAI / + # Anthropic / Azure. When the proxy router has tpm/rpm configured for the + # model_group, derive remaining from configured-limit minus current usage so + # the same metric is populated for any provider. + await self._async_set_router_remaining_metrics( + standard_logging_payload=standard_logging_payload, # type: ignore + enum_values=enum_values, + label_context=label_context, + ) + # cache metrics self._increment_cache_metrics( standard_logging_payload=standard_logging_payload, # type: ignore @@ -2199,6 +2210,99 @@ class PrometheusLogger(CustomLogger): ) self.litellm_deployment_rpm_limit.labels(**_labels).set(rpm) + async def _async_set_router_remaining_metrics( + self, + standard_logging_payload: StandardLoggingPayload, + enum_values: UserAPIKeyLabelValues, + label_context: Optional[PrometheusLabelFactoryContext] = None, + ) -> None: + """ + Populate ``litellm_remaining_tokens_metric`` / + ``litellm_remaining_requests_metric`` from the router's internal usage + counters when the upstream provider did not return + ``x-ratelimit-remaining-*`` response headers. + + OpenAI / Anthropic / Azure return remaining tokens/requests in response + headers, but Bedrock and Vertex AI do not. This fallback computes + ``configured_limit - current_usage`` via + ``Router.get_remaining_model_group_usage`` so the same gauges are + emitted for every provider when tpm/rpm is configured on the + deployment. + """ + try: + additional_headers = ( + standard_logging_payload.get("hidden_params", {}) or {} + ).get("additional_headers") or {} + + already_have_tokens = ( + additional_headers.get("x_ratelimit_remaining_tokens") is not None + ) + already_have_requests = ( + additional_headers.get("x_ratelimit_remaining_requests") is not None + ) + if already_have_tokens and already_have_requests: + return + + model_group = standard_logging_payload.get("model_group") + if not model_group: + return + + try: + from litellm.proxy.proxy_server import llm_router + except ImportError: + llm_router = None + + if llm_router is None: + return + + try: + remaining_usage = await llm_router.get_remaining_model_group_usage( + model_group + ) + except Exception as e: + verbose_logger.exception( + "Prometheus: get_remaining_model_group_usage failed for " + "model_group=%s: %s", + model_group, + e, + ) + return + + if not remaining_usage: + return + + remaining_tokens = remaining_usage.get("x-ratelimit-remaining-tokens") + remaining_requests = remaining_usage.get("x-ratelimit-remaining-requests") + + if not already_have_tokens and remaining_tokens is not None: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_remaining_tokens_metric" + ), + enum_values=enum_values, + label_context=label_context, + ) + self.litellm_remaining_tokens_metric.labels(**_labels).set( + remaining_tokens + ) + + if not already_have_requests and remaining_requests is not None: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_remaining_requests_metric" + ), + enum_values=enum_values, + label_context=label_context, + ) + self.litellm_remaining_requests_metric.labels(**_labels).set( + remaining_requests + ) + except Exception as e: + verbose_logger.exception( + "Prometheus Error: _async_set_router_remaining_metrics. " + "Exception occured - {}".format(str(e)) + ) + def set_llm_deployment_success_metrics( self, request_kwargs: dict, diff --git a/litellm/router.py b/litellm/router.py index 5e30ae618c..1d070b3af8 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -8771,9 +8771,29 @@ class Router: model_group ) + # get_remaining_model_group_usage reads the router's TPM/RPM + # counter, which is incremented post-response by + # deployment_callback_on_success. So the values returned here + # are pre-decrement for the current request, while vendor + # headers (OpenAI/Anthropic/Azure) are post-decrement. Replay + # the in-flight increment so router-derived headers match + # vendor-derived semantics — for both the HTTP response sent + # to the client and the prometheus gauges that read these + # headers downstream (LIT-2719). + in_flight_tokens = 0 + usage = getattr(response, "usage", None) + if usage is not None: + in_flight_tokens = getattr(usage, "total_tokens", 0) or 0 + in_flight_delta = { + "x-ratelimit-remaining-tokens": in_flight_tokens, + "x-ratelimit-remaining-requests": 1, + } + for header, value in remaining_usage.items(): if value is not None: - additional_headers[header] = value + additional_headers[header] = value - in_flight_delta.get( + header, 0 + ) return response def _build_model_name_index(self, model_list: list) -> None: diff --git a/tests/router_unit_tests/test_router_helper_utils.py b/tests/router_unit_tests/test_router_helper_utils.py index 9e37863d23..59f2d1638e 100644 --- a/tests/router_unit_tests/test_router_helper_utils.py +++ b/tests/router_unit_tests/test_router_helper_utils.py @@ -879,6 +879,79 @@ async def test_set_response_headers(model_list): assert resp is None +@pytest.mark.asyncio +async def test_set_response_headers_subtracts_in_flight_delta(model_list): + """ + LIT-2719: router-derived `x-ratelimit-remaining-*` headers must be + post-decrement (match OpenAI/Anthropic vendor semantics) so the proxy's + HTTP response headers and the prometheus gauges that read them stay + comparable across providers. + + Router's TPM/RPM counter is incremented post-response by + `deployment_callback_on_success`, so `get_remaining_model_group_usage` + sees pre-decrement values. `set_response_headers` must replay the + in-flight increment before writing the headers. + """ + from pydantic import BaseModel + + class _Usage(BaseModel): + total_tokens: int = 42 + + class _Resp(BaseModel): + usage: _Usage = _Usage() + _hidden_params: dict = {} + + router = Router(model_list=model_list) + router.get_remaining_model_group_usage = AsyncMock( + return_value={ + "x-ratelimit-remaining-tokens": 1000, + "x-ratelimit-limit-tokens": 1000, + "x-ratelimit-remaining-requests": 100, + "x-ratelimit-limit-requests": 100, + } + ) + + resp = _Resp() + resp._hidden_params = {} + await router.set_response_headers(response=resp, model_group="gpt-3.5-turbo") + + headers = resp._hidden_params["additional_headers"] + assert headers["x-ratelimit-remaining-tokens"] == 958 + assert headers["x-ratelimit-remaining-requests"] == 99 + # Limit headers pass through unmodified. + assert headers["x-ratelimit-limit-tokens"] == 1000 + assert headers["x-ratelimit-limit-requests"] == 100 + + +@pytest.mark.asyncio +async def test_set_response_headers_handles_missing_usage(model_list): + """ + Streaming chunks and some response shapes may lack a `usage` attribute or + populated `total_tokens`. The in-flight subtraction must default to 0 + tokens (still subtract 1 from requests) and never raise. + """ + from pydantic import BaseModel + + class _Resp(BaseModel): + _hidden_params: dict = {} + + router = Router(model_list=model_list) + router.get_remaining_model_group_usage = AsyncMock( + return_value={ + "x-ratelimit-remaining-tokens": 1000, + "x-ratelimit-remaining-requests": 100, + } + ) + + resp = _Resp() + resp._hidden_params = {} + await router.set_response_headers(response=resp, model_group="gpt-3.5-turbo") + + headers = resp._hidden_params["additional_headers"] + assert headers["x-ratelimit-remaining-tokens"] == 1000 + assert headers["x-ratelimit-remaining-requests"] == 99 + + def test_get_all_deployments(model_list): """Test if the 'get_all_deployments' function is working correctly""" router = Router(model_list=model_list) diff --git a/tests/test_litellm/integrations/test_prometheus_remaining_tokens_router_fallback.py b/tests/test_litellm/integrations/test_prometheus_remaining_tokens_router_fallback.py new file mode 100644 index 0000000000..d754de8656 --- /dev/null +++ b/tests/test_litellm/integrations/test_prometheus_remaining_tokens_router_fallback.py @@ -0,0 +1,298 @@ +""" +LIT-2719 — `litellm_remaining_tokens_metric` and +`litellm_remaining_requests_metric` only fired for providers that return +`x-ratelimit-remaining-*` response headers (OpenAI, Azure, Anthropic). + +This guarded the gauges behind a provider-specific code path, so Bedrock and +Vertex deployments — which never populate those headers — silently produced no +data even when the proxy router had `tpm`/`rpm` configured. + +`_async_set_router_remaining_metrics` adds a provider-agnostic fallback that +asks `Router.get_remaining_model_group_usage` for the same model_group and +emits the gauges with `configured_limit - current_usage`. + +Tests cover: +- Bedrock fallback emits both gauges. +- Vertex AI fallback emits both gauges. +- Already-present headers short-circuit the router lookup entirely. +- Partial header coverage (only requests) still triggers the missing tokens + gauge. +- llm_router unavailable / model_group missing / router raises → silent no-op. +""" + +import os +import sys +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from prometheus_client import REGISTRY + +sys.path.insert(0, os.path.abspath("../../..")) + +from litellm.integrations.prometheus import PrometheusLogger +from litellm.types.integrations.prometheus import UserAPIKeyLabelValues + + +@pytest.fixture(scope="function") +def prometheus_logger(): + collectors = list(REGISTRY._collector_to_names.keys()) + for collector in collectors: + REGISTRY.unregister(collector) + return PrometheusLogger() + + +def _build_payload( + model_group: str = "bedrock-claude-group", + custom_llm_provider: str = "bedrock", + additional_headers: dict | None = None, +): + return { + "model_group": model_group, + "custom_llm_provider": custom_llm_provider, + "model": "anthropic.claude-3-sonnet-20240229-v1:0", + "model_id": "deployment-id-1", + "api_base": "https://bedrock-runtime.us-east-1.amazonaws.com", + "hidden_params": { + "additional_headers": additional_headers or {}, + }, + "metadata": { + "user_api_key_hash": "test-key", + "user_api_key_alias": None, + "user_api_key_team_id": None, + "user_api_key_team_alias": None, + }, + } + + +def _enum_values(model_group: str = "bedrock-claude-group"): + return UserAPIKeyLabelValues( + end_user=None, + hashed_api_key="test-key", + api_key_alias=None, + team=None, + team_alias=None, + requested_model=model_group, + model_group=model_group, + model_id="deployment-id-1", + api_base="https://bedrock-runtime.us-east-1.amazonaws.com", + api_provider="bedrock", + litellm_model_name="anthropic.claude-3-sonnet-20240229-v1:0", + ) + + +class TestRouterFallbackEmitsForBedrock: + @pytest.mark.asyncio + async def test_should_emit_both_gauges_for_bedrock_when_router_has_limits( + self, prometheus_logger + ): + payload = _build_payload(custom_llm_provider="bedrock") + enum_values = _enum_values() + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock( + return_value={ + "x-ratelimit-remaining-tokens": 75, + "x-ratelimit-limit-tokens": 100, + "x-ratelimit-remaining-requests": 9, + "x-ratelimit-limit-requests": 10, + } + ) + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=enum_values, + ) + + fake_router.get_remaining_model_group_usage.assert_awaited_once_with( + "bedrock-claude-group" + ) + prometheus_logger.litellm_remaining_tokens_metric.labels.assert_called_once() + prometheus_logger.litellm_remaining_tokens_metric.labels().set.assert_called_once_with( + 75 + ) + prometheus_logger.litellm_remaining_requests_metric.labels.assert_called_once() + prometheus_logger.litellm_remaining_requests_metric.labels().set.assert_called_once_with( + 9 + ) + + +class TestRouterFallbackEmitsForVertex: + @pytest.mark.asyncio + async def test_should_emit_both_gauges_for_vertex_when_router_has_limits( + self, prometheus_logger + ): + payload = _build_payload( + model_group="vertex-gemini-group", + custom_llm_provider="vertex_ai", + ) + enum_values = _enum_values(model_group="vertex-gemini-group") + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock( + return_value={ + "x-ratelimit-remaining-tokens": 12345, + "x-ratelimit-remaining-requests": 50, + } + ) + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=enum_values, + ) + + fake_router.get_remaining_model_group_usage.assert_awaited_once_with( + "vertex-gemini-group" + ) + prometheus_logger.litellm_remaining_tokens_metric.labels().set.assert_called_once_with( + 12345 + ) + prometheus_logger.litellm_remaining_requests_metric.labels().set.assert_called_once_with( + 50 + ) + + +class TestExistingHeadersShortCircuit: + @pytest.mark.asyncio + async def test_should_skip_router_lookup_when_both_headers_already_present( + self, prometheus_logger + ): + payload = _build_payload( + additional_headers={ + "x_ratelimit_remaining_tokens": 999, + "x_ratelimit_remaining_requests": 99, + } + ) + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock() + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=_enum_values(), + ) + + fake_router.get_remaining_model_group_usage.assert_not_called() + prometheus_logger.litellm_remaining_tokens_metric.labels.assert_not_called() + prometheus_logger.litellm_remaining_requests_metric.labels.assert_not_called() + + @pytest.mark.asyncio + async def test_should_only_fill_missing_dimension_when_one_header_present( + self, prometheus_logger + ): + payload = _build_payload( + additional_headers={ + "x_ratelimit_remaining_requests": 7, + } + ) + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock( + return_value={ + "x-ratelimit-remaining-tokens": 555, + "x-ratelimit-remaining-requests": 999, + } + ) + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=_enum_values(), + ) + + prometheus_logger.litellm_remaining_tokens_metric.labels().set.assert_called_once_with( + 555 + ) + prometheus_logger.litellm_remaining_requests_metric.labels.assert_not_called() + + +class TestRouterFallbackDefensivePaths: + @pytest.mark.asyncio + async def test_should_noop_when_llm_router_is_none(self, prometheus_logger): + payload = _build_payload() + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", None, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=_enum_values(), + ) + + prometheus_logger.litellm_remaining_tokens_metric.labels.assert_not_called() + prometheus_logger.litellm_remaining_requests_metric.labels.assert_not_called() + + @pytest.mark.asyncio + async def test_should_noop_when_model_group_missing(self, prometheus_logger): + payload = _build_payload() + payload["model_group"] = None + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock() + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=_enum_values(), + ) + + fake_router.get_remaining_model_group_usage.assert_not_called() + prometheus_logger.litellm_remaining_tokens_metric.labels.assert_not_called() + + @pytest.mark.asyncio + async def test_should_noop_when_router_returns_empty_dict(self, prometheus_logger): + payload = _build_payload() + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock(return_value={}) + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=_enum_values(), + ) + + prometheus_logger.litellm_remaining_tokens_metric.labels.assert_not_called() + prometheus_logger.litellm_remaining_requests_metric.labels.assert_not_called() + + @pytest.mark.asyncio + async def test_should_swallow_router_exception(self, prometheus_logger): + payload = _build_payload() + + fake_router = MagicMock() + fake_router.get_remaining_model_group_usage = AsyncMock( + side_effect=RuntimeError("router boom") + ) + + prometheus_logger.litellm_remaining_tokens_metric = MagicMock() + prometheus_logger.litellm_remaining_requests_metric = MagicMock() + + with patch("litellm.proxy.proxy_server.llm_router", fake_router, create=True): + # Must not raise. + await prometheus_logger._async_set_router_remaining_metrics( + standard_logging_payload=payload, + enum_values=_enum_values(), + ) + + prometheus_logger.litellm_remaining_tokens_metric.labels.assert_not_called()