From c402db905725308948d3b4b8bfeb097e73f5c8a5 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 4 Apr 2025 17:07:43 -0700 Subject: [PATCH 1/4] prometheus emit llm provider on failure metric --- litellm/integrations/prometheus.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 5ac8c80eb3..205e1f0c6b 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -818,7 +818,7 @@ class PrometheusLogger(CustomLogger): requested_model=request_data.get("model", ""), status_code=str(getattr(original_exception, "status_code", None)), exception_status=str(getattr(original_exception, "status_code", None)), - exception_class=str(original_exception.__class__.__name__), + exception_class=self._get_exception_class_name(original_exception), tags=_tags, ) _labels = prometheus_label_factory( @@ -917,7 +917,7 @@ class PrometheusLogger(CustomLogger): api_base=api_base, api_provider=llm_provider, exception_status=str(getattr(exception, "status_code", None)), - exception_class=exception.__class__.__name__, + exception_class=self._get_exception_class_name(exception), requested_model=model_group, hashed_api_key=standard_logging_payload["metadata"][ "user_api_key_hash" @@ -1146,6 +1146,20 @@ class PrometheusLogger(CustomLogger): ) return + @staticmethod + def _get_exception_class_name(exception: Exception) -> str: + exception_class_name = getattr(exception, "llm_provider") or "" + + # pretty print the provider name on prometheus + # eg. `openai` -> `Openai.` + if len(exception_class_name) >= 1: + exception_class_name = ( + exception_class_name[0].upper() + exception_class_name[1:] + "." + ) + + exception_class_name += exception.__class__.__name__ + return exception_class_name + async def log_success_fallback_event( self, original_model_group: str, kwargs: dict, original_exception: Exception ): @@ -1181,7 +1195,7 @@ class PrometheusLogger(CustomLogger): team=standard_metadata["user_api_key_team_id"], team_alias=standard_metadata["user_api_key_team_alias"], exception_status=str(getattr(original_exception, "status_code", None)), - exception_class=str(original_exception.__class__.__name__), + exception_class=self._get_exception_class_name(original_exception), tags=_tags, ) _labels = prometheus_label_factory( @@ -1225,7 +1239,7 @@ class PrometheusLogger(CustomLogger): team=standard_metadata["user_api_key_team_id"], team_alias=standard_metadata["user_api_key_team_alias"], exception_status=str(getattr(original_exception, "status_code", None)), - exception_class=str(original_exception.__class__.__name__), + exception_class=self._get_exception_class_name(original_exception), tags=_tags, ) From f402e9bbd1942a182f962da9f09e1138bb343e6f Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 4 Apr 2025 21:23:21 -0700 Subject: [PATCH 2/4] _get_exception_class_name --- litellm/integrations/prometheus.py | 4 +++- tests/otel_tests/test_prometheus.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 205e1f0c6b..cfa7c368ce 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -1148,7 +1148,9 @@ class PrometheusLogger(CustomLogger): @staticmethod def _get_exception_class_name(exception: Exception) -> str: - exception_class_name = getattr(exception, "llm_provider") or "" + exception_class_name = "" + if hasattr(exception, "llm_provider"): + exception_class_name = getattr(exception, "llm_provider") or "" # pretty print the provider name on prometheus # eg. `openai` -> `Openai.` diff --git a/tests/otel_tests/test_prometheus.py b/tests/otel_tests/test_prometheus.py index 932ae0bbe7..9cae5c565f 100644 --- a/tests/otel_tests/test_prometheus.py +++ b/tests/otel_tests/test_prometheus.py @@ -107,7 +107,7 @@ async def test_proxy_failure_metrics(): print("/metrics", metrics) # Check if the failure metric is present and correct - expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0' + expected_metric = 'litellm_proxy_failed_requests_metric_total{api_key_alias="None",end_user="None",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None",user="default_user_id"} 1.0' assert ( expected_metric in metrics @@ -121,7 +121,7 @@ async def test_proxy_failure_metrics(): ) assert ( - 'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}' + 'litellm_deployment_failure_responses_total{api_base="https://exampleopenaiendpoint-production.up.railway.app",api_key_alias="None",api_provider="openai",exception_class="Openai.RateLimitError",exception_status="429",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",litellm_model_name="429",model_id="7499d31f98cd518cf54486d5a00deda6894239ce16d13543398dc8abf870b15f",requested_model="fake-azure-endpoint",team="None",team_alias="None"}' in metrics ) @@ -229,13 +229,13 @@ async def test_proxy_fallback_metrics(): # Check if successful fallback metric is incremented assert ( - 'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0' + 'litellm_deployment_successful_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="fake-openai-endpoint",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0' in metrics ) # Check if failed fallback metric is incremented assert ( - 'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0' + 'litellm_deployment_failed_fallbacks_total{api_key_alias="None",exception_class="Openai.RateLimitError",exception_status="429",fallback_model="unknown-model",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",requested_model="fake-azure-endpoint",team="None",team_alias="None"} 1.0' in metrics ) From df4593d58bf5f3047061a3ce7ece2fb89900f3fa Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 4 Apr 2025 21:30:05 -0700 Subject: [PATCH 3/4] test prom unit tests --- tests/logging_callback_tests/test_prometheus_unit_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/logging_callback_tests/test_prometheus_unit_tests.py b/tests/logging_callback_tests/test_prometheus_unit_tests.py index 6bc5b42c45..c24bb27691 100644 --- a/tests/logging_callback_tests/test_prometheus_unit_tests.py +++ b/tests/logging_callback_tests/test_prometheus_unit_tests.py @@ -713,7 +713,7 @@ async def test_async_post_call_failure_hook(prometheus_logger): team_alias="test_team_alias", user="test_user", exception_status="429", - exception_class="RateLimitError", + exception_class="Openai.RateLimitError", ) prometheus_logger.litellm_proxy_failed_requests_metric.labels().inc.assert_called_once() @@ -948,7 +948,7 @@ async def test_log_success_fallback_event(prometheus_logger): team="test_team", team_alias="test_team_alias", exception_status="429", - exception_class="RateLimitError", + exception_class="Openai.RateLimitError", ) prometheus_logger.litellm_deployment_successful_fallbacks.labels().inc.assert_called_once() @@ -985,7 +985,7 @@ async def test_log_failure_fallback_event(prometheus_logger): team="test_team", team_alias="test_team_alias", exception_status="429", - exception_class="RateLimitError", + exception_class="Openai.RateLimitError", ) prometheus_logger.litellm_deployment_failed_fallbacks.labels().inc.assert_called_once() From b7cd4cef07b789a1bf59c1a922aae775f5d6614c Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 4 Apr 2025 21:32:55 -0700 Subject: [PATCH 4/4] test_get_exception_class_name --- .../test_prometheus_unit_tests.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/logging_callback_tests/test_prometheus_unit_tests.py b/tests/logging_callback_tests/test_prometheus_unit_tests.py index c24bb27691..ddfce710d7 100644 --- a/tests/logging_callback_tests/test_prometheus_unit_tests.py +++ b/tests/logging_callback_tests/test_prometheus_unit_tests.py @@ -1500,3 +1500,33 @@ def test_set_team_budget_metrics_with_custom_labels(prometheus_logger, monkeypat "metadata_organization": None, "metadata_environment": None, } + + +def test_get_exception_class_name(prometheus_logger): + """ + Test that _get_exception_class_name correctly formats the exception class name + """ + # Test case 1: Exception with llm_provider + rate_limit_error = litellm.RateLimitError( + message="Rate limit exceeded", + llm_provider="openai", + model="gpt-3.5-turbo" + ) + assert prometheus_logger._get_exception_class_name(rate_limit_error) == "Openai.RateLimitError" + + # Test case 2: Exception with empty llm_provider + auth_error = litellm.AuthenticationError( + message="Invalid API key", + llm_provider="", + model="gpt-4" + ) + assert prometheus_logger._get_exception_class_name(auth_error) == "AuthenticationError" + + # Test case 3: Exception with None llm_provider + context_window_error = litellm.ContextWindowExceededError( + message="Context length exceeded", + llm_provider=None, + model="gpt-4" + ) + assert prometheus_logger._get_exception_class_name(context_window_error) == "ContextWindowExceededError" +