Files
litellm/tests/test_litellm/integrations/test_prometheus.py
T
Ishaan Jaff 66fafa3a7f [Feat] Polish - add better error validation when users configure prometheus metrics and labels to control cardinality (#12182)
* self._pretty_print_invalid_metric_error

* docs prometheus.md

* test prom validation checks

* update metric name

* fix _pretty_print_validation_errors

* fix linting

* test prometheus

* test fixes - prometheus
2025-07-01 20:17:17 -07:00

837 lines
29 KiB
Python

"""
Mock prometheus unit tests, these don't rely on LLM API calls
"""
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
from unittest.mock import patch
import pytest_asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler
# Add prometheus_client import for registry cleanup
from prometheus_client import REGISTRY
import litellm
from litellm.constants import PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES
from litellm.integrations.prometheus import PrometheusLogger, prometheus_label_factory
from litellm.types.integrations.prometheus import (
PrometheusMetricLabels,
PrometheusMetricsConfig,
UserAPIKeyLabelValues,
)
@pytest.fixture
def prometheus_logger() -> PrometheusLogger:
    """
    Provide a freshly constructed PrometheusLogger for a test.

    The global prometheus_client REGISTRY is emptied first, which prevents
    "Duplicated timeseries in CollectorRegistry" errors when the logger is
    constructed.

    Returns:
        A new PrometheusLogger instance.
    """
    # Reuse the module-level helper instead of repeating the unregister loop.
    clear_prometheus_registry()
    return PrometheusLogger()
def clear_prometheus_registry():
    """Unregister every collector currently held by the global Prometheus REGISTRY."""
    # Take a snapshot of the registered collectors before looping so the loop
    # does not iterate the registry's own mapping.
    registered = list(REGISTRY._collector_to_names)
    for registered_collector in registered:
        REGISTRY.unregister(registered_collector)
def test_initialize_budget_metrics_cron_job():
    """
    Check that initialize_budget_metrics_cron_job schedules exactly one job.

    The scheduled job must fire every
    PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES minutes and target a
    function named `initialize_remaining_budget_metrics`.
    """
    clear_prometheus_registry()

    scheduler = AsyncIOScheduler()

    # A PrometheusLogger is installed as the only litellm callback before the
    # cron job is initialized.
    registered_logger = PrometheusLogger()
    litellm.callbacks = [registered_logger]

    PrometheusLogger.initialize_budget_metrics_cron_job(scheduler)

    scheduled_jobs = scheduler.get_jobs()
    assert len(scheduled_jobs) == 1

    job = scheduled_jobs[0]
    interval_minutes = job.trigger.interval.total_seconds() / 60
    assert interval_minutes == PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES
    assert job.func.__name__ == "initialize_remaining_budget_metrics"
def test_end_user_not_tracked_for_all_prometheus_metrics():
    """
    Verify that the end_user label is suppressed on Prometheus metrics by default.

    Checks performed:
    1. With `litellm.enable_end_user_cost_tracking_prometheus_only` set to None,
       `prometheus_label_factory` yields no end_user value for every metric
       whose label list contains "end_user".
    2. The metrics are discovered by scanning PrometheusMetricLabels, so a
       metric added later with an end_user label is covered as well.
    3. With the flag set to True, the end_user value is passed through.
    """
    saved_flag = getattr(
        litellm, "enable_end_user_cost_tracking_prometheus_only", None
    )
    litellm.enable_end_user_cost_tracking_prometheus_only = None  # Default behavior

    def _declares_end_user(attr_name):
        """Return True when attr_name is a public label list containing end_user."""
        if attr_name.startswith("_") or attr_name == "get_labels":
            return False
        candidate = getattr(PrometheusMetricLabels, attr_name)
        return isinstance(candidate, list) and "end_user" in candidate

    try:
        test_end_user_id = "test_user_123"
        enum_values = UserAPIKeyLabelValues(
            end_user=test_end_user_id,
            hashed_api_key="test_key",
            api_key_alias="test_alias",
            team="test_team",
            team_alias="test_team_alias",
            user="test_user",
            requested_model="gpt-4",
            model="gpt-4",
            litellm_model_name="gpt-4",
        )

        # Collect every metric whose label definition mentions end_user.
        metrics_with_end_user = [
            attr_name
            for attr_name in PrometheusMetricLabels.__dict__
            if _declares_end_user(attr_name)
        ]

        # Sanity check: the scan must find at least one such metric.
        assert metrics_with_end_user, "No metrics with end_user found - test setup issue"

        for metric_name in metrics_with_end_user:
            supported_labels = PrometheusMetricLabels.get_labels(metric_name)

            # Before filtering, end_user is part of the supported labels.
            assert (
                "end_user" in supported_labels
            ), f"end_user should be in {metric_name} labels"

            filtered_labels = prometheus_label_factory(
                supported_enum_labels=supported_labels, enum_values=enum_values
            )
            print("filtered labels logged on prometheus=", filtered_labels)

            # After filtering, end_user must carry no value.
            assert filtered_labels.get("end_user") is None, (
                f"end_user should be None for metric {metric_name} when "
                f"enable_end_user_cost_tracking_prometheus_only is not True. "
                f"Got: {filtered_labels.get('end_user')}"
            )

        # Opt in to end_user tracking and re-check a single metric.
        litellm.enable_end_user_cost_tracking_prometheus_only = True
        test_metric = metrics_with_end_user[0]
        opted_in_labels = prometheus_label_factory(
            supported_enum_labels=PrometheusMetricLabels.get_labels(test_metric),
            enum_values=enum_values,
        )

        assert opted_in_labels.get("end_user") == test_end_user_id, (
            f"end_user should be present for metric {test_metric} when "
            f"enable_end_user_cost_tracking_prometheus_only is True"
        )
    finally:
        # Put the module-level flag back the way it was found.
        litellm.enable_end_user_cost_tracking_prometheus_only = saved_flag
def test_future_metrics_with_end_user_are_filtered():
    """
    Check that end_user is filtered for a label list no existing metric defines.

    A hand-written label list stands in for a metric that might be added later;
    with the end-user tracking flag set to None the factory must drop end_user
    while still returning the other labels.
    """
    saved_flag = getattr(
        litellm, "enable_end_user_cost_tracking_prometheus_only", None
    )
    litellm.enable_end_user_cost_tracking_prometheus_only = None

    try:
        test_end_user_id = "future_test_user"
        enum_values = UserAPIKeyLabelValues(
            end_user=test_end_user_id,
            hashed_api_key="test_key",
            api_key_alias="test_alias",
            team="test_team",
            model="gpt-4",
        )

        # Label list of the simulated metric; "new_label" represents a label
        # that might be introduced in the future.
        simulated_new_metric_labels = [
            "end_user",
            "hashed_api_key",
            "api_key_alias",
            "model",
            "team",
            "new_label",
        ]

        filtered_labels = prometheus_label_factory(
            supported_enum_labels=simulated_new_metric_labels,
            enum_values=enum_values,
        )
        print("filtered labels logged on prometheus=", filtered_labels)

        end_user_value = filtered_labels.get("end_user")
        assert (
            end_user_value is None
        ), "end_user should be filtered out for future metrics by default"

        # The remaining labels keep their values.
        assert filtered_labels.get("hashed_api_key") == "test_key"
        assert filtered_labels.get("team") == "test_team"
    finally:
        # Put the module-level flag back the way it was found.
        litellm.enable_end_user_cost_tracking_prometheus_only = saved_flag
def test_prometheus_config_parsing():
    """
    Test that prometheus metrics configuration is parsed correctly.

    A single config group listing four metrics and two labels is installed,
    and `_parse_prometheus_config` must map every listed metric to exactly
    those labels.
    """
    clear_prometheus_registry()

    configured_metrics = [
        "litellm_deployment_failure_responses",
        "litellm_deployment_total_requests",
        "litellm_proxy_failed_requests_metric",
        "litellm_proxy_total_requests_metric",
    ]
    configured_labels = [
        "requested_model",
        "team",
    ]
    # Copies go into the config so the expectations below cannot be altered by
    # anything that mutates the config in place.
    test_config = [
        {
            "group": "service_metrics",
            "metrics": list(configured_metrics),
            "include_labels": list(configured_labels),
        }
    ]

    # patch.object restores the previous module-level config on exit, so this
    # test does not leak its configuration into later tests.
    with patch.object(litellm, "prometheus_metrics_config", test_config):
        logger = PrometheusLogger()
        label_filters = logger._parse_prometheus_config()

    for metric in configured_metrics:
        assert metric in label_filters
        assert label_filters[metric] == configured_labels
def test_get_metric_labels():
    """
    Test that metric label filtering works correctly.

    With `include_labels` restricted to two labels for
    `litellm_deployment_failure_responses`, `get_labels_for_metric` must return
    those two labels and nothing else.
    """
    clear_prometheus_registry()

    allowed_labels = ["litellm_model_name", "api_provider"]
    test_config = [
        {
            "group": "service_metrics",
            "metrics": ["litellm_deployment_failure_responses"],
            "include_labels": list(allowed_labels),
        }
    ]

    # patch.object restores the previous module-level config on exit, so this
    # test does not leak its configuration into later tests.
    with patch.object(litellm, "prometheus_metrics_config", test_config):
        logger = PrometheusLogger()
        labels = logger.get_labels_for_metric("litellm_deployment_failure_responses")

    # Both configured labels are returned...
    assert "litellm_model_name" in labels
    assert "api_provider" in labels

    # ...and nothing outside the configured set survives, even if it is one of
    # the metric's default labels.
    unexpected_labels = [label for label in labels if label not in allowed_labels]
    assert not unexpected_labels, f"Unexpected labels returned: {unexpected_labels}"
def test_no_prometheus_config():
    """Check that a non-empty list of labels is returned when no prometheus config is set."""
    clear_prometheus_registry()

    # Drop whatever configuration an earlier test may have installed.
    litellm.prometheus_metrics_config = None

    default_labels = PrometheusLogger().get_labels_for_metric(
        "litellm_deployment_failure_responses"
    )

    # Without a config the metric's default labels come back: a list with at
    # least one entry.
    assert isinstance(default_labels, list)
    assert len(default_labels) > 0
def test_prometheus_metrics_config_type():
    """
    Check that PrometheusMetricsConfig accepts and stores its fields.

    Covers a fully populated config and one whose include_labels is None.
    """
    # Fully populated configuration.
    populated_fields = {
        "group": "service_metrics",
        "metrics": ["litellm_deployment_failure_responses"],
        "include_labels": ["litellm_model_name"],
    }
    valid_config = PrometheusMetricsConfig(**populated_fields)

    assert valid_config.group == "service_metrics"
    assert valid_config.metrics == ["litellm_deployment_failure_responses"]
    assert valid_config.include_labels == ["litellm_model_name"]

    # include_labels may be None.
    unlabeled_fields = {
        "group": "service_metrics",
        "metrics": ["litellm_deployment_failure_responses"],
        "include_labels": None,
    }
    config_no_labels = PrometheusMetricsConfig(**unlabeled_fields)

    assert config_no_labels.include_labels is None

    print("PrometheusMetricsConfig type validation passed!")
def test_basic_functionality():
    """
    Test basic functionality without creating multiple instances.

    Installs a one-group configuration on the litellm module and checks that
    it can be read back unchanged. No PrometheusLogger is constructed.
    """
    clear_prometheus_registry()

    test_config = [
        {
            "group": "service_metrics",
            "metrics": [
                "litellm_deployment_failure_responses",
                "litellm_deployment_total_requests",
            ],
            "include_labels": ["litellm_model_name", "api_provider"],
        }
    ]

    # patch.object restores the previous module-level config on exit, so this
    # test does not leak its configuration into later tests.
    with patch.object(litellm, "prometheus_metrics_config", test_config):
        assert litellm.prometheus_metrics_config is not None
        assert len(litellm.prometheus_metrics_config) == 1
        assert litellm.prometheus_metrics_config[0]["group"] == "service_metrics"
        assert (
            "litellm_deployment_failure_responses"
            in litellm.prometheus_metrics_config[0]["metrics"]
        )

    print("Basic prometheus configuration test passed!")
# ==============================================================================
# VALIDATION TESTS - Test the new validation logic for metrics and labels
# ==============================================================================
def test_invalid_metric_name_validation():
    """
    Test that invalid metric names are caught and raise ValueError.

    The configuration mixes one unknown metric name with one valid name;
    constructing PrometheusLogger must raise a ValueError whose message names
    the unknown metric and contains "Configuration validation failed".
    """
    clear_prometheus_registry()

    test_config = [
        {
            "group": "service_metrics",
            "metrics": [
                "invalid_metric_name_that_does_not_exist",
                "litellm_deployment_total_requests",  # valid metric
            ],
            "include_labels": ["litellm_model_name"],
        }
    ]

    # patch.object restores the previous module-level config on exit, so the
    # deliberately invalid configuration is not left behind for later tests.
    with patch.object(litellm, "prometheus_metrics_config", test_config):
        with pytest.raises(ValueError) as exc_info:
            PrometheusLogger()

    error_message = str(exc_info.value)
    assert "invalid_metric_name_that_does_not_exist" in error_message
    assert "Configuration validation failed" in error_message
def test_invalid_labels_validation():
    """
    Test that invalid labels for metrics are caught and raise ValueError.

    The configuration lists one valid and two unknown labels for a valid
    metric; constructing PrometheusLogger must raise a ValueError whose message
    names "invalid_label_name" and contains "Configuration validation failed".
    """
    clear_prometheus_registry()

    test_config = [
        {
            "group": "service_metrics",
            "metrics": ["litellm_deployment_total_requests"],
            "include_labels": [
                "litellm_model_name",  # valid label
                "invalid_label_name",  # invalid label
                "another_invalid_label",  # another invalid label
            ],
        }
    ]

    # patch.object restores the previous module-level config on exit, so the
    # deliberately invalid configuration is not left behind for later tests.
    with patch.object(litellm, "prometheus_metrics_config", test_config):
        with pytest.raises(ValueError) as exc_info:
            PrometheusLogger()

    error_message = str(exc_info.value)
    assert "invalid_label_name" in error_message
    assert "Configuration validation failed" in error_message
def test_valid_configuration_passes_validation():
    """
    Test that valid configuration passes validation without errors.

    Constructing PrometheusLogger with two valid metrics and three valid labels
    must not raise, and both metrics must appear in `logger.enabled_metrics`.
    """
    clear_prometheus_registry()

    test_config = [
        {
            "group": "service_metrics",
            "metrics": [
                "litellm_deployment_total_requests",
                "litellm_deployment_failure_responses",
            ],
            "include_labels": [
                "litellm_model_name",
                "api_provider",
                "requested_model",
            ],
        }
    ]
    # NOTE(review): this assignment is intentionally left in place after the
    # test; fixtures that run later in this module may observe it - confirm
    # before scoping or restoring it.
    litellm.prometheus_metrics_config = test_config

    # Only construction is guarded: a failing assertion below must surface as
    # an assertion failure, not as "should not raise exception".
    try:
        logger = PrometheusLogger()
    except Exception as e:
        pytest.fail(f"Valid configuration should not raise exception: {e}")

    assert logger is not None
    assert hasattr(logger, "enabled_metrics")
    assert "litellm_deployment_total_requests" in logger.enabled_metrics
    assert "litellm_deployment_failure_responses" in logger.enabled_metrics
# ==============================================================================
# END VALIDATION TESTS
# ==============================================================================
# ==============================================================================
# SEMANTIC VALIDATION TESTS - Detect logical errors in metric increments
# ==============================================================================
class MockCounter:
    """
    Recording stand-in for a counter metric.

    Keeps the keyword arguments of every `labels` call in `labels_calls` and
    the amount of every `inc` call in `inc_calls`, so tests can inspect them.
    """

    def __init__(self, name):
        # Both logs start empty; `name` is stored as given.
        self.inc_calls = []
        self.labels_calls = []
        self.name = name

    def labels(self, *args, **kwargs):
        """Record the keyword labels and return self so `.inc()` can be chained."""
        # Positional arguments are accepted but not recorded.
        self.labels_calls += [kwargs]
        return self

    def inc(self, value=1):
        """Record one increment of `value` (1 when omitted)."""
        self.inc_calls += [value]
class MockHistogram:
    """
    Recording stand-in for a histogram metric.

    Keeps the keyword arguments of every `labels` call in `labels_calls` and
    the value of every `observe` call in `observe_calls`.
    """

    def __init__(self, name):
        # Both logs start empty; `name` is stored as given.
        self.observe_calls = []
        self.labels_calls = []
        self.name = name

    def labels(self, *args, **kwargs):
        """Record the keyword labels and return self so `.observe()` can be chained."""
        # Positional arguments are accepted but not recorded.
        self.labels_calls += [kwargs]
        return self

    def observe(self, value):
        """Record one observation of `value`."""
        self.observe_calls += [value]
@pytest.fixture
def mock_prometheus_logger():
    """
    Create a PrometheusLogger with mocked metrics to test increment logic.

    The global registry is cleared, the logger is constructed with
    `litellm.proxy.proxy_server.premium_user` patched to True, and six of its
    counter attributes are replaced by MockCounter instances so tests can
    inspect the recorded `inc` calls.

    Returns:
        The PrometheusLogger with the mock counters attached.
    """
    # Reuse the module-level helper instead of repeating the unregister loop.
    clear_prometheus_registry()

    # `patch` comes from the module-level import; no local import is needed.
    with patch("litellm.proxy.proxy_server.premium_user", True):
        logger = PrometheusLogger()

    # Logger attribute -> name given to the MockCounter that replaces it.
    mocked_counters = {
        "litellm_proxy_total_requests_metric": "litellm_proxy_total_requests_metric",
        "litellm_tokens_metric": "litellm_total_tokens",
        "litellm_input_tokens_metric": "litellm_input_tokens",
        "litellm_output_tokens_metric": "litellm_output_tokens",
        "litellm_spend_metric": "litellm_spend_metric",
        "litellm_requests_metric": "litellm_requests_metric",
    }
    for attribute_name, counter_name in mocked_counters.items():
        setattr(logger, attribute_name, MockCounter(counter_name))

    return logger
@pytest.mark.asyncio
async def test_request_counter_semantic_validation(mock_prometheus_logger):
    """
    CRITICAL TEST: Validates that request counters are incremented by 1, not by token count.

    This test specifically catches the bug where litellm_proxy_total_requests_metric
    is incorrectly incremented by total_tokens instead of 1.

    Sequence checked:
    1. `async_log_success_event` leaves the request counter untouched.
    2. `async_post_call_success_hook` increments it exactly once, by 1.
    3. The token counter receives the total token count (999).
    """
    from datetime import datetime, timedelta
    from unittest.mock import MagicMock

    from litellm.proxy._types import UserAPIKeyAuth

    # Test data with large token count that should NOT affect request counter
    kwargs = {
        "model": "gpt-3.5-turbo",
        "litellm_params": {"metadata": {}},
        "start_time": datetime.now() - timedelta(seconds=1),
        "end_time": datetime.now(),
        "api_call_start_time": datetime.now() - timedelta(seconds=0.5),
        "standard_logging_object": {
            "total_tokens": 999,  # Large number - this should NOT be used for request counter
            "prompt_tokens": 600,
            "completion_tokens": 399,
            "response_cost": 0.005,
            "model_group": "gpt-3.5-turbo",
            "model_id": "test-model-id",
            "api_base": "https://api.openai.com/v1",
            "custom_llm_provider": "openai",
            "stream": False,
            "request_tags": [],
            "metadata": {
                "user_api_key_user_id": "test-user",
                "user_api_key_hash": "test-hash",
                "user_api_key_alias": "test-alias",
                "user_api_key_team_id": "test-team",
                "user_api_key_team_alias": "test-team-alias",
                "user_api_key_user_email": "test@example.com",
            },
            "hidden_params": {
                "additional_headers": {},
            },
        },
    }

    # Call the success event
    await mock_prometheus_logger.async_log_success_event(
        kwargs, None, kwargs["start_time"], kwargs["end_time"]
    )

    # CRITICAL ASSERTION: the success event must not touch the request counter
    total_requests_metric = mock_prometheus_logger.litellm_proxy_total_requests_metric
    assert (
        len(total_requests_metric.inc_calls) == 0
    ), "Request metric should not be incremented"

    # Call the post-call logging hook
    await mock_prometheus_logger.async_post_call_success_hook(
        data={},
        user_api_key_dict=UserAPIKeyAuth(
            end_user="test-user",
            hashed_api_key="test-hash",
            api_key_alias="test-alias",
            team="test-team",
            model="gpt-4",
        ),
        response=MagicMock(),
    )

    # CRITICAL ASSERTION: the post-call hook increments the request counter once
    total_requests_metric = mock_prometheus_logger.litellm_proxy_total_requests_metric
    assert (
        len(total_requests_metric.inc_calls) == 1
    ), "Request metric should be incremented exactly once"

    # Check that ALL request counter increments are by 1 (not by token count)
    for inc_value in total_requests_metric.inc_calls:
        assert inc_value == 1, (
            f"SEMANTIC BUG DETECTED: Request counter incremented by {inc_value} instead of 1. "
            f"This indicates the bug where request counters are incremented by token counts."
        )

    # Verify token counters ARE incremented by token counts (this should work correctly)
    tokens_metric = mock_prometheus_logger.litellm_tokens_metric
    assert (
        999 in tokens_metric.inc_calls
    ), "Token metric should be incremented by total_tokens (999)"
@pytest.mark.asyncio
async def test_multiple_requests_counter_semantics(mock_prometheus_logger):
    """
    Test counter totals after several success events with high token counts.

    `async_log_success_event` is called once per simulated request. Afterwards
    the request counter must have received no increments at all from this
    path, while the token counter must sum to the total number of tokens
    logged. A request-counter total equal to the token total would indicate
    that token counts are leaking into the request counter.
    """
    from datetime import datetime, timedelta

    num_requests = 3
    tokens_per_request = 500  # High token count to make the bug obvious

    for _ in range(num_requests):
        kwargs = {
            "model": "gpt-3.5-turbo",
            "litellm_params": {"metadata": {}},
            "start_time": datetime.now() - timedelta(seconds=1),
            "end_time": datetime.now(),
            "api_call_start_time": datetime.now() - timedelta(seconds=0.5),
            "standard_logging_object": {
                "total_tokens": tokens_per_request,
                "prompt_tokens": tokens_per_request // 2,
                "completion_tokens": tokens_per_request // 2,
                "response_cost": 0.001,
                "model_group": "gpt-3.5-turbo",
                "model_id": "test-model-id",
                "api_base": "https://api.openai.com/v1",
                "custom_llm_provider": "openai",
                "stream": False,
                "request_tags": [],
                "metadata": {
                    "user_api_key_user_id": "test-user",
                    "user_api_key_hash": "test-hash",
                    "user_api_key_alias": "test-alias",
                    "user_api_key_team_id": "test-team",
                    "user_api_key_team_alias": "test-team-alias",
                    "user_api_key_user_email": "test@example.com",
                },
                "hidden_params": {
                    "additional_headers": {},
                },
            },
        }
        await mock_prometheus_logger.async_log_success_event(
            kwargs, None, kwargs["start_time"], kwargs["end_time"]
        )

    # Calculate total increments
    total_request_increments = sum(
        mock_prometheus_logger.litellm_proxy_total_requests_metric.inc_calls
    )
    total_token_increments = sum(mock_prometheus_logger.litellm_tokens_metric.inc_calls)

    expected_total_tokens = num_requests * tokens_per_request  # 3 * 500 = 1500

    # CRITICAL ASSERTION: the success-event path must not increment the request
    # counter. With the token-count bug the total would be 1500.
    assert total_request_increments == 0, (
        f"SEMANTIC BUG: Request counter total increments = {total_request_increments}, "
        f"expected 0 after {num_requests} success events. This suggests request counters "
        f"are being incremented by token counts instead of request counts."
    )

    # Token counter should correctly equal total tokens
    assert (
        total_token_increments == expected_total_tokens
    ), f"Token counter should sum to {expected_total_tokens}, got {total_token_increments}"
@pytest.mark.asyncio
async def test_streaming_request_counter_semantics(mock_prometheus_logger):
    """
    Check that a streaming success event never bumps the request counter by more than 1.

    Every increment recorded on litellm_proxy_total_requests_metric after the
    event must equal 1, regardless of the 750 tokens reported.
    """
    from datetime import datetime, timedelta

    key_metadata = {
        "user_api_key_user_id": "test-user",
        "user_api_key_hash": "test-hash",
        "user_api_key_alias": "test-alias",
        "user_api_key_team_id": "test-team",
        "user_api_key_team_alias": "test-team-alias",
        "user_api_key_user_email": "test@example.com",
    }
    logging_payload = {
        "total_tokens": 750,  # deliberately far from 1
        "prompt_tokens": 300,
        "completion_tokens": 450,
        "response_cost": 0.003,
        "model_group": "gpt-3.5-turbo",
        "model_id": "test-model-id",
        "api_base": "https://api.openai.com/v1",
        "custom_llm_provider": "openai",
        "stream": True,  # marks the request as streaming
        "request_tags": [],
        "metadata": key_metadata,
        "hidden_params": {
            "additional_headers": {},
        },
    }
    kwargs = {
        "model": "gpt-3.5-turbo",
        "litellm_params": {"metadata": {}},
        "start_time": datetime.now() - timedelta(seconds=1),
        "end_time": datetime.now(),
        "api_call_start_time": datetime.now() - timedelta(seconds=0.5),
        "standard_logging_object": logging_payload,
    }

    await mock_prometheus_logger.async_log_success_event(
        kwargs, None, kwargs["start_time"], kwargs["end_time"]
    )

    # Any recorded request increment has to be 1, never the token count.
    request_counter = mock_prometheus_logger.litellm_proxy_total_requests_metric
    for inc_value in request_counter.inc_calls:
        assert (
            inc_value == 1
        ), f"SEMANTIC BUG: Streaming request counter incremented by {inc_value} instead of 1"
def test_metric_increment_invariants():
    """
    Spell out expected value ranges for different kinds of metric increments.

    Only hard-coded sample values are checked; no logger or metric object is
    exercised by this test.
    """
    # Invariant 1: request counters should never be incremented by large
    # values. The samples below look like token counts; seeing them on a
    # request counter would likely be a bug.
    token_like_increments = (100, 500, 1000, 1500)
    for increment in token_like_increments:
        assert (
            increment > 10
        ), f"Request increment of {increment} is suspiciously large - likely a semantic bug"

    # Invariant 2: token counters should never be incremented by 1 (unless it
    # is a 1-token response) - that would be the reverse bug, where a request
    # count is used for a token counter.

    # Invariant 3: cost increments should be small positive floats.
    sample_costs = (0.001, 0.01, 0.1, 1.0)
    for cost in sample_costs:
        assert cost > 0 and cost < 100, f"Cost {cost} should be in reasonable range"
def test_token_counter_semantics():
    """
    Test that token counters should be incremented by actual token values, not by 1.

    This test documents the expected behavior using hard-coded sample values;
    it does not exercise a logger or metric object.
    """
    # These are correct patterns for token counters
    correct_token_increments = [50, 100, 250, 500, 1000, 2000]

    for tokens in correct_token_increments:
        # Token counters should be incremented by actual token counts
        assert tokens > 1, f"Token increment of {tokens} is reasonable"

    # An increment of exactly 1 would be incorrect for a token counter (it
    # suggests a request count was used for tokens), unless the response
    # really contained a single token.
@pytest.mark.asyncio
async def test_spend_counter_semantics(mock_prometheus_logger):
    """
    Check that the spend counter is incremented by the response cost.

    After one success event carrying response_cost=0.0015, the spend metric
    must have recorded at least one increment and 0.0015 must be among them.
    """
    from datetime import datetime, timedelta

    key_metadata = {
        "user_api_key_user_id": "test-user",
        "user_api_key_hash": "test-hash",
        "user_api_key_alias": "test-alias",
        "user_api_key_team_id": "test-team",
        "user_api_key_team_alias": "test-team-alias",
        "user_api_key_user_email": "test@example.com",
    }
    logging_payload = {
        "total_tokens": 100,
        "prompt_tokens": 60,
        "completion_tokens": 40,
        "response_cost": 0.0015,  # the value the spend metric is expected to record
        "model_group": "gpt-3.5-turbo",
        "model_id": "test-model-id",
        "api_base": "https://api.openai.com/v1",
        "custom_llm_provider": "openai",
        "stream": False,
        "request_tags": [],
        "metadata": key_metadata,
        "hidden_params": {
            "additional_headers": {},
        },
    }
    kwargs = {
        "model": "gpt-3.5-turbo",
        "litellm_params": {"metadata": {}},
        "start_time": datetime.now() - timedelta(seconds=1),
        "end_time": datetime.now(),
        "api_call_start_time": datetime.now() - timedelta(seconds=0.5),
        "standard_logging_object": logging_payload,
    }

    await mock_prometheus_logger.async_log_success_event(
        kwargs, None, kwargs["start_time"], kwargs["end_time"]
    )

    # The cost, not 1 and not a token count, is what the spend counter receives.
    recorded_spend = mock_prometheus_logger.litellm_spend_metric.inc_calls
    assert len(recorded_spend) > 0, "Spend metric should be incremented"
    assert (
        0.0015 in recorded_spend
    ), "Spend metric should be incremented by response_cost (0.0015)"
# ==============================================================================
# END SEMANTIC VALIDATION TESTS
# ==============================================================================