Files
litellm/tests/logging_callback_tests/test_otel_logging.py
T
2026-04-17 13:02:59 -07:00

330 lines
10 KiB
Python

import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system-path
import pytest
import litellm
import asyncio
import logging
from opentelemetry import trace
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from litellm._logging import verbose_logger
from litellm.integrations.arize.arize_phoenix import ArizePhoenixLogger
from litellm.integrations._types.open_inference import (
OpenInferenceSpanKindValues,
SpanAttributes as OISpanAttributes,
)
from litellm.integrations.opentelemetry import (
LITELLM_PROXY_REQUEST_SPAN_NAME,
LITELLM_TRACER_NAME,
LITELLM_REQUEST_SPAN_NAME,
OpenTelemetry,
OpenTelemetryConfig,
RAW_REQUEST_SPAN_NAME,
Span,
)
from litellm.proxy._types import SpanAttributes
verbose_logger.setLevel(logging.DEBUG)
EXPECTED_SPAN_NAMES = ["litellm_request", "raw_gen_ai_request"]
exporter = InMemorySpanExporter()
@pytest.mark.asyncio
@pytest.mark.parametrize("streaming", [True, False])
async def test_async_otel_callback(streaming):
litellm.set_verbose = True
# Clear exporter at the start to ensure clean state
exporter.clear()
litellm.callbacks = [OpenTelemetry(config=OpenTelemetryConfig(exporter=exporter))]
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hi"}],
temperature=0.1,
user="OTEL_USER",
stream=streaming,
)
if streaming is True:
async for chunk in response:
print("chunk", chunk)
await asyncio.sleep(4)
spans = exporter.get_finished_spans()
print("spans", spans)
assert len(spans) == 2
_span_names = [span.name for span in spans]
print("recorded span names", _span_names)
assert set(_span_names) == set(EXPECTED_SPAN_NAMES)
# print the value of a span
for span in spans:
print("span name", span.name)
print("span attributes", span.attributes)
if span.name == "litellm_request":
validate_litellm_request(span)
# Additional specific checks
assert span._attributes["gen_ai.request.model"] == "gpt-3.5-turbo"
assert span._attributes["gen_ai.system"] == "openai"
assert span._attributes["gen_ai.request.temperature"] == 0.1
assert span._attributes["llm.is_streaming"] == str(streaming)
assert span._attributes["llm.user"] == "OTEL_USER"
elif span.name == "raw_gen_ai_request":
if streaming is True:
validate_raw_gen_ai_request_openai_streaming(span)
else:
validate_raw_gen_ai_request_openai_non_streaming(span)
# clear in memory exporter
exporter.clear()
def validate_litellm_request(span):
expected_attributes = [
"gen_ai.request.model",
"gen_ai.system",
"gen_ai.request.temperature",
"llm.is_streaming",
"llm.user",
"gen_ai.response.id",
"gen_ai.response.model",
"gen_ai.usage.total_tokens",
"gen_ai.usage.output_tokens",
"gen_ai.usage.input_tokens",
]
# get the str of all the span attributes
print("span attributes", span._attributes)
for attr in expected_attributes:
value = span._attributes[attr]
print("value", value)
assert value is not None, f"Attribute {attr} has None value"
def validate_raw_gen_ai_request_openai_non_streaming(span):
expected_attributes = [
"llm.openai.messages",
"llm.openai.temperature",
"llm.openai.user",
"llm.openai.extra_body",
"llm.openai.id",
"llm.openai.choices",
"llm.openai.created",
"llm.openai.model",
"llm.openai.object",
"llm.openai.service_tier",
"llm.openai.system_fingerprint",
"llm.openai.usage",
]
print("span attributes", span._attributes)
for attr in span._attributes:
print(attr)
for attr in expected_attributes:
assert span._attributes[attr] is not None, f"Attribute {attr} has None"
def validate_raw_gen_ai_request_openai_streaming(span):
expected_attributes = [
"llm.openai.messages",
"llm.openai.temperature",
"llm.openai.user",
"llm.openai.extra_body",
"llm.openai.model",
]
print("span attributes", span._attributes)
for attr in span._attributes:
print(attr)
for attr in expected_attributes:
assert span._attributes[attr] is not None, f"Attribute {attr} has None"
@pytest.mark.asyncio
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.parametrize("global_redact", [True, False])
async def test_awesome_otel_with_message_logging_off(streaming, global_redact):
"""
No content should be logged when message logging is off
tests when litellm.turn_off_message_logging is set to True
tests when OpenTelemetry(message_logging=False) is set
"""
litellm.set_verbose = True
# Clear exporter at the start to ensure clean state
exporter.clear()
litellm.callbacks = [OpenTelemetry(config=OpenTelemetryConfig(exporter=exporter))]
if global_redact is False:
otel_logger = OpenTelemetry(
message_logging=False, config=OpenTelemetryConfig(exporter="console")
)
else:
# use global redaction
litellm.turn_off_message_logging = True
otel_logger = OpenTelemetry(config=OpenTelemetryConfig(exporter="console"))
litellm.callbacks = [otel_logger]
litellm.success_callback = []
litellm.failure_callback = []
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hi"}],
mock_response="hi",
stream=streaming,
)
print("response", response)
if streaming is True:
async for chunk in response:
print("chunk", chunk)
await asyncio.sleep(1)
spans = exporter.get_finished_spans()
print("spans", spans)
assert len(spans) == 1
_span = spans[0]
print("span attributes", _span.attributes)
validate_redacted_message_span_attributes(_span)
# clear in memory exporter
exporter.clear()
if global_redact is True:
litellm.turn_off_message_logging = False
def validate_redacted_message_span_attributes(span):
# Required non-metadata attributes that must be present
required_attributes = [
"gen_ai.request.model",
"gen_ai.system",
"llm.is_streaming",
"llm.request.type",
"gen_ai.response.id",
"gen_ai.response.model",
"gen_ai.usage.total_tokens",
"gen_ai.usage.output_tokens",
"gen_ai.usage.input_tokens",
]
_all_attributes = set(
[
name.value if isinstance(name, SpanAttributes) else name
for name in span.attributes.keys()
]
)
print("all_attributes", _all_attributes)
for attr in _all_attributes:
print(f"attr: {attr}, type: {type(attr)}")
# Check that all required attributes are present
required_set = set(required_attributes)
assert required_set.issubset(
_all_attributes
), f"Missing required attributes: {required_set - _all_attributes}"
# Check that any additional attributes are metadata fields (start with "metadata.") or cost fields
non_required_attrs = _all_attributes - required_set
for attr in non_required_attrs:
assert (
attr.startswith("metadata.")
or attr.startswith("hidden_params")
or attr.startswith("gen_ai.cost.")
or attr.startswith("gen_ai.operation.")
or attr.startswith("gen_ai.request.")
), f"Non-metadata attribute found: {attr}"
pass
@pytest.mark.asyncio
async def test_arize_phoenix_creates_nested_spans_on_dedicated_provider():
"""
ArizePhoenixLogger creates its own dedicated TracerProvider so it can
coexist with the generic ``otel`` callback. In proxy mode it creates a
``litellm_proxy_request`` parent span and a ``litellm_request`` child span
on its *own* provider — completely independent of the global provider.
This test verifies:
1. Phoenix creates both parent and child spans on its dedicated exporter.
2. The spans form a proper parent-child hierarchy (same trace ID).
3. A raw_gen_ai_request sub-span is also produced.
"""
from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
phoenix_exporter = InMemorySpanExporter()
litellm.logging_callback_manager._reset_all_callbacks()
# ArizePhoenixLogger builds its own TracerProvider internally.
# We pass our in-memory exporter so we can inspect spans.
phoenix_logger = ArizePhoenixLogger(
config=OpenTelemetryConfig(exporter=phoenix_exporter),
callback_name="arize_phoenix",
)
litellm.callbacks = [phoenix_logger]
litellm.success_callback = []
litellm.failure_callback = []
# Simulate a proxy request by injecting proxy_server_request as a top-level kwarg.
# This triggers ArizePhoenixLogger._get_phoenix_context to create its own parent span.
await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "ping"}],
mock_response="pong",
proxy_server_request={
"url": "/chat/completions",
"method": "POST",
"headers": {},
},
)
# Flush async span processing
await asyncio.sleep(1)
spans = phoenix_exporter.get_finished_spans()
span_names = [s.name for s in spans]
# Phoenix creates its own span names on its dedicated TracerProvider:
# - "litellm_proxy_request" (parent) — created by _get_phoenix_context
# - "litellm_request" (child) — the LLM call span
# - "raw_gen_ai_request" — raw request sub-span
assert (
"litellm_proxy_request" in span_names
), f"Expected proxy parent span, got: {span_names}"
assert (
LITELLM_REQUEST_SPAN_NAME in span_names
), f"Expected request child span, got: {span_names}"
assert (
RAW_REQUEST_SPAN_NAME in span_names
), f"Expected raw request span, got: {span_names}"
# All spans should share the same trace ID (proper hierarchy)
trace_ids = {s.context.trace_id for s in spans}
assert len(trace_ids) == 1, f"Expected single trace, got {len(trace_ids)} traces"
phoenix_exporter.clear()