Files
litellm/tests/logging_callback_tests/test_otel_logging.py
T
mubashir1osmani 3c595f6fd2 fix(test): update Phoenix OTEL test for dedicated TracerProvider architecture
The old test assumed ArizePhoenixLogger reused the global TracerProvider.
With the nested traces fix, Phoenix now creates its own dedicated provider
and produces litellm_proxy_request + litellm_request + raw_gen_ai_request
spans independently.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-26 06:40:24 -05:00

319 lines
10 KiB
Python

import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system-path
import pytest
import litellm
import asyncio
import logging
from opentelemetry import trace
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from litellm._logging import verbose_logger
from litellm.integrations.arize.arize_phoenix import ArizePhoenixLogger
from litellm.integrations._types.open_inference import (
OpenInferenceSpanKindValues,
SpanAttributes as OISpanAttributes,
)
from litellm.integrations.opentelemetry import (
LITELLM_PROXY_REQUEST_SPAN_NAME,
LITELLM_TRACER_NAME,
LITELLM_REQUEST_SPAN_NAME,
OpenTelemetry,
OpenTelemetryConfig,
RAW_REQUEST_SPAN_NAME,
Span,
)
from litellm.proxy._types import SpanAttributes
verbose_logger.setLevel(logging.DEBUG)
EXPECTED_SPAN_NAMES = ["litellm_request", "raw_gen_ai_request"]
exporter = InMemorySpanExporter()
@pytest.mark.asyncio
@pytest.mark.parametrize("streaming", [True, False])
async def test_async_otel_callback(streaming):
litellm.set_verbose = True
# Clear exporter at the start to ensure clean state
exporter.clear()
litellm.callbacks = [OpenTelemetry(config=OpenTelemetryConfig(exporter=exporter))]
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hi"}],
temperature=0.1,
user="OTEL_USER",
stream=streaming,
)
if streaming is True:
async for chunk in response:
print("chunk", chunk)
await asyncio.sleep(4)
spans = exporter.get_finished_spans()
print("spans", spans)
assert len(spans) == 2
_span_names = [span.name for span in spans]
print("recorded span names", _span_names)
assert set(_span_names) == set(EXPECTED_SPAN_NAMES)
# print the value of a span
for span in spans:
print("span name", span.name)
print("span attributes", span.attributes)
if span.name == "litellm_request":
validate_litellm_request(span)
# Additional specific checks
assert span._attributes["gen_ai.request.model"] == "gpt-3.5-turbo"
assert span._attributes["gen_ai.system"] == "openai"
assert span._attributes["gen_ai.request.temperature"] == 0.1
assert span._attributes["llm.is_streaming"] == str(streaming)
assert span._attributes["llm.user"] == "OTEL_USER"
elif span.name == "raw_gen_ai_request":
if streaming is True:
validate_raw_gen_ai_request_openai_streaming(span)
else:
validate_raw_gen_ai_request_openai_non_streaming(span)
# clear in memory exporter
exporter.clear()
def validate_litellm_request(span):
expected_attributes = [
"gen_ai.request.model",
"gen_ai.system",
"gen_ai.request.temperature",
"llm.is_streaming",
"llm.user",
"gen_ai.response.id",
"gen_ai.response.model",
"gen_ai.usage.total_tokens",
"gen_ai.usage.output_tokens",
"gen_ai.usage.input_tokens",
]
# get the str of all the span attributes
print("span attributes", span._attributes)
for attr in expected_attributes:
value = span._attributes[attr]
print("value", value)
assert value is not None, f"Attribute {attr} has None value"
def validate_raw_gen_ai_request_openai_non_streaming(span):
expected_attributes = [
"llm.openai.messages",
"llm.openai.temperature",
"llm.openai.user",
"llm.openai.extra_body",
"llm.openai.id",
"llm.openai.choices",
"llm.openai.created",
"llm.openai.model",
"llm.openai.object",
"llm.openai.service_tier",
"llm.openai.system_fingerprint",
"llm.openai.usage",
]
print("span attributes", span._attributes)
for attr in span._attributes:
print(attr)
for attr in expected_attributes:
assert span._attributes[attr] is not None, f"Attribute {attr} has None"
def validate_raw_gen_ai_request_openai_streaming(span):
expected_attributes = [
"llm.openai.messages",
"llm.openai.temperature",
"llm.openai.user",
"llm.openai.extra_body",
"llm.openai.model",
]
print("span attributes", span._attributes)
for attr in span._attributes:
print(attr)
for attr in expected_attributes:
assert span._attributes[attr] is not None, f"Attribute {attr} has None"
@pytest.mark.asyncio
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.parametrize("global_redact", [True, False])
async def test_awesome_otel_with_message_logging_off(streaming, global_redact):
"""
No content should be logged when message logging is off
tests when litellm.turn_off_message_logging is set to True
tests when OpenTelemetry(message_logging=False) is set
"""
litellm.set_verbose = True
# Clear exporter at the start to ensure clean state
exporter.clear()
litellm.callbacks = [OpenTelemetry(config=OpenTelemetryConfig(exporter=exporter))]
if global_redact is False:
otel_logger = OpenTelemetry(
message_logging=False, config=OpenTelemetryConfig(exporter="console")
)
else:
# use global redaction
litellm.turn_off_message_logging = True
otel_logger = OpenTelemetry(config=OpenTelemetryConfig(exporter="console"))
litellm.callbacks = [otel_logger]
litellm.success_callback = []
litellm.failure_callback = []
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hi"}],
mock_response="hi",
stream=streaming,
)
print("response", response)
if streaming is True:
async for chunk in response:
print("chunk", chunk)
await asyncio.sleep(1)
spans = exporter.get_finished_spans()
print("spans", spans)
assert len(spans) == 1
_span = spans[0]
print("span attributes", _span.attributes)
validate_redacted_message_span_attributes(_span)
# clear in memory exporter
exporter.clear()
if global_redact is True:
litellm.turn_off_message_logging = False
def validate_redacted_message_span_attributes(span):
# Required non-metadata attributes that must be present
required_attributes = [
"gen_ai.request.model",
"gen_ai.system",
"llm.is_streaming",
"llm.request.type",
"gen_ai.response.id",
"gen_ai.response.model",
"gen_ai.usage.total_tokens",
"gen_ai.usage.output_tokens",
"gen_ai.usage.input_tokens",
]
_all_attributes = set(
[
name.value if isinstance(name, SpanAttributes) else name
for name in span.attributes.keys()
]
)
print("all_attributes", _all_attributes)
for attr in _all_attributes:
print(f"attr: {attr}, type: {type(attr)}")
# Check that all required attributes are present
required_set = set(required_attributes)
assert required_set.issubset(
_all_attributes
), f"Missing required attributes: {required_set - _all_attributes}"
# Check that any additional attributes are metadata fields (start with "metadata.") or cost fields
non_required_attrs = _all_attributes - required_set
for attr in non_required_attrs:
assert (
attr.startswith("metadata.")
or attr.startswith("hidden_params")
or attr.startswith("gen_ai.cost.")
or attr.startswith("gen_ai.operation.")
or attr.startswith("gen_ai.request.")
), f"Non-metadata attribute found: {attr}"
pass
@pytest.mark.asyncio
async def test_arize_phoenix_creates_nested_spans_on_dedicated_provider():
"""
ArizePhoenixLogger creates its own dedicated TracerProvider so it can
coexist with the generic ``otel`` callback. In proxy mode it creates a
``litellm_proxy_request`` parent span and a ``litellm_request`` child span
on its *own* provider — completely independent of the global provider.
This test verifies:
1. Phoenix creates both parent and child spans on its dedicated exporter.
2. The spans form a proper parent-child hierarchy (same trace ID).
3. A raw_gen_ai_request sub-span is also produced.
"""
from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
phoenix_exporter = InMemorySpanExporter()
litellm.logging_callback_manager._reset_all_callbacks()
# ArizePhoenixLogger builds its own TracerProvider internally.
# We pass our in-memory exporter so we can inspect spans.
phoenix_logger = ArizePhoenixLogger(
config=OpenTelemetryConfig(exporter=phoenix_exporter),
callback_name="arize_phoenix",
)
litellm.callbacks = [phoenix_logger]
litellm.success_callback = []
litellm.failure_callback = []
# Simulate a proxy request by injecting proxy_server_request as a top-level kwarg.
# This triggers ArizePhoenixLogger._get_phoenix_context to create its own parent span.
await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "ping"}],
mock_response="pong",
proxy_server_request={"url": "/chat/completions", "method": "POST", "headers": {}},
)
# Flush async span processing
await asyncio.sleep(1)
spans = phoenix_exporter.get_finished_spans()
span_names = [s.name for s in spans]
# Phoenix creates its own span names on its dedicated TracerProvider:
# - "litellm_proxy_request" (parent) — created by _get_phoenix_context
# - "litellm_request" (child) — the LLM call span
# - "raw_gen_ai_request" — raw request sub-span
assert "litellm_proxy_request" in span_names, f"Expected proxy parent span, got: {span_names}"
assert LITELLM_REQUEST_SPAN_NAME in span_names, f"Expected request child span, got: {span_names}"
assert RAW_REQUEST_SPAN_NAME in span_names, f"Expected raw request span, got: {span_names}"
# All spans should share the same trace ID (proper hierarchy)
trace_ids = {s.context.trace_id for s in spans}
assert len(trace_ids) == 1, f"Expected single trace, got {len(trace_ids)} traces"
phoenix_exporter.clear()