Files
litellm/tests/logging_callback_tests/test_langfuse_e2e_test.py
T
2026-04-17 13:02:59 -07:00

596 lines
22 KiB
Python

import asyncio
import copy
import json
import logging
import os
import sys
import threading
from typing import Any, Optional
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
# Emit DEBUG-level log records for the whole test module.
logging.basicConfig(level=logging.DEBUG)
# Put the directory two levels above the current working directory first on
# the import path — presumably so the local litellm checkout is the one that
# gets imported below; TODO confirm.
sys.path.insert(0, os.path.abspath("../.."))
import litellm
from litellm import completion
from litellm.caching import InMemoryCache
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
# Import-time litellm configuration shared by every test in this module.
litellm.num_retries = 3
litellm.success_callback = ["langfuse"]
# NOTE(review): presumably switches on the Langfuse SDK's own debug output — confirm.
os.environ["LANGFUSE_DEBUG"] = "True"
import time
import pytest
import pytest_asyncio
def assert_langfuse_request_matches_expected(
    actual_request_body: dict,
    expected_file_name: str,
    trace_id: Optional[str] = None,
):
    """
    Compare a captured Langfuse request body with an expected JSON fixture.

    Volatile fields (event ids, timestamps, the generation's ``metadata`` and
    the SDK-level ``metadata`` entries) are overwritten with the fixture's
    values before the comparison, so only the stable part of the payload is
    actually asserted. ``actual_request_body`` is normalised in place.

    Args:
        actual_request_body (dict): The request body captured from the API
            call; must contain "batch" and "metadata".
        expected_file_name (str): Name of the JSON file, inside the
            "langfuse_expected_request_body" directory next to this file,
            holding the expected request body (e.g., "transcription.json").
        trace_id (Optional[str]): When truthy, only the trace-create event
            whose body "id" equals it and the generation-create event whose
            body "traceId" equals it are kept (first occurrence of each).
            When None, "traceId" is removed from the actual generation event
            before comparing.

    Raises:
        AssertionError: If fewer than two batch items remain, or the
            normalised body differs from the fixture.
    """
    # Get the current directory and read the expected request body
    pwd = os.path.dirname(os.path.realpath(__file__))
    expected_body_path = os.path.join(
        pwd, "langfuse_expected_request_body", expected_file_name
    )
    with open(expected_body_path, "r", encoding="utf-8") as f:
        expected_request_body = json.load(f)

    if trace_id:
        # Body field that ties each supported event type to its trace.
        trace_key_by_type = {"trace-create": "id", "generation-create": "traceId"}
        # Drop events that belong to other traces (or are of another type) and,
        # because the batch may be aggregated from several flush cycles, keep
        # only the first trace-create and the first generation-create.
        seen_types: set = set()
        deduped_batch: list = []
        for item in actual_request_body["batch"]:
            item_type = item["type"]
            trace_key = trace_key_by_type.get(item_type)
            if trace_key is None or item["body"].get(trace_key) != trace_id:
                continue
            if item_type in seen_types:
                continue
            seen_types.add(item_type)
            deduped_batch.append(item)
        actual_request_body["batch"] = deduped_batch

    # Ensure canonical order: trace-create first, generation-create second.
    # Done unconditionally because everything below addresses the batch by
    # position (index 0 = trace, index 1 = generation). The sort is stable, so
    # an already ordered batch is left as it is.
    actual_request_body["batch"].sort(
        key=lambda x: 0 if x["type"] == "trace-create" else 1
    )

    print(
        "actual_request_body after filtering", json.dumps(actual_request_body, indent=4)
    )
    assert len(actual_request_body["batch"]) >= 2, (
        f"Expected at least 2 batch items (trace-create + generation-create) "
        f"after filtering by trace_id={trace_id}, "
        f"but got {len(actual_request_body['batch'])}. "
        f"Items: {json.dumps(actual_request_body['batch'], indent=2)}"
    )

    expected_trace = expected_request_body["batch"][0]
    expected_generation = expected_request_body["batch"][1]

    # Replace dynamic values in actual request body
    for item in actual_request_body["batch"]:
        body = item["body"]
        if item["type"] == "trace-create":
            item["id"] = expected_trace["id"]
            item["timestamp"] = expected_trace["timestamp"]
            for field in ("id", "timestamp"):
                body[field] = expected_trace["body"][field]
        elif item["type"] == "generation-create":
            item["id"] = expected_generation["id"]
            item["timestamp"] = expected_generation["timestamp"]
            for field in ("id", "startTime", "endTime", "completionStartTime"):
                body[field] = expected_generation["body"][field]
            if trace_id is None:
                print("popping traceId")
                # Tolerate events that never carried a traceId.
                body.pop("traceId", None)
            else:
                body["traceId"] = trace_id
                expected_generation["body"]["traceId"] = trace_id

    # "release" on the trace body is not part of the comparison
    actual_request_body["batch"][0]["body"].pop("release", None)
    # The generation's metadata is taken from the fixture as well
    actual_request_body["batch"][1]["body"]["metadata"] = expected_generation["body"][
        "metadata"
    ]
    # Replace SDK-level metadata (version, public key, integration, batch size)
    # with the expected values
    for key in ("sdk_version", "public_key", "sdk_integration", "batch_size"):
        actual_request_body["metadata"][key] = expected_request_body["metadata"][key]

    # Assert the entire request body matches
    assert (
        actual_request_body == expected_request_body
    ), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"
class TestLangfuseLogging:
@pytest_asyncio.fixture
async def mock_setup(self):
    """
    Prepare the mocks shared by the Langfuse logging tests.

    Turns on litellm verbose mode, registers "langfuse" as the success
    callback, and returns a dict with:
      - "trace_id": a unique "litellm-test-<uuid4>" string
      - "mock_post": an AsyncMock whose return value mimics a 200 response
    """
    from litellm._uuid import uuid
    from unittest.mock import AsyncMock, patch
    import httpx

    litellm.set_verbose = True
    litellm.success_callback = ["langfuse"]

    # Stand-in for the HTTP response handed back by the patched post
    fake_response = AsyncMock(spec=httpx.Response)
    fake_response.status_code = 200
    fake_response.json.return_value = {"status": "success"}

    # Replacement for httpx.Client.post; tests install it via patch()
    post_mock = AsyncMock(return_value=fake_response)

    unique_trace_id = "litellm-test-" + str(uuid.uuid4())
    return {"trace_id": unique_trace_id, "mock_post": post_mock}
async def _verify_langfuse_call(
    self,
    mock_post,
    expected_file_name: str,
    trace_id: str,
):
    """
    Check the posts recorded by ``mock_post`` against an expected fixture.

    Waits three seconds, gathers the "batch" items of every recorded call to
    the Langfuse ingestion URL, and passes the aggregate to
    ``assert_langfuse_request_matches_expected``.

    Args:
        mock_post: The mock that replaced ``httpx.Client.post``.
        expected_file_name (str): Fixture file name holding the expected body.
        trace_id (str): Trace id used to select the relevant batch items.
    """
    # NOTE(review): presumably gives the background Langfuse flush time to run — confirm.
    await asyncio.sleep(3)

    # At least one post must have been recorded
    assert mock_post.call_count >= 1

    # The Langfuse SDK may split trace-create and generation-create across
    # separate HTTP flushes, so batch items are collected from ALL calls.
    ingestion_endpoint = "https://us.cloud.langfuse.com/api/public/ingestion"
    collected_items: list = []
    first_metadata: Optional[dict] = None
    for recorded_call in mock_post.call_args_list:
        positional, keyword = recorded_call[0], recorded_call[1]
        if positional[0] != ingestion_endpoint:
            continue
        raw_content = keyword.get("content")
        if not raw_content:
            continue
        parsed = json.loads(raw_content)
        collected_items.extend(parsed.get("batch", []))
        # The metadata of the first matching call is the one that is kept
        if first_metadata is None:
            first_metadata = parsed.get("metadata")

    assert len(collected_items) > 0, "No Langfuse ingestion calls found"
    assert first_metadata is not None, "No metadata found in Langfuse calls"

    actual_request_body = {"batch": collected_items, "metadata": first_metadata}
    print("\nMocked Request Details (aggregated from all calls):")
    print(f"Request Body: {json.dumps(actual_request_body, indent=4)}")

    assert_langfuse_request_matches_expected(
        actual_request_body,
        expected_file_name,
        trace_id,
    )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion(self, mock_setup):
    """A mocked chat completion is checked against the "completion.json" fixture."""
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    messages = [{"role": "user", "content": "Hello!"}]
    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=messages,
            mock_response="Hello! How can I assist you today?",
            metadata={"trace_id": trace_id},
        )
        await self._verify_langfuse_call(post_mock, "completion.json", trace_id)
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion_with_tags(self, mock_setup):
    """A mocked chat completion carrying "tags" metadata is checked against its fixture."""
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    request_metadata = {
        "trace_id": trace_id,
        "tags": ["test_tag", "test_tag_2"],
    }
    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response="Hello! How can I assist you today?",
            metadata=request_metadata,
        )
        await self._verify_langfuse_call(
            post_mock, "completion_with_tags.json", trace_id
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion_with_tags_stream(self, mock_setup):
    """
    A mocked chat completion with stream-suffixed tags is checked against its fixture.

    NOTE(review): despite the name, no ``stream`` argument is passed to
    ``acompletion`` here — confirm whether that is intended.
    """
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    request_metadata = {
        "trace_id": trace_id,
        "tags": ["test_tag_stream", "test_tag_2_stream"],
    }
    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response="Hello! How can I assist you today?",
            metadata=request_metadata,
        )
        await self._verify_langfuse_call(
            post_mock,
            "completion_with_tags_stream.json",
            trace_id,
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion_with_langfuse_metadata(self, mock_setup):
    """A mocked chat completion carrying Langfuse-specific metadata keys is checked against its fixture."""
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    # Generation-level and trace-level keys passed through request metadata
    langfuse_metadata = {
        "trace_id": trace_id,
        "tags": ["test_tag", "test_tag_2"],
        "generation_name": "test_generation_name",
        "parent_observation_id": "test_parent_observation_id",
        "version": "test_version",
        "trace_user_id": "test_user_id",
        "session_id": "test_session_id",
        "trace_name": "test_trace_name",
        "trace_metadata": {"test_key": "test_value"},
        "trace_version": "test_trace_version",
        "trace_release": "test_trace_release",
    }
    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response="Hello! How can I assist you today?",
            metadata=langfuse_metadata,
        )
        await self._verify_langfuse_call(
            post_mock,
            "completion_with_langfuse_metadata.json",
            trace_id,
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_with_non_serializable_metadata(self, mock_setup):
    """Metadata holding Pydantic models, sets and datetimes is logged and checked against its fixture."""
    from pydantic import BaseModel
    from typing import Set
    import datetime

    class UserPreferences(BaseModel):
        favorite_colors: Set[str]
        last_login: datetime.datetime
        settings: dict

    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]

    # Top-level Pydantic model with a set and a datetime field
    top_level_prefs = UserPreferences(
        favorite_colors={"red", "blue"},
        last_login=datetime.datetime.now(),
        settings={"theme": "dark", "notifications": True},
    )
    # A plain set and another model nested one level down
    nested_values = {
        "inner_set": {1, 2, 3},
        "inner_pydantic": UserPreferences(
            favorite_colors={"green", "yellow"},
            last_login=datetime.datetime.now(),
            settings={"theme": "light"},
        ),
    }
    test_metadata = {
        "user_prefs": top_level_prefs,
        "nested_set": nested_values,
        "trace_id": trace_id,
    }

    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response="Hello! How can I assist you today?",
            metadata=test_metadata,
        )
        await self._verify_langfuse_call(
            post_mock,
            "completion_with_complex_metadata.json",
            trace_id,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_metadata, response_json_file",
    [
        ({"a": 1, "b": 2, "c": 3}, "simple_metadata.json"),
        (
            {"a": {"nested_a": 1}, "b": {"nested_b": 2}},
            "nested_metadata.json",
        ),
        ({"a": [1, 2, 3], "b": {4, 5, 6}}, "simple_metadata2.json"),
        (
            {"a": (1, 2), "b": frozenset([3, 4]), "c": {"d": [5, 6]}},
            "simple_metadata3.json",
        ),
        ({"lock": threading.Lock()}, "metadata_with_lock.json"),
        ({"func": lambda x: x + 1}, "metadata_with_function.json"),
        (
            {
                "int": 42,
                "str": "hello",
                "list": [1, 2, 3],
                "set": {4, 5},
                "dict": {"nested": "value"},
                "non_copyable": threading.Lock(),
                "function": print,
            },
            "complex_metadata.json",
        ),
        (
            {"list": ["list", "not", "a", "dict"]},
            "complex_metadata_2.json",
        ),
        ({}, "empty_metadata.json"),
    ],
)
@pytest.mark.flaky(retries=6, delay=1)
async def test_langfuse_logging_with_various_metadata_types(
    self, mock_setup, test_metadata, response_json_file
):
    """
    Test Langfuse logging with various metadata types including non-serializable objects.

    Each parametrized case sends one metadata dict (plus the fixture's
    trace id) with a mocked completion and checks the recorded Langfuse
    request against ``response_json_file``.
    """
    setup = mock_setup
    # Parametrized values are shared objects and this test is retried, so the
    # trace id goes into a shallow copy instead of the parameter itself.
    request_metadata = test_metadata
    if test_metadata is not None:
        request_metadata = {**test_metadata, "trace_id": setup["trace_id"]}
    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", setup["mock_post"]):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response="Hello! How can I assist you today?",
            metadata=request_metadata,
        )
        await self._verify_langfuse_call(
            setup["mock_post"],
            response_json_file,
            setup["trace_id"],
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion_with_malformed_llm_response(
    self, mock_setup
):
    """A mocked response with an empty ``choices`` list is logged and checked against its fixture."""
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    litellm._turn_on_debug()

    # Response payload with usage figures but no choices at all
    token_usage = litellm.Usage(
        prompt_tokens=10,
        completion_tokens=10,
        total_tokens=20,
    )
    empty_choices_response = litellm.ModelResponse(
        choices=[],
        usage=token_usage,
        model="gpt-3.5-turbo",
        object="chat.completion",
        created=1723081200,
    ).model_dump()

    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response=empty_choices_response,
            metadata={"trace_id": trace_id},
        )
        await self._verify_langfuse_call(
            post_mock, "completion_with_no_choices.json", trace_id
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion_with_bedrock_llm_response(
    self, mock_setup
):
    """A mocked Bedrock completion, called with fake AWS credentials, is logged and checked against its fixture."""
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    litellm._turn_on_debug()

    # Response payload with usage figures but no choices at all
    token_usage = litellm.Usage(
        prompt_tokens=10,
        completion_tokens=10,
        total_tokens=20,
    )
    bedrock_style_response = litellm.ModelResponse(
        choices=[],
        usage=token_usage,
        model="anthropic.claude-haiku-4-5-20251001-v1:0",
        object="chat.completion",
        created=1723081200,
    ).model_dump()

    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response=bedrock_style_response,
            metadata={"trace_id": trace_id},
            aws_access_key_id="fake-key",
            aws_secret_access_key="fake-key",
            aws_region="us-east-1",
        )
        await self._verify_langfuse_call(
            post_mock,
            "completion_with_bedrock_call.json",
            trace_id,
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_completion_with_vertex_llm_response(
    self, mock_setup
):
    """A mocked Vertex AI completion, called with mock credentials, is logged and checked against its fixture."""
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]
    litellm._turn_on_debug()

    # Response payload with usage figures but no choices at all
    token_usage = litellm.Usage(
        prompt_tokens=10,
        completion_tokens=10,
        total_tokens=20,
    )
    vertex_style_response = litellm.ModelResponse(
        choices=[],
        usage=token_usage,
        model="vertex/gemini-2.0-flash-001",
        object="chat.completion",
        created=1723081200,
    ).model_dump()

    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        await litellm.acompletion(
            model="vertex_ai/gemini-2.0-flash-001",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response=vertex_style_response,
            metadata={"trace_id": trace_id},
            vertex_credentials="my-mock-credentials",
            api_key="my-mock-credentials-2",
        )
        await self._verify_langfuse_call(
            post_mock,
            "completion_with_vertex_call.json",
            trace_id,
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_vllm_embedding(self, mock_setup):
    """
    Check the request body sent to a (mocked) hosted vLLM embedding endpoint.

    The JSON body passed to the injected client's ``post`` must equal the
    "embedding_with_vllm.json" fixture. Per the fixture's intent this covers
    the hosted_vllm/ prefix being stripped from the model name and no
    unexpected fields (e.g. encoding_format) being included. Only the
    embedding request is asserted here, not the Langfuse ingestion payload.
    """
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]

    # Canned embedding response returned by the injected async client
    canned_embedding = {"object": "embedding", "index": 0, "embedding": [0.1, 0.2, 0.3]}
    vllm_response_data = {
        "object": "list",
        "data": [canned_embedding],
        "model": "BAAI/bge-small-en-v1.5",
        "usage": {"prompt_tokens": 10, "total_tokens": 10},
    }
    mock_async_client = AsyncHTTPHandler()
    mock_async_client.post = AsyncMock(
        return_value=httpx.Response(status_code=200, json=vllm_response_data)
    )

    with patch("httpx.Client.post", post_mock):
        await litellm.aembedding(
            model="hosted_vllm/BAAI/bge-small-en-v1.5",
            input=["Hello from litellm!"],
            api_base="http://my-fake-vllm.com/v1",
            metadata={"trace_id": trace_id},
            client=mock_async_client,
        )

        # Exactly one request must have reached the vllm client
        assert mock_async_client.post.call_count == 1
        actual_vllm_request = mock_async_client.post.call_args.kwargs["json"]

        # Load the expected body from the fixture next to this file
        fixture_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "langfuse_expected_request_body",
        )
        expected_body_path = os.path.join(fixture_dir, "embedding_with_vllm.json")
        with open(expected_body_path, "r") as f:
            expected_vllm_request = json.load(f)

        assert actual_vllm_request == expected_vllm_request, (
            f"vllm request body mismatch:\n"
            f"actual: {json.dumps(actual_vllm_request, indent=2)}\n"
            f"expected: {json.dumps(expected_vllm_request, indent=2)}"
        )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_langfuse_logging_with_router(self, mock_setup):
    """A completion issued through ``litellm.Router`` is logged and checked against its fixture."""
    litellm._turn_on_debug()
    post_mock = mock_setup["mock_post"]
    trace_id = mock_setup["trace_id"]

    # Single-deployment router whose deployment has its own mock response
    deployment = {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "mock_response": "Hello! How can I assist you today?",
            "api_key": "test_api_key",
        },
    }
    router = litellm.Router(model_list=[deployment])

    # Verification runs while the patch is active, so posts made during the
    # helper's wait are still recorded by the mock.
    with patch("httpx.Client.post", post_mock):
        # Per-request mock response: usage figures but no choices
        token_usage = litellm.Usage(
            prompt_tokens=10,
            completion_tokens=10,
            total_tokens=20,
        )
        empty_choices_response = litellm.ModelResponse(
            choices=[],
            usage=token_usage,
            model="gpt-3.5-turbo",
            object="chat.completion",
            created=1723081200,
        ).model_dump()
        await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello!"}],
            mock_response=empty_choices_response,
            metadata={"trace_id": trace_id},
        )
        await self._verify_langfuse_call(
            post_mock,
            "completion_with_router.json",
            trace_id,
        )