# litellm/tests/logging_callback_tests/test_langfuse_e2e_test.py

import asyncio
import copy
import json
import logging
import os
import sys
import threading
from typing import Any, Optional
from unittest.mock import AsyncMock, MagicMock, patch

import httpx

# Configure the root logger to emit DEBUG-level records while this module runs.
logging.basicConfig(level=logging.DEBUG)
# Prepend the directory two levels above the current working directory to the
# import search path.
# NOTE(review): presumably so the in-repo litellm package is the one imported
# below — confirm against how the test suite is invoked.
sys.path.insert(0, os.path.abspath("../.."))

import litellm
from litellm import completion
from litellm.caching import InMemoryCache
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler

# Module-wide litellm configuration, applied once at import time.
litellm.num_retries = 3
# Register "langfuse" as the only success callback.
litellm.success_callback = ["langfuse"]
# NOTE(review): presumably switches on the Langfuse SDK's own debug output —
# confirm against the SDK documentation.
os.environ["LANGFUSE_DEBUG"] = "True"
import time

import pytest
import pytest_asyncio


def assert_langfuse_request_matches_expected(
    actual_request_body: dict,
    expected_file_name: str,
    trace_id: Optional[str] = None,
):
    """
    Assert that a captured Langfuse ingestion body matches an expected JSON fixture.

    The captured batch is reduced to the first event of each type (optionally
    restricted to ``trace_id``), ordered with ``trace-create`` first, and its
    dynamic values (event/body ids, timestamps, generation metadata, SDK
    metadata) are overwritten with the fixture's values before the two bodies
    are compared for equality.

    Note:
        ``actual_request_body`` is normalized **in place**; after the call its
        ``batch`` and ``metadata`` hold the filtered/overwritten values.

    Args:
        actual_request_body (dict): The actual request body received from the
            API call. Must contain ``"batch"`` (list of events) and
            ``"metadata"`` (dict).
        expected_file_name (str): Name of the JSON file containing the expected
            request body (e.g., "transcription.json"), resolved inside the
            ``langfuse_expected_request_body`` directory next to this file.
            The fixture's ``batch[0]`` is used as the expected ``trace-create``
            and ``batch[1]`` as the expected ``generation-create``.
        trace_id (Optional[str]): When truthy, only ``trace-create`` events
            whose body ``id`` equals it and ``generation-create`` events whose
            body ``traceId`` equals it are kept, and the fixture's generation
            ``traceId`` is set to it. When ``None``, ``traceId`` is dropped
            from actual generation events instead.

    Raises:
        AssertionError: If the normalized batch does not start with a
            ``trace-create`` followed by a ``generation-create``, or if the
            normalized body differs from the fixture.
    """
    # Locate and load the fixture that lives next to this test module.
    pwd = os.path.dirname(os.path.realpath(__file__))
    expected_body_path = os.path.join(
        pwd, "langfuse_expected_request_body", expected_file_name
    )

    # JSON fixtures are UTF-8; do not depend on the platform's locale encoding.
    with open(expected_body_path, "r", encoding="utf-8") as f:
        expected_request_body = json.load(f)

    batch = actual_request_body["batch"]

    # Filter out events that don't belong to the requested trace.
    if trace_id:
        batch = [
            item
            for item in batch
            if (item["type"] == "trace-create" and item["body"].get("id") == trace_id)
            or (
                item["type"] == "generation-create"
                and item["body"].get("traceId") == trace_id
            )
        ]

    # The caller may aggregate several flush cycles (and retried uploads), so
    # keep only the first event seen for each event type.
    first_by_type: dict = {}
    for item in batch:
        first_by_type.setdefault(item["type"], item)

    # Canonical order: trace-create first, everything else after it. ``sorted``
    # is stable, so the remaining events keep their first-seen order.
    batch = sorted(
        first_by_type.values(), key=lambda item: item["type"] != "trace-create"
    )
    actual_request_body["batch"] = batch

    print(
        "actual_request_body after filtering", json.dumps(actual_request_body, indent=4)
    )

    # The normalization below addresses batch[0] as the trace and batch[1] as
    # the generation, so verify the event types rather than only the count.
    batch_types = [item["type"] for item in batch]
    assert batch_types[:2] == ["trace-create", "generation-create"], (
        f"Expected a trace-create followed by a generation-create "
        f"after filtering by trace_id={trace_id}, "
        f"but got {len(batch)} item(s) with types {batch_types}. "
        f"Items: {json.dumps(batch, indent=2)}"
    )

    expected_trace = expected_request_body["batch"][0]
    expected_generation = expected_request_body["batch"][1]

    # Replace dynamic values (ids, timestamps) in the actual request body.
    for item in batch:
        body = item["body"]
        if item["type"] == "trace-create":
            item["id"] = expected_trace["id"]
            body["id"] = expected_trace["body"]["id"]
            item["timestamp"] = expected_trace["timestamp"]
            body["timestamp"] = expected_trace["body"]["timestamp"]
        elif item["type"] == "generation-create":
            item["id"] = expected_generation["id"]
            body["id"] = expected_generation["body"]["id"]
            item["timestamp"] = expected_generation["timestamp"]
            for key in ("startTime", "endTime", "completionStartTime"):
                body[key] = expected_generation["body"][key]
            if trace_id is None:
                print("popping traceId")
                # A generation without a traceId should reach the comparison
                # below instead of failing here with a KeyError.
                body.pop("traceId", None)
            else:
                body["traceId"] = trace_id
                expected_generation["body"]["traceId"] = trace_id

    # "release" on the trace is not compared against the fixture.
    batch[0]["body"].pop("release", None)

    # SDK/request metadata is overwritten with the fixture's values.
    actual_metadata = actual_request_body["metadata"]
    for key in ("sdk_version", "public_key", "sdk_integration", "batch_size"):
        actual_metadata[key] = expected_request_body["metadata"][key]

    # Generation metadata is overwritten with the fixture's value as well.
    batch[1]["body"]["metadata"] = expected_generation["body"]["metadata"]

    # Assert the entire request body matches
    assert (
        actual_request_body == expected_request_body
    ), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"


class TestLangfuseLogging:
    """
    End-to-end checks of the request bodies litellm's Langfuse success callback
    sends to the Langfuse ingestion endpoint.

    Each test patches ``httpx.Client.post`` with a mock, performs a mocked
    litellm call tagged with a unique ``trace_id``, and compares the captured
    ingestion body with a JSON fixture via
    ``assert_langfuse_request_matches_expected``.
    """

    @pytest_asyncio.fixture
    async def mock_setup(self):
        """
        Common setup for Langfuse logging tests.

        Returns:
            dict: ``{"trace_id": <unique id for this test>, "mock_post": <mock
            to patch over httpx.Client.post>}``.
        """
        from litellm._uuid import uuid

        # httpx.Client.post is a synchronous method, so it is stubbed with a
        # MagicMock: an AsyncMock would hand callers an un-awaited coroutine
        # instead of a response object.
        mock_response = MagicMock(spec=httpx.Response)
        mock_response.status_code = 200
        mock_response.json.return_value = {"status": "success"}

        # Create mock for httpx.Client.post
        mock_post = MagicMock(return_value=mock_response)

        litellm.set_verbose = True
        litellm.success_callback = ["langfuse"]

        return {"trace_id": f"litellm-test-{str(uuid.uuid4())}", "mock_post": mock_post}

    async def _verify_langfuse_call(
        self,
        mock_post,
        expected_file_name: str,
        trace_id: str,
    ):
        """
        Helper method to verify Langfuse API calls.

        Waits for the background upload, aggregates the batches of every
        captured call to the Langfuse ingestion URL, and compares the result
        with the given fixture.

        Args:
            mock_post: The mock that was patched over ``httpx.Client.post``.
            expected_file_name (str): Fixture file name inside
                ``langfuse_expected_request_body``.
            trace_id (str): Trace id used to select this test's events.
        """
        # Give the logging callback time to flush before inspecting the mock.
        await asyncio.sleep(3)

        # Verify at least one call was made
        assert mock_post.call_count >= 1

        # Aggregate batch items from ALL calls — the Langfuse SDK may split
        # trace-create and generation-create across separate HTTP flushes.
        langfuse_url = "https://us.cloud.langfuse.com/api/public/ingestion"
        all_batch_items: list = []
        metadata: Optional[dict] = None
        for call in mock_post.call_args_list:
            args, kwargs = call[0], call[1]
            # Accept the URL positionally or as a keyword, so an unrelated
            # call shape cannot fail this loop with an IndexError.
            url = args[0] if args else kwargs.get("url")
            if url != langfuse_url:
                continue
            request_body = kwargs.get("content")
            if not request_body:
                continue
            body = json.loads(request_body)
            all_batch_items.extend(body.get("batch", []))
            # Request-level metadata is taken from the first matching call.
            if metadata is None:
                metadata = body.get("metadata")

        assert len(all_batch_items) > 0, "No Langfuse ingestion calls found"
        assert metadata is not None, "No metadata found in Langfuse calls"

        actual_request_body = {
            "batch": all_batch_items,
            "metadata": metadata,
        }

        print("\nMocked Request Details (aggregated from all calls):")
        print(f"Request Body: {json.dumps(actual_request_body, indent=4)}")

        assert_langfuse_request_matches_expected(
            actual_request_body,
            expected_file_name,
            trace_id,
        )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion(self, mock_setup):
        """Test Langfuse logging for chat completion"""
        setup = mock_setup
        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response="Hello! How can I assist you today?",
                metadata={"trace_id": setup["trace_id"]},
            )
            await self._verify_langfuse_call(
                setup["mock_post"], "completion.json", setup["trace_id"]
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion_with_tags(self, mock_setup):
        """Test Langfuse logging for chat completion with tags"""
        setup = mock_setup
        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response="Hello! How can I assist you today?",
                metadata={
                    "trace_id": setup["trace_id"],
                    "tags": ["test_tag", "test_tag_2"],
                },
            )
            await self._verify_langfuse_call(
                setup["mock_post"], "completion_with_tags.json", setup["trace_id"]
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion_with_tags_stream(self, mock_setup):
        """
        Test Langfuse logging for chat completion with the "*_stream" tags.

        NOTE(review): despite the test and fixture names, ``stream=True`` is not
        passed to ``acompletion`` here — confirm whether streaming was intended.
        """
        setup = mock_setup
        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response="Hello! How can I assist you today?",
                metadata={
                    "trace_id": setup["trace_id"],
                    "tags": ["test_tag_stream", "test_tag_2_stream"],
                },
            )
            await self._verify_langfuse_call(
                setup["mock_post"],
                "completion_with_tags_stream.json",
                setup["trace_id"],
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion_with_langfuse_metadata(self, mock_setup):
        """Test Langfuse logging for chat completion with metadata for langfuse"""
        setup = mock_setup
        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response="Hello! How can I assist you today?",
                metadata={
                    "trace_id": setup["trace_id"],
                    "tags": ["test_tag", "test_tag_2"],
                    "generation_name": "test_generation_name",
                    "parent_observation_id": "test_parent_observation_id",
                    "version": "test_version",
                    "trace_user_id": "test_user_id",
                    "session_id": "test_session_id",
                    "trace_name": "test_trace_name",
                    "trace_metadata": {"test_key": "test_value"},
                    "trace_version": "test_trace_version",
                    "trace_release": "test_trace_release",
                },
            )
            await self._verify_langfuse_call(
                setup["mock_post"],
                "completion_with_langfuse_metadata.json",
                setup["trace_id"],
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_with_non_serializable_metadata(self, mock_setup):
        """Test Langfuse logging with metadata that requires preparation (Pydantic models, sets, etc)"""
        from pydantic import BaseModel
        from typing import Set
        import datetime

        class UserPreferences(BaseModel):
            favorite_colors: Set[str]
            last_login: datetime.datetime
            settings: dict

        setup = mock_setup

        test_metadata = {
            "user_prefs": UserPreferences(
                favorite_colors={"red", "blue"},
                last_login=datetime.datetime.now(),
                settings={"theme": "dark", "notifications": True},
            ),
            "nested_set": {
                "inner_set": {1, 2, 3},
                "inner_pydantic": UserPreferences(
                    favorite_colors={"green", "yellow"},
                    last_login=datetime.datetime.now(),
                    settings={"theme": "light"},
                ),
            },
            "trace_id": setup["trace_id"],
        }

        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response="Hello! How can I assist you today?",
                metadata=test_metadata,
            )

            await self._verify_langfuse_call(
                setup["mock_post"],
                "completion_with_complex_metadata.json",
                setup["trace_id"],
            )

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_metadata, response_json_file",
        [
            ({"a": 1, "b": 2, "c": 3}, "simple_metadata.json"),
            (
                {"a": {"nested_a": 1}, "b": {"nested_b": 2}},
                "nested_metadata.json",
            ),
            ({"a": [1, 2, 3], "b": {4, 5, 6}}, "simple_metadata2.json"),
            (
                {"a": (1, 2), "b": frozenset([3, 4]), "c": {"d": [5, 6]}},
                "simple_metadata3.json",
            ),
            ({"lock": threading.Lock()}, "metadata_with_lock.json"),
            ({"func": lambda x: x + 1}, "metadata_with_function.json"),
            (
                {
                    "int": 42,
                    "str": "hello",
                    "list": [1, 2, 3],
                    "set": {4, 5},
                    "dict": {"nested": "value"},
                    "non_copyable": threading.Lock(),
                    "function": print,
                },
                "complex_metadata.json",
            ),
            (
                {"list": ["list", "not", "a", "dict"]},
                "complex_metadata_2.json",
            ),
            ({}, "empty_metadata.json"),
        ],
    )
    @pytest.mark.flaky(retries=6, delay=1)
    async def test_langfuse_logging_with_various_metadata_types(
        self, mock_setup, test_metadata, response_json_file
    ):
        """Test Langfuse logging with various metadata types including non-serializable objects"""
        setup = mock_setup

        # The parametrized dicts are built once when the decorator is
        # evaluated, so add the trace_id to a shallow copy instead of mutating
        # the shared object. (A shallow copy also works for the entries that
        # hold non-copyable values such as locks.)
        if test_metadata is not None:
            test_metadata = {**test_metadata, "trace_id": setup["trace_id"]}

        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response="Hello! How can I assist you today?",
                metadata=test_metadata,
            )

            await self._verify_langfuse_call(
                setup["mock_post"],
                response_json_file,
                setup["trace_id"],
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion_with_malformed_llm_response(
        self, mock_setup
    ):
        """Test Langfuse logging for chat completion with malformed LLM response (no choices)"""
        setup = mock_setup
        litellm._turn_on_debug()
        with patch("httpx.Client.post", setup["mock_post"]):
            mock_response = litellm.ModelResponse(
                choices=[],
                usage=litellm.Usage(
                    prompt_tokens=10,
                    completion_tokens=10,
                    total_tokens=20,
                ),
                model="gpt-3.5-turbo",
                object="chat.completion",
                created=1723081200,
            ).model_dump()
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response=mock_response,
                metadata={"trace_id": setup["trace_id"]},
            )
            await self._verify_langfuse_call(
                setup["mock_post"], "completion_with_no_choices.json", setup["trace_id"]
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion_with_bedrock_llm_response(
        self, mock_setup
    ):
        """Test Langfuse logging for a Bedrock chat completion whose mocked response has no choices"""
        setup = mock_setup
        litellm._turn_on_debug()
        with patch("httpx.Client.post", setup["mock_post"]):
            mock_response = litellm.ModelResponse(
                choices=[],
                usage=litellm.Usage(
                    prompt_tokens=10,
                    completion_tokens=10,
                    total_tokens=20,
                ),
                model="anthropic.claude-haiku-4-5-20251001-v1:0",
                object="chat.completion",
                created=1723081200,
            ).model_dump()
            await litellm.acompletion(
                model="bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response=mock_response,
                metadata={"trace_id": setup["trace_id"]},
                aws_access_key_id="fake-key",
                aws_secret_access_key="fake-key",
                aws_region="us-east-1",
            )
            await self._verify_langfuse_call(
                setup["mock_post"],
                "completion_with_bedrock_call.json",
                setup["trace_id"],
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_completion_with_vertex_llm_response(
        self, mock_setup
    ):
        """Test Langfuse logging for a Vertex AI chat completion whose mocked response has no choices"""
        setup = mock_setup
        litellm._turn_on_debug()
        with patch("httpx.Client.post", setup["mock_post"]):
            mock_response = litellm.ModelResponse(
                choices=[],
                usage=litellm.Usage(
                    prompt_tokens=10,
                    completion_tokens=10,
                    total_tokens=20,
                ),
                model="vertex/gemini-2.0-flash-001",
                object="chat.completion",
                created=1723081200,
            ).model_dump()
            await litellm.acompletion(
                model="vertex_ai/gemini-2.0-flash-001",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response=mock_response,
                metadata={"trace_id": setup["trace_id"]},
                vertex_credentials="my-mock-credentials",
                api_key="my-mock-credentials-2",
            )
            await self._verify_langfuse_call(
                setup["mock_post"],
                "completion_with_vertex_call.json",
                setup["trace_id"],
            )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_vllm_embedding(self, mock_setup):
        """
        Test that the request sent to the vllm embedding endpoint is correct.

        Verifies the request body matches the expected JSON fixture,
        including that the hosted_vllm/ prefix is stripped from the model name
        and that no unexpected fields (e.g. encoding_format) are included.
        """
        setup = mock_setup

        vllm_response_data = {
            "object": "list",
            "data": [{"object": "embedding", "index": 0, "embedding": [0.1, 0.2, 0.3]}],
            "model": "BAAI/bge-small-en-v1.5",
            "usage": {"prompt_tokens": 10, "total_tokens": 10},
        }
        mock_vllm_response = httpx.Response(
            status_code=200,
            json=vllm_response_data,
        )

        # The async handler's post IS awaited by litellm, hence AsyncMock here.
        mock_async_client = AsyncHTTPHandler()
        mock_async_client.post = AsyncMock(return_value=mock_vllm_response)

        with patch("httpx.Client.post", setup["mock_post"]):
            await litellm.aembedding(
                model="hosted_vllm/BAAI/bge-small-en-v1.5",
                input=["Hello from litellm!"],
                api_base="http://my-fake-vllm.com/v1",
                metadata={"trace_id": setup["trace_id"]},
                client=mock_async_client,
            )

        # Verify the request sent to vllm matches the expected JSON fixture
        assert mock_async_client.post.call_count == 1
        actual_vllm_request = mock_async_client.post.call_args.kwargs["json"]

        pwd = os.path.dirname(os.path.realpath(__file__))
        expected_body_path = os.path.join(
            pwd, "langfuse_expected_request_body", "embedding_with_vllm.json"
        )
        with open(expected_body_path, "r") as f:
            expected_vllm_request = json.load(f)

        assert actual_vllm_request == expected_vllm_request, (
            f"vllm request body mismatch:\n"
            f"actual:   {json.dumps(actual_vllm_request, indent=2)}\n"
            f"expected: {json.dumps(expected_vllm_request, indent=2)}"
        )

    @pytest.mark.asyncio
    @pytest.mark.flaky(retries=3, delay=1)
    async def test_langfuse_logging_with_router(self, mock_setup):
        """Test Langfuse logging for a completion made through litellm.Router"""
        litellm._turn_on_debug()
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt-3.5-turbo",
                    "litellm_params": {
                        "model": "gpt-3.5-turbo",
                        "mock_response": "Hello! How can I assist you today?",
                        "api_key": "test_api_key",
                    },
                }
            ]
        )
        with patch("httpx.Client.post", mock_setup["mock_post"]):
            mock_response = litellm.ModelResponse(
                choices=[],
                usage=litellm.Usage(
                    prompt_tokens=10,
                    completion_tokens=10,
                    total_tokens=20,
                ),
                model="gpt-3.5-turbo",
                object="chat.completion",
                created=1723081200,
            ).model_dump()
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello!"}],
                mock_response=mock_response,
                metadata={"trace_id": mock_setup["trace_id"]},
            )
            await self._verify_langfuse_call(
                mock_setup["mock_post"],
                "completion_with_router.json",
                mock_setup["trace_id"],
            )