litellm/tests/local_testing/test_opik.py

import io
import os
import sys

sys.path.insert(0, os.path.abspath("../.."))

import asyncio
import logging

import pytest

import litellm
from litellm._logging import verbose_logger
from unittest.mock import AsyncMock, Mock

verbose_logger.setLevel(logging.DEBUG)

litellm.set_verbose = True
import time

@pytest.mark.asyncio
async def test_opik_logging_http_request():
    """
    - Test that HTTP requests are made to Opik
    - Traces and spans are batched correctly
    """
    try:
        from litellm.integrations.opik.opik import OpikLogger

        os.environ["OPIK_URL_OVERRIDE"] = "https://fake.comet.com/opik/api"
        os.environ["OPIK_API_KEY"] = "anything"
        os.environ["OPIK_WORKSPACE"] = "anything"

        # Initialize OpikLogger
        test_opik_logger = OpikLogger()

        litellm.callbacks = [test_opik_logger]
        test_opik_logger.batch_size = 12
        litellm.set_verbose = True

        # Create a mock for the async_client's post method
        mock_post = AsyncMock()
        mock_post.return_value.status_code = 202
        mock_post.return_value.text = "Accepted"
        test_opik_logger.async_httpx_client.post = mock_post

        # Make multiple calls to ensure we don't hit the batch size
        for _ in range(5):
            response = await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Test message"}],
                max_tokens=10,
                temperature=0.2,
                mock_response="This is a mock response",
            )
        await asyncio.sleep(1)

        # Check batching of events and that the queue contains 5 trace events and 5 span events
        assert mock_post.called == False, "HTTP request was made but events should have been batched"
        assert len(test_opik_logger.log_queue) == 10

        # Now make calls to exceed the batch size
        for _ in range(3):
            response = await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Test message"}],
                max_tokens=10,
                temperature=0.2,
                mock_response="This is a mock response",
            )

        # Wait a short time for any asynchronous operations to complete
        await asyncio.sleep(1)

        # Check that the queue was flushed after exceeding batch size
        assert len(test_opik_logger.log_queue) < test_opik_logger.batch_size

        # Check that the data has been sent when it goes above the flush interval
        await asyncio.sleep(test_opik_logger.flush_interval)
        assert len(test_opik_logger.log_queue) == 0

        # Clean up
        for cb in litellm.callbacks:
            if isinstance(cb, OpikLogger):
                await cb.async_httpx_client.client.aclose()

    except Exception as e:
        pytest.fail(f"Error occurred: {e}")

def test_sync_opik_logging_http_request():
    """
    - Test that HTTP requests are made to Opik
    - Traces and spans are batched correctly
    """
    try:
        from litellm.integrations.opik.opik import OpikLogger

        os.environ["OPIK_URL_OVERRIDE"] = "https://fake.comet.com/opik/api"
        os.environ["OPIK_API_KEY"] = "anything"
        os.environ["OPIK_WORKSPACE"] = "anything"

        # Initialize OpikLogger
        test_opik_logger = OpikLogger()

        litellm.callbacks = [test_opik_logger]
        litellm.set_verbose = True

        # Create a mock for the clients's post method
        mock_post = Mock()
        mock_post.return_value.status_code = 204
        mock_post.return_value.text = "Accepted"
        test_opik_logger.sync_httpx_client.post = mock_post

        # Make multiple calls to ensure we don't hit the batch size
        for _ in range(5):
            response = litellm.completion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Test message"}],
                max_tokens=10,
                temperature=0.2,
                mock_response="This is a mock response",
            )

        # Need to wait for a short amount of time as the log_success callback is called in a different thread. One or two seconds is often not enough.
        time.sleep(3)

        # Check that 5 spans and 5 traces were sent
        assert mock_post.call_count == 10, f"Expected 10 HTTP requests, but got {mock_post.call_count}"

    except Exception as e:
        pytest.fail(f"Error occurred: {e}")

@pytest.mark.asyncio
@pytest.mark.skip(reason="local-only test, to test if everything works fine.")
async def test_opik_logging():
    try:
        from litellm.integrations.opik.opik import OpikLogger

        # Initialize OpikLogger
        test_opik_logger = OpikLogger()
        litellm.callbacks = [test_opik_logger]
        litellm.set_verbose = True

        # Log a chat completion call
        response = await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "What LLM are you ?"}],
            max_tokens=10,
            temperature=0.2,
            metadata={"opik": {"custom_field": "custom_value"}}
        )
        print("Non-streaming response:", response)

        # Log a streaming completion call
        stream_response = await litellm.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Stream = True - What llm are you ?"}],
            max_tokens=10,
            temperature=0.2,
            stream=True,
            metadata={"opik": {"custom_field": "custom_value"}}
        )
        print("Streaming response:")
        async for chunk in stream_response:
            print(chunk.choices[0].delta.content, end='', flush=True)
        print()  # New line after streaming response

        await asyncio.sleep(2)

        assert len(test_opik_logger.log_queue) == 4

        await asyncio.sleep(test_opik_logger.flush_interval + 1)
        assert len(test_opik_logger.log_queue) == 0
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_opik_attach_to_existing_trace():
    """
    Test attaching spans to existing trace (regression fix for PR #14888)

    - When trace_id is provided via current_span_data, only create a span
    - Do NOT create a new trace (this was the bug)
    - Verify span has correct trace_id and parent_span_id
    """
    try:
        from litellm.integrations.opik.opik import OpikLogger

        os.environ["OPIK_URL_OVERRIDE"] = "https://fake.comet.com/opik/api"
        os.environ["OPIK_API_KEY"] = "anything"
        os.environ["OPIK_WORKSPACE"] = "anything"

        # Initialize OpikLogger
        test_opik_logger = OpikLogger()
        litellm.callbacks = [test_opik_logger]
        litellm.set_verbose = True

        # Create a mock for the sync client's post method
        mock_post = Mock()
        mock_post.return_value.status_code = 204
        mock_post.return_value.text = "Accepted"
        test_opik_logger.sync_httpx_client.post = mock_post

        # Simulate an existing trace and parent span
        existing_trace_id = "existing-trace-12345"
        existing_parent_span_id = "existing-span-67890"

        # Make a completion call with existing trace_id
        response = litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Test message"}],
            max_tokens=10,
            temperature=0.2,
            mock_response="This is a mock response",
            metadata={
                "opik": {
                    "current_span_data": {
                        "trace_id": existing_trace_id,
                        "id": existing_parent_span_id
                    },
                    "tags": ["test-attach-span"]
                }
            }
        )

        # Need to wait for a short amount of time as the log_success callback is called in a different thread
        time.sleep(3)

        # Check the calls made to the mock
        calls_made = mock_post.call_args_list
        trace_calls = [call for call in calls_made if "/traces/batch" in str(call)]
        span_calls = [call for call in calls_made if "/spans/batch" in str(call)]

        # With the fix, when trace_id is provided, we should NOT create a new trace
        assert len(trace_calls) == 0, f"Expected 0 trace calls when attaching to existing trace, but got {len(trace_calls)}"
        assert len(span_calls) == 1, f"Expected exactly 1 span call, but got {len(span_calls)}"

        # Verify span has correct trace_id and parent_span_id
        span_payload = span_calls[0][1]['json']['spans'][0]
        assert span_payload['trace_id'] == existing_trace_id, f"Expected trace_id to be {existing_trace_id}, but got {span_payload['trace_id']}"
        assert span_payload['parent_span_id'] == existing_parent_span_id, f"Expected parent_span_id to be {existing_parent_span_id}, but got {span_payload['parent_span_id']}"
        assert "test-attach-span" in span_payload['tags'], f"Expected 'test-attach-span' tag in {span_payload['tags']}"

    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_opik_create_new_trace():
    """
    Test normal trace creation when no trace_id is provided

    - When NO trace_id is provided, create both a new trace and a new span
    - Verify the span references the created trace
    - Verify tags are included in both trace and span
    """
    try:
        from litellm.integrations.opik.opik import OpikLogger

        os.environ["OPIK_URL_OVERRIDE"] = "https://fake.comet.com/opik/api"
        os.environ["OPIK_API_KEY"] = "anything"
        os.environ["OPIK_WORKSPACE"] = "anything"

        # Initialize OpikLogger
        test_opik_logger = OpikLogger()
        litellm.callbacks = [test_opik_logger]
        litellm.set_verbose = True

        # Create a mock for the sync client's post method
        mock_post = Mock()
        mock_post.return_value.status_code = 204
        mock_post.return_value.text = "Accepted"
        test_opik_logger.sync_httpx_client.post = mock_post

        # Make a completion call WITHOUT providing trace_id
        response = litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Test message"}],
            max_tokens=10,
            temperature=0.2,
            mock_response="This is a mock response",
            metadata={
                "opik": {
                    "tags": ["test-new-trace"]
                }
            }
        )

        # Need to wait for a short amount of time as the log_success callback is called in a different thread
        time.sleep(3)

        # Check the calls made to the mock
        calls_made = mock_post.call_args_list
        trace_calls = [call for call in calls_made if "/traces/batch" in str(call)]
        span_calls = [call for call in calls_made if "/spans/batch" in str(call)]

        # Without trace_id provided, we should create both a new trace and a new span
        assert len(trace_calls) == 1, f"Expected exactly 1 trace call, but got {len(trace_calls)}"
        assert len(span_calls) == 1, f"Expected exactly 1 span call, but got {len(span_calls)}"

        # Verify the span references the created trace
        trace_payload = trace_calls[0][1]['json']['traces'][0]
        span_payload = span_calls[0][1]['json']['spans'][0]
        assert span_payload['trace_id'] == trace_payload['id'], "Span should reference the created trace"

        # Verify tags are included in both trace and span
        assert "test-new-trace" in trace_payload['tags'], f"Expected 'test-new-trace' tag in trace tags"
        assert "test-new-trace" in span_payload['tags'], f"Expected 'test-new-trace' tag in span tags"

    except Exception as e:
        pytest.fail(f"Error occurred: {e}")