# litellm/tests/load_tests/memory_leak_utils.py

"""
Memory Leak Testing Utilities

This module provides reusable utilities, fixtures, and helpers for memory leak
and OOM (Out of Memory) detection tests. It includes:
- Mock server setup for local testing
- Memory tracking fixtures
- Router fixtures configured for testing
- Helper functions for running memory baseline tests

Usage:
    from tests.load_tests.memory_leak_utils import (
        mock_server,
        limit_memory,
        test_router,
        run_memory_baseline_test,
    )
"""

import gc
import os
import socket
import sys
import time
from threading import Thread

# Add parent directory to path to import litellm (same pattern as other tests)
filepath = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.abspath(os.path.join(filepath, "../..")))

import httpx
import pytest
import psutil
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from litellm.router import Router

# Test Configuration Constants
# API key placed in the router's deployment config; the mock server's handlers
# in this module never inspect it.
TEST_API_KEY = "sk-1234"
# Model name used for the router deployment, for every test request, and as the
# fallback "model" value in the mock server's response.
TEST_MODEL_NAME = "gpt-3.5-turbo"

# Timing Constants (seconds)
# Pause between the forced gc.collect() and the final RSS reading in the
# limit_memory fixture.
GC_STABILIZATION_DELAY = 0.05


# Mock OpenAI-compatible server
def create_mock_server():
    """Build a FastAPI app that imitates an OpenAI-style chat completions API.

    POST requests to ``/v1/chat/completions`` and ``/chat/completions`` receive
    a fixed completion payload. Any other request is logged to stdout and
    answered with a 404 JSON body.

    Returns:
        The configured FastAPI application.
    """
    mock_app = FastAPI()

    @mock_app.post("/v1/chat/completions")
    @mock_app.post("/chat/completions")
    async def chat_completions(request: Request):
        """Return a canned chat completion, echoing the requested model name."""
        body = await request.json()

        # Single assistant message with fixed content.
        assistant_choice = {
            "index": 0,
            "message": {"role": "assistant", "content": "Mock response"},
            "finish_reason": "stop",
        }
        # Token counts are constants; they do not reflect the request.
        token_usage = {
            "prompt_tokens": 10,
            "completion_tokens": 5,
            "total_tokens": 15,
        }
        payload = {
            "id": "chatcmpl-mock",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": body.get("model", TEST_MODEL_NAME),
            "choices": [assistant_choice],
            "usage": token_usage,
        }
        return JSONResponse(payload)

    # Declared after the chat-completions routes; reports which other URLs
    # clients are requesting.
    @mock_app.api_route(
        "/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"]
    )
    async def catch_all(request: Request, path: str):
        """Log the unmatched request and reply with a 404."""
        print(f"[Mock Server] Received request: {request.method} {request.url.path}")
        not_found_body = {"detail": "Not Found"}
        return JSONResponse(not_found_body, status_code=404)

    return mock_app


def run_server(app, port):
    """Serve ``app`` with uvicorn on 127.0.0.1 at the given port.

    The call blocks for as long as the server runs, so it is meant to be the
    target of a background (daemon) thread.

    Args:
        app: ASGI application to serve.
        port: TCP port to listen on.
    """
    import uvicorn

    # Keep the server quiet: only errors are logged, no per-request access log.
    server_options = {
        "host": "127.0.0.1",
        "port": port,
        "log_level": "error",
        "access_log": False,
    }
    uvicorn.run(app, **server_options)


def _can_bind_port(port):
    """Return True if a TCP socket can currently be bound to 127.0.0.1:``port``."""
    # The context manager closes the probe socket even when bind() raises, so
    # a busy port does not leak a file descriptor.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.bind(("127.0.0.1", port))
        except OSError:
            return False
    return True


@pytest.fixture(scope="session")
def mock_server():
    """Start a mock server in a separate thread for the test session.

    Picks the first bindable port among the candidates, starts the mock app in
    a daemon thread, and polls the chat-completions endpoint until it answers.

    Yields:
        The server URL (with trailing slash) for use in router configuration.

    Fails the test (via ``pytest.fail``) when no candidate port is free or the
    server does not become reachable within the polling budget.
    """
    app = create_mock_server()

    # Probe the candidate ports in order and keep the first free one.
    candidate_ports = (18888, 18889)
    port = next((p for p in candidate_ports if _can_bind_port(p)), None)
    if port is None:
        tried = ", ".join(str(p) for p in candidate_ports)
        pytest.fail(
            f"Could not find available port for mock server (tried {tried})"
        )

    # Start server in background thread
    thread = Thread(target=run_server, args=(app, port), daemon=True)
    thread.start()

    # Wait for server to start and verify it's accessible
    # Ensure api_base has trailing slash (LiteLLM appends /v1/chat/completions)
    server_url = f"http://127.0.0.1:{port}/"
    max_attempts = 20  # More attempts to ensure server is ready
    server_ready = False
    for attempt in range(max_attempts):
        time.sleep(0.3)  # Longer wait between attempts
        try:
            # Test the actual endpoint we'll use
            response = httpx.post(
                f"{server_url}v1/chat/completions",
                json={
                    "model": TEST_MODEL_NAME,
                    "messages": [{"role": "user", "content": "test"}],
                },
                timeout=2.0,
            )
            if response.status_code == 200:
                server_ready = True
                print(f"[Mock Server] Server ready at {server_url}")
                break
            # Any other status: keep polling until the attempts run out.
        except (httpx.ConnectError, httpx.TimeoutException, httpx.NetworkError) as e:
            # Server not ready yet; give up with a detailed message on the
            # final attempt, otherwise keep waiting.
            if attempt == max_attempts - 1:
                pytest.fail(
                    f"Mock server failed to start on {server_url} after {max_attempts} attempts. "
                    f"Could not connect to /v1/chat/completions endpoint. Error: {e}"
                )
        except Exception as e:
            # httpx does not raise for HTTP error statuses, so this branch only
            # sees unexpected failures that are not connect/timeout/network
            # errors. They are treated as "something is listening" and the
            # fixture proceeds.
            print(f"[Mock Server] Server responded with error (but is running): {e}")
            server_ready = True
            break

    if not server_ready:
        pytest.fail(
            f"Mock server not accessible at {server_url} after {max_attempts} attempts"
        )

    yield server_url

    # No explicit shutdown: the server runs in a daemon thread (daemon=True).


# Multipliers that convert a value in the given unit into megabytes (1024-based).
_LIMIT_UNIT_TO_MB = {
    "B": 1.0 / (1024 * 1024),
    "KB": 1.0 / 1024,
    "MB": 1.0,
    "GB": 1024.0,
}


def _parse_limit_mb(limit):
    """Convert a ``limit_leaks`` marker argument into a number of megabytes.

    Accepts a bare number (interpreted as MB) or a string such as ``"40 MB"``,
    ``"40MB"``, ``"512 KB"`` or ``"1 GB"``. Units are case-insensitive, use
    1024-based multipliers, and default to MB when omitted. Text after the
    unit is ignored.

    Raises:
        ValueError: If no leading number is found or the unit is unknown.
    """
    import re

    if isinstance(limit, (int, float)) and not isinstance(limit, bool):
        return float(limit)

    match = re.match(
        r"\s*([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)\s*([A-Za-z]*)",
        str(limit),
    )
    if match is None:
        raise ValueError(f"Invalid limit_leaks value: {limit!r}")

    number, unit = match.groups()
    # Treat "MiB"-style spellings like "MB"; a missing unit means MB.
    unit = unit.upper().replace("IB", "B") or "MB"
    if unit not in _LIMIT_UNIT_TO_MB:
        raise ValueError(
            f"Unsupported unit {unit!r} in limit_leaks value {limit!r}; "
            f"expected one of {sorted(_LIMIT_UNIT_TO_MB)}"
        )
    return float(number) * _LIMIT_UNIT_TO_MB[unit]


@pytest.fixture
def limit_memory(request):
    """Fixture to track memory usage and enforce limits via @pytest.mark.limit_leaks marker.

    Without the marker the fixture does nothing. With it, the process RSS is
    read before the test and again after the test (following a forced garbage
    collection and a short pause); the test fails if the increase is above the
    limit. The marker argument defaults to "100 MB" when omitted.

    Usage:
        @pytest.mark.limit_leaks("40 MB")
        def test_something(limit_memory):
            # Test code here
            # Memory will be measured and test will fail if increase exceeds limit
    """
    marker = request.node.get_closest_marker("limit_leaks")
    if marker is None:
        # No limit requested: behave as a no-op fixture.
        yield
        return

    bytes_per_mb = 1024 * 1024

    # Parse limit from marker (e.g., "30 MB" -> 30.0, "1 GB" -> 1024.0)
    limit_mb = _parse_limit_mb(marker.args[0] if marker.args else "100 MB")
    limit_bytes = limit_mb * bytes_per_mb

    # Measure baseline memory before the test body runs
    process = psutil.Process(os.getpid())
    baseline_memory = process.memory_info().rss

    yield

    # Force GC before measuring final memory
    gc.collect()
    # Small delay for memory to stabilize
    time.sleep(GC_STABILIZATION_DELAY)

    # Measure final memory after test
    final_memory = process.memory_info().rss
    memory_increase = final_memory - baseline_memory

    baseline_mb = baseline_memory / bytes_per_mb
    final_mb = final_memory / bytes_per_mb
    memory_increase_mb = memory_increase / bytes_per_mb

    # Print memory stats
    print("\n[Memory Limit Test] Memory usage:")
    print(f"  Baseline: {baseline_mb:.2f} MB")
    print(f"  Final: {final_mb:.2f} MB")
    print(f"  Increase: {memory_increase_mb:+.2f} MB")
    print(f"  Limit: {limit_mb:.2f} MB")

    # Fail if memory increase exceeds limit
    if memory_increase > limit_bytes:
        pytest.fail(
            f"Memory limit exceeded: {memory_increase_mb:.2f} MB increase > {limit_mb:.2f} MB limit. "
            f"Baseline: {baseline_mb:.2f} MB, Final: {final_mb:.2f} MB"
        )


@pytest.fixture
def test_router(mock_server):
    """Fixture to create a fresh router instance for each test.

    The router has a single deployment pointing at the mock server URL, so no
    external API calls are made. Cooldowns are disabled (and ``allowed_fails``
    is set high) to prevent the deployment from being marked unavailable.

    After the test, ``router.discard()`` is called; a failure there is
    reported on stdout but never fails the test.

    Usage:
        async def test_something(test_router, limit_memory):
            # Use test_router for making requests
            response = await test_router.acompletion(...)
    """
    router = Router(
        model_list=[
            {
                "model_name": TEST_MODEL_NAME,
                "litellm_params": {
                    "model": f"openai/{TEST_MODEL_NAME}",
                    "api_base": mock_server,
                    "api_key": TEST_API_KEY,
                },
            },
        ],
        disable_cooldowns=True,  # Disable cooldowns for testing
        allowed_fails=1000,  # Allow many failures before cooldown (effectively disabled)
    )
    yield router
    # Cleanup after test
    try:
        router.discard()
    except Exception as cleanup_error:
        # Best-effort cleanup: do not fail the test, but make the problem
        # visible, since a router that was not discarded can skew later
        # memory measurements.
        print(
            f"[Test Router] Warning: router cleanup failed: "
            f"{type(cleanup_error).__name__}: {cleanup_error}"
        )


async def run_memory_baseline_test(num_requests: int, router: Router, limit_memory):
    """Helper function to run memory baseline test with specified number of requests.

    Requests are issued concurrently in batches of 20 via ``router.acompletion``.
    Individual failures (raised exceptions, including cancellations, or ``None``
    results) are logged to stdout and never fail the test. After each batch
    the batch's objects are released and a garbage collection is forced.

    Args:
        num_requests: Number of requests to make.
        router: Router instance to use for requests.
        limit_memory: Pytest fixture for memory tracking (reference to suppress linter warning).

    Example:
        @pytest.mark.asyncio
        @pytest.mark.limit_leaks("40 MB")
        async def test_memory(test_router, limit_memory):
            await run_memory_baseline_test(1000, test_router, limit_memory)
    """
    import asyncio

    # Fixture is used automatically by pytest - reference it to suppress linter warning
    _ = limit_memory

    # Make requests concurrently in batches for speed
    # Batch size of 20 provides good balance between speed and memory pressure
    BATCH_SIZE = 20

    for batch_start in range(0, num_requests, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, num_requests)
        # Create concurrent tasks for this batch
        tasks = [
            router.acompletion(
                model=TEST_MODEL_NAME,
                messages=[{"role": "user", "content": f"Test request {i}"}],
            )
            for i in range(batch_start, batch_end)
        ]
        # Execute batch concurrently
        # Note: return_exceptions=True allows test to continue even if some requests fail
        responses = await asyncio.gather(*tasks, return_exceptions=True)

        # Tally failures but continue with the test
        valid_count = 0
        failed_count = 0
        for offset, response in enumerate(responses):
            # BaseException (not just Exception) so that results such as
            # asyncio.CancelledError are counted as failures too.
            if isinstance(response, BaseException):
                failed_count += 1
                # Log exception but continue
                print(
                    f"  Warning: Request {batch_start + offset} failed: {type(response).__name__}: {response}"
                )
            elif response is None:
                failed_count += 1
                print(f"  Warning: Request {batch_start + offset} returned None")
            else:
                valid_count += 1

        # Don't fail the test on request failures.
        # If all failed, that's logged but test continues (might indicate bigger issue)
        if failed_count > 0:
            print(
                f"  Note: {failed_count}/{len(responses)} requests failed in batch {batch_start}-{batch_end}, continuing with {valid_count} valid responses"
            )

        # Clean up batch: drop every local reference to this batch's objects,
        # including the loop variable that still points at the last response.
        response = None
        del responses
        del tasks
        # GC after each batch to prevent accumulation
        gc.collect()

    print(f"[Simple Memory Test] Completed {num_requests} requests")