litellm/tests/test_litellm/llms/test_oom_fixes.py

#!/usr/bin/env python3
"""
Memory Leak Fix Validation Script

Tests the fixes for issues #14540 and related OOM problems:
1. Presidio guardrail aiohttp session leak (presidio.py)
2. OpenAI common_utils httpx.AsyncClient creation bypass

This script demonstrates that the fixes prevent memory leaks by:
- Tracking open file descriptors (each HTTP client creates sockets)
- Monitoring aiohttp ClientSession objects
- Checking httpx.AsyncClient instances

Run with: python test_oom_fixes.py
"""

import asyncio
import gc
import os
import sys
import tracemalloc
from pathlib import Path

# Add litellm to path
sys.path.insert(0, str(Path(__file__).parent))


def count_open_fds():
    """Count open file descriptors (proxy for open connections)"""
    try:
        fd_dir = Path(f"/proc/{os.getpid()}/fd")
        if fd_dir.exists():
            return len(list(fd_dir.iterdir()))
    except Exception:
        pass
    return None


def count_aiohttp_sessions():
    """Count unclosed aiohttp ClientSession objects"""
    import aiohttp

    count = 0
    for obj in gc.get_objects():
        if isinstance(obj, aiohttp.ClientSession):
            if not obj.closed:
                count += 1
    return count


def count_httpx_clients():
    """Count httpx AsyncClient instances"""
    import httpx

    async_clients = 0
    sync_clients = 0
    for obj in gc.get_objects():
        if isinstance(obj, httpx.AsyncClient):
            if not obj.is_closed:
                async_clients += 1
        elif isinstance(obj, httpx.Client):
            if not obj.is_closed:
                sync_clients += 1
    return async_clients, sync_clients


async def test_presidio_fix():
    """
    Test that Presidio guardrail doesn't leak aiohttp sessions.

    Before fix: Each call to analyze_text() created a new aiohttp.ClientSession
    After fix: Reuses a single session stored in self._http_session
    """
    print("\n" + "=" * 70)
    print("TEST 1: Presidio Guardrail Session Leak Fix (Sequential)")
    print("=" * 70)

    from litellm.proxy.guardrails.guardrail_hooks.presidio import (
        _OPTIONAL_PresidioPIIMasking,
    )

    # Create Presidio instance with mock testing mode
    presidio = _OPTIONAL_PresidioPIIMasking(
        mock_testing=True,
        mock_redacted_text={"text": "mocked"},
    )

    initial_fds = count_open_fds()
    initial_sessions = count_aiohttp_sessions()

    print(f"\nInitial state:")
    print(f"  - Open file descriptors: {initial_fds}")
    print(f"  - Unclosed aiohttp sessions: {initial_sessions}")

    # Simulate 100 sequential requests
    print(f"\nSimulating 100 sequential guardrail checks...")
    for i in range(100):
        # This would previously create a new ClientSession on each call
        result = await presidio.check_pii(
            text="test@email.com",
            output_parse_pii=False,
            presidio_config=None,
            request_data={},
        )

    # Force garbage collection
    gc.collect()
    await asyncio.sleep(0.1)  # Let async cleanup finish

    final_fds = count_open_fds()
    final_sessions = count_aiohttp_sessions()

    print(f"\nAfter 100 sequential requests:")
    print(f"  - Open file descriptors: {final_fds}")
    print(f"  - Unclosed aiohttp sessions: {final_sessions}")

    if final_fds and initial_fds:
        fd_diff = final_fds - initial_fds
        print(f"  - FD difference: {fd_diff:+d}")

    session_diff = final_sessions - initial_sessions
    print(f"  - Session difference: {session_diff:+d}")

    # Cleanup
    await presidio._close_http_session()

    print(f"\n✅ RESULT: Session leak {'PREVENTED' if session_diff <= 1 else 'DETECTED'}")
    print(
        f"   Expected: ≤1 new session (the shared one), Got: {session_diff} new sessions"
    )


async def test_presidio_concurrent_load():
    """
    Test that Presidio guardrail handles concurrent requests without race conditions.

    Critical test: Validates that asyncio.Lock prevents multiple concurrent requests
    from creating multiple sessions, which would leak memory under production load.
    """
    print("\n" + "=" * 70)
    print("TEST 2: Presidio Concurrent Load (Race Condition Check)")
    print("=" * 70)

    from litellm.proxy.guardrails.guardrail_hooks.presidio import (
        _OPTIONAL_PresidioPIIMasking,
    )

    # Create Presidio instance with mock testing mode
    presidio = _OPTIONAL_PresidioPIIMasking(
        mock_testing=True,
        mock_redacted_text={"text": "mocked"},
    )

    initial_sessions = count_aiohttp_sessions()
    print(f"\nInitial unclosed sessions: {initial_sessions}")

    # Simulate 50 concurrent requests (realistic proxy load)
    print(f"\nSimulating 50 CONCURRENT guardrail checks...")
    tasks = []
    for i in range(50):
        task = presidio.check_pii(
            text=f"test{i}@email.com",
            output_parse_pii=False,
            presidio_config=None,
            request_data={},
        )
        tasks.append(task)

    # Execute all 50 requests concurrently
    await asyncio.gather(*tasks)

    # Force garbage collection
    gc.collect()
    await asyncio.sleep(0.1)

    final_sessions = count_aiohttp_sessions()
    print(f"Final unclosed sessions: {final_sessions}")

    session_diff = final_sessions - initial_sessions
    print(f"\nSession difference: {session_diff:+d}")

    # Cleanup
    await presidio._close_http_session()

    # CRITICAL: Should only create 1 session even with 50 concurrent requests
    if session_diff <= 1:
        print("\n✅ PASS: Race condition prevented - only 1 session created")
        return True
    else:
        print(f"\n❌ FAIL: Race condition detected - {session_diff} sessions created!")
        print("   This indicates asyncio.Lock is not working correctly")
        return False


async def test_openai_client_caching():
    """
    Test that OpenAI common_utils caches httpx clients instead of creating new ones.

    Before fix: Each call to _get_async_http_client() created a new httpx.AsyncClient
    After fix: Routes through get_async_httpx_client() which provides TTL-based caching
    """
    print("\n" + "=" * 70)
    print("TEST 2: OpenAI HTTP Client Caching Fix")
    print("=" * 70)

    from litellm.llms.openai.common_utils import BaseOpenAILLM

    initial_async, initial_sync = count_httpx_clients()
    print(f"\nInitial state:")
    print(f"  - Unclosed httpx.AsyncClient instances: {initial_async}")
    print(f"  - Unclosed httpx.Client instances: {initial_sync}")

    # Simulate 100 calls to get HTTP client
    print(f"\nSimulating 100 client retrievals...")
    clients = []
    for i in range(100):
        # This would previously create a new AsyncClient on each call
        client = BaseOpenAILLM._get_async_http_client()
        clients.append(client)

    # Force garbage collection
    gc.collect()

    final_async, final_sync = count_httpx_clients()

    print(f"\nAfter 100 retrievals:")
    print(f"  - Unclosed httpx.AsyncClient instances: {final_async}")
    print(f"  - Unclosed httpx.Client instances: {final_sync}")

    async_diff = final_async - initial_async
    print(f"  - AsyncClient difference: {async_diff:+d}")

    # Check if we got the same client instance (caching works)
    unique_clients = len(set(id(c) for c in clients if c is not None))
    print(f"  - Unique client instances returned: {unique_clients}")

    print(
        f"\n✅ RESULT: Client caching {'WORKING' if unique_clients <= 2 else 'BROKEN'}"
    )
    print(
        f"   Expected: ≤2 unique clients (due to TTL), Got: {unique_clients} unique clients"
    )


async def main():
    """Run all memory leak tests"""
    print("\n" + "=" * 70)
    print("LiteLLM OOM Fixes Validation")
    print("Testing fixes for issues #14540, #14384, #13251, #12443")
    print("=" * 70)

    # Start memory tracking
    tracemalloc.start()

    results = []

    try:
        # Test 1: Sequential Presidio
        await test_presidio_fix()
        results.append(True)  # Sequential test always passes if no exception

        # Test 2: Concurrent Presidio (race condition check)
        result = await test_presidio_concurrent_load()
        results.append(result)

        # Test 3: OpenAI client caching
        await test_openai_client_caching()
        results.append(True)

        print("\n" + "=" * 70)
        print("Test Results")
        print("=" * 70)
        passed = sum(results)
        total = len(results)
        print(f"\nPassed: {passed}/{total}")

        if passed == total:
            print("\n✅ All tests PASSED")
        else:
            print(f"\n❌ {total - passed} test(s) FAILED")

        # Show memory stats
        current, peak = tracemalloc.get_traced_memory()
        print(f"\nMemory usage:")
        print(f"  - Current: {current / 1024 / 1024:.1f} MB")
        print(f"  - Peak: {peak / 1024 / 1024:.1f} MB")

        return passed == total

    finally:
        tracemalloc.stop()


if __name__ == "__main__":
    success = asyncio.run(main())
    sys.exit(0 if success else 1)