#!/usr/bin/env python3 """ Benchmark script comparing LiteLLM proxy vs direct provider endpoint. Makes parallel calls to each endpoint and compares statistics including latency, throughput, and success rates. USAGE EXAMPLES: 1. Basic Usage (Sequential, Recommended): # Set required environment variables export LITELLM_PROXY_URL='http://localhost:4000/chat/completions' export PROVIDER_URL='https://api.openai.com/v1/chat/completions' export LITELLM_PROXY_API_KEY='sk-1234' export PROVIDER_API_KEY='sk-openai-key' # Run from scripts directory cd scripts python benchmark_proxy_vs_provider.py 2. Multiple Runs for Statistical Accuracy: python benchmark_proxy_vs_provider.py --runs 5 # Averages results across 5 runs for more reliable metrics 3. Realistic Load Testing with Concurrency Limit: python benchmark_proxy_vs_provider.py --max-concurrent 100 --requests 2000 # Limits to 100 concurrent requests (prevents overwhelming the server) 4. Quick Test with Fewer Requests: python benchmark_proxy_vs_provider.py --requests 100 # Faster test with 100 requests instead of default 1000 5. Parallel Execution (Not Recommended): python benchmark_proxy_vs_provider.py --parallel # Runs both benchmarks simultaneously (may affect accuracy) 6. Custom Timeout: python benchmark_proxy_vs_provider.py --timeout 120 # Sets request timeout to 120 seconds 7. Combined Options: python benchmark_proxy_vs_provider.py --runs 3 --requests 500 --max-concurrent 50 # 3 runs, 500 requests each, max 50 concurrent REQUIRED ENVIRONMENT VARIABLES: - LITELLM_PROXY_URL: Full URL to LiteLLM proxy chat completions endpoint - PROVIDER_URL: Full URL to direct provider chat completions endpoint OPTIONAL ENVIRONMENT VARIABLES: - LITELLM_PROXY_API_KEY: API key for LiteLLM proxy (if auth required) - PROVIDER_API_KEY: API key for direct provider (if auth required) OUTPUT: The script provides detailed statistics including: - Success/error rates - Latency metrics (mean, median, p95, p99) - Throughput (requests per second) - Comparison between proxy and provider performance - Run-to-run variance (when using --runs > 1) """ import asyncio import aiohttp import time import json import argparse import os from typing import List, Dict, Any, Optional from dataclasses import dataclass, field from statistics import mean, median, stdev import sys from aiohttp import TCPConnector @dataclass class RequestStats: """Statistics for a single request""" success: bool latency: float error: str = "" status_code: int = 0 @dataclass class BenchmarkResults: """Aggregated benchmark results""" total_requests: int = 0 successful_requests: int = 0 failed_requests: int = 0 latencies: List[float] = field(default_factory=list) errors: List[str] = field(default_factory=list) status_codes: Dict[int, int] = field(default_factory=dict) total_time: float = 0.0 def calculate_stats(self) -> Dict[str, Any]: """Calculate statistics from the results""" if not self.latencies: return { "total_requests": self.total_requests, "successful_requests": self.successful_requests, "failed_requests": self.failed_requests, "success_rate": 0.0, "error_rate": 1.0, "total_time": self.total_time, "requests_per_second": 0.0, "status_codes": self.status_codes, "unique_errors": len(set(self.errors)) if self.errors else 0, } return { "total_requests": self.total_requests, "successful_requests": self.successful_requests, "failed_requests": self.failed_requests, "success_rate": (self.successful_requests / self.total_requests) * 100, "error_rate": (self.failed_requests / self.total_requests) * 100, "total_time": self.total_time, "requests_per_second": self.total_requests / self.total_time if self.total_time > 0 else 0, "latency_stats": { "mean": mean(self.latencies), "median": median(self.latencies), "min": min(self.latencies), "max": max(self.latencies), "std_dev": stdev(self.latencies) if len(self.latencies) > 1 else 0.0, "p50": median(self.latencies), "p95": self._percentile(self.latencies, 95), "p99": self._percentile(self.latencies, 99), }, "status_codes": self.status_codes, "unique_errors": len(set(self.errors)) if self.errors else 0, } @staticmethod def _percentile(data: List[float], percentile: int) -> float: """Calculate percentile""" sorted_data = sorted(data) index = int(len(sorted_data) * (percentile / 100)) if index >= len(sorted_data): index = len(sorted_data) - 1 return sorted_data[index] async def make_request( session: aiohttp.ClientSession, url: str, headers: Dict[str, str], payload: Dict[str, Any], timeout: aiohttp.ClientTimeout, ) -> RequestStats: """Make a single async request and return stats""" # Use time.perf_counter() for higher precision timing start_time = time.perf_counter() try: async with session.post(url, json=payload, headers=headers, timeout=timeout) as response: # Read response body to ensure complete transfer response_body = await response.read() latency = time.perf_counter() - start_time status_code = response.status if response.status == 200: # Validate response is valid JSON try: json.loads(response_body) except json.JSONDecodeError: return RequestStats( success=False, latency=latency, error="Invalid JSON response", status_code=status_code, ) return RequestStats( success=True, latency=latency, status_code=status_code, ) else: error_text = response_body.decode('utf-8', errors='ignore')[:100] return RequestStats( success=False, latency=latency, error=f"HTTP {status_code}: {error_text}", status_code=status_code, ) except asyncio.TimeoutError: latency = time.perf_counter() - start_time return RequestStats( success=False, latency=latency, error="Timeout", status_code=0, ) except Exception as e: latency = time.perf_counter() - start_time return RequestStats( success=False, latency=latency, error=str(e)[:100], status_code=0, ) async def warmup_endpoint( url: str, headers: Dict[str, str], payload: Dict[str, Any], num_warmup: int = 5, timeout_seconds: int = 60, ) -> None: """Perform warm-up requests to avoid cold start penalties""" timeout = aiohttp.ClientTimeout(total=timeout_seconds) connector = TCPConnector( limit=100, # Max connections limit_per_host=50, # Max connections per host ttl_dns_cache=300, # DNS cache TTL force_close=False, # Reuse connections ) async with aiohttp.ClientSession(connector=connector) as session: tasks = [ make_request(session, url, headers, payload, timeout) for _ in range(num_warmup) ] await asyncio.gather(*tasks, return_exceptions=True) # Brief pause after warmup to let connections stabilize await asyncio.sleep(0.5) async def make_request_with_semaphore( session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, url: str, headers: Dict[str, str], payload: Dict[str, Any], timeout: aiohttp.ClientTimeout, ) -> RequestStats: """Make a request with semaphore-based concurrency control""" async with semaphore: return await make_request(session, url, headers, payload, timeout) async def benchmark_endpoint( url: str, headers: Dict[str, str], payload: Dict[str, Any], num_requests: int = 1000, timeout_seconds: int = 60, warmup: bool = True, max_concurrent: Optional[int] = None, ) -> BenchmarkResults: """Benchmark an endpoint with parallel requests Args: url: Endpoint URL to benchmark headers: HTTP headers payload: Request payload num_requests: Total number of requests to make timeout_seconds: Request timeout warmup: Whether to perform warm-up requests max_concurrent: Maximum concurrent requests (None = unlimited, all at once) """ print(f"\nStarting benchmark for {url}") if warmup: print(f" Warming up with 5 requests...") await warmup_endpoint(url, headers, payload, num_warmup=5, timeout_seconds=timeout_seconds) if max_concurrent: print(f" Making {num_requests} requests with max {max_concurrent} concurrent...") else: print(f" Making {num_requests} requests in parallel (unlimited concurrency)...") results = BenchmarkResults(total_requests=num_requests) timeout = aiohttp.ClientTimeout(total=timeout_seconds) # Set connector limits based on concurrency if max_concurrent: connector_limit = min(max_concurrent * 2, 200) # Allow some headroom connector_limit_per_host = max_concurrent else: connector_limit = 200 connector_limit_per_host = 100 # Use optimized connector for connection pooling and reuse connector = TCPConnector( limit=connector_limit, limit_per_host=connector_limit_per_host, ttl_dns_cache=300, # DNS cache TTL (5 minutes) force_close=False, # Reuse connections for better performance enable_cleanup_closed=True, # Clean up closed connections ) # Use time.perf_counter() for higher precision start_time = time.perf_counter() async with aiohttp.ClientSession(connector=connector) as session: if max_concurrent: # Use semaphore to limit concurrency semaphore = asyncio.Semaphore(max_concurrent) tasks = [ make_request_with_semaphore(session, semaphore, url, headers, payload, timeout) for _ in range(num_requests) ] else: # Create all tasks at once for maximum parallelism tasks = [ make_request(session, url, headers, payload, timeout) for _ in range(num_requests) ] # Execute all requests (with concurrency limit if specified) request_stats = await asyncio.gather(*tasks) results.total_time = time.perf_counter() - start_time # Aggregate results for stats in request_stats: if stats.success: results.successful_requests += 1 results.latencies.append(stats.latency) else: results.failed_requests += 1 results.errors.append(stats.error) if stats.status_code > 0: results.status_codes[stats.status_code] = results.status_codes.get(stats.status_code, 0) + 1 return results def print_results(name: str, results: BenchmarkResults): """Print formatted benchmark results""" stats = results.calculate_stats() print(f"\n{'='*60}") print(f"Results for {name}") print(f"{'='*60}") print(f"Total Requests: {stats['total_requests']}") print(f"Successful Requests: {stats['successful_requests']}") print(f"Failed Requests: {stats['failed_requests']}") print(f"Success Rate: {stats['success_rate']:.2f}%") print(f"Error Rate: {stats['error_rate']:.2f}%") print(f"Total Time: {stats['total_time']:.2f}s") print(f"Requests/Second: {stats['requests_per_second']:.2f}") if 'latency_stats' in stats: latency = stats['latency_stats'] print(f"\nLatency Statistics (seconds):") print(f" Mean: {latency['mean']:.4f}s") print(f" Median (p50): {latency['median']:.4f}s") print(f" Min: {latency['min']:.4f}s") print(f" Max: {latency['max']:.4f}s") print(f" Std Dev: {latency['std_dev']:.4f}s") print(f" p95: {latency['p95']:.4f}s") print(f" p99: {latency['p99']:.4f}s") if stats['status_codes']: print(f"\nStatus Codes:") for code, count in sorted(stats['status_codes'].items()): print(f" {code}: {count}") if results.errors: print(f"\nErrors (showing first 5 unique):") unique_errors = list(set(results.errors))[:5] for error in unique_errors: count = results.errors.count(error) print(f" [{count}x] {error}") def aggregate_results(results_list: List[BenchmarkResults]) -> BenchmarkResults: """Aggregate results from multiple runs""" if not results_list: return BenchmarkResults() aggregated = BenchmarkResults() # Aggregate all latencies all_latencies = [] all_errors = [] total_requests = 0 total_successful = 0 total_failed = 0 total_time_sum = 0.0 status_codes_combined = {} for result in results_list: all_latencies.extend(result.latencies) all_errors.extend(result.errors) total_requests += result.total_requests total_successful += result.successful_requests total_failed += result.failed_requests total_time_sum += result.total_time for code, count in result.status_codes.items(): status_codes_combined[code] = status_codes_combined.get(code, 0) + count aggregated.latencies = all_latencies aggregated.errors = all_errors aggregated.total_requests = total_requests aggregated.successful_requests = total_successful aggregated.failed_requests = total_failed aggregated.total_time = total_time_sum / len(results_list) # Average time aggregated.status_codes = status_codes_combined return aggregated def print_run_variance(name: str, results_list: List[BenchmarkResults]): """Print variance statistics across multiple runs""" if len(results_list) <= 1: return print(f"\n{'='*60}") print(f"Run-to-Run Variance: {name}") print(f"{'='*60}") # Collect mean latencies from each run mean_latencies = [] throughputs = [] for result in results_list: stats = result.calculate_stats() if 'latency_stats' in stats: mean_latencies.append(stats['latency_stats']['mean']) throughputs.append(stats['requests_per_second']) if mean_latencies: print(f"\nMean Latency Variance:") print(f" Runs: {len(mean_latencies)}") print(f" Mean: {mean(mean_latencies):.4f}s") print(f" Min: {min(mean_latencies):.4f}s") print(f" Max: {max(mean_latencies):.4f}s") print(f" Std Dev: {stdev(mean_latencies):.4f}s" if len(mean_latencies) > 1 else " Std Dev: N/A") print(f" Coefficient of Variation: {(stdev(mean_latencies) / mean(mean_latencies) * 100):.2f}%" if len(mean_latencies) > 1 else " Coefficient of Variation: N/A") if throughputs: print(f"\nThroughput Variance:") print(f" Mean: {mean(throughputs):.2f} req/s") print(f" Min: {min(throughputs):.2f} req/s") print(f" Max: {max(throughputs):.2f} req/s") print(f" Std Dev: {stdev(throughputs):.2f} req/s" if len(throughputs) > 1 else " Std Dev: N/A") def compare_results(proxy_results: BenchmarkResults, provider_results: BenchmarkResults): """Compare and print differences between proxy and provider results""" proxy_stats = proxy_results.calculate_stats() provider_stats = provider_results.calculate_stats() print(f"\n{'='*60}") print(f"Comparison: LiteLLM Proxy vs Direct Provider") print(f"{'='*60}") # Success Rate Comparison print(f"\nSuccess Rate:") print(f" Proxy: {proxy_stats['success_rate']:.2f}%") print(f" Provider: {provider_stats['success_rate']:.2f}%") diff = proxy_stats['success_rate'] - provider_stats['success_rate'] print(f" Difference: {diff:+.2f}%") # Throughput Comparison print(f"\nThroughput (requests/second):") print(f" Proxy: {proxy_stats['requests_per_second']:.2f}") print(f" Provider: {provider_stats['requests_per_second']:.2f}") diff = proxy_stats['requests_per_second'] - provider_stats['requests_per_second'] print(f" Difference: {diff:+.2f} req/s") # Latency Comparison if 'latency_stats' in proxy_stats and 'latency_stats' in provider_stats: print(f"\nLatency Comparison (seconds):") proxy_latency = proxy_stats['latency_stats'] provider_latency = provider_stats['latency_stats'] metrics = ['mean', 'median', 'p95', 'p99'] for metric in metrics: proxy_val = proxy_latency[metric] provider_val = provider_latency[metric] diff = proxy_val - provider_val diff_pct = (diff / provider_val * 100) if provider_val > 0 else 0 print(f" {metric.upper():8s}: Proxy={proxy_val:.4f}s, Provider={provider_val:.4f}s, Diff={diff:+.4f}s ({diff_pct:+.2f}%)") # Total Time Comparison print(f"\nTotal Time:") print(f" Proxy: {proxy_stats['total_time']:.2f}s") print(f" Provider: {provider_stats['total_time']:.2f}s") diff = proxy_stats['total_time'] - provider_stats['total_time'] diff_pct = (diff / provider_stats['total_time'] * 100) if provider_stats['total_time'] > 0 else 0 print(f" Difference: {diff:+.2f}s ({diff_pct:+.2f}%)") async def main(): """Main benchmark function""" parser = argparse.ArgumentParser( description="Benchmark LiteLLM proxy vs direct provider endpoint", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Environment Variables (required): LITELLM_PROXY_URL - URL of the LiteLLM proxy endpoint (e.g., http://localhost:4000/chat/completions) PROVIDER_URL - URL of the direct provider endpoint (e.g., https://api.openai.com/v1/chat/completions) LITELLM_PROXY_API_KEY - API key for LiteLLM proxy (optional, but may be required) PROVIDER_API_KEY - API key for direct provider (optional, but may be required) Examples: # 1. Basic usage (recommended - sequential execution) export LITELLM_PROXY_URL='http://localhost:4000/chat/completions' export PROVIDER_URL='https://api.openai.com/v1/chat/completions' export LITELLM_PROXY_API_KEY='sk-1234' export PROVIDER_API_KEY='sk-openai-key' python scripts/benchmark_proxy_vs_provider.py # 2. Multiple runs for statistical accuracy (recommended) python scripts/benchmark_proxy_vs_provider.py --runs 5 # 3. Realistic load testing with concurrency limit python scripts/benchmark_proxy_vs_provider.py --max-concurrent 100 --requests 2000 # 4. Quick test with fewer requests python scripts/benchmark_proxy_vs_provider.py --requests 100 # 5. Parallel execution (not recommended - may affect accuracy) python scripts/benchmark_proxy_vs_provider.py --parallel # 6. Custom timeout for slower endpoints python scripts/benchmark_proxy_vs_provider.py --timeout 120 # 7. Combined options for comprehensive testing python scripts/benchmark_proxy_vs_provider.py --runs 3 --requests 500 --max-concurrent 50 # 8. Skip warmup (not recommended - may affect first request accuracy) python scripts/benchmark_proxy_vs_provider.py --no-warmup """ ) parser.add_argument( "--parallel", action="store_true", help="Run both benchmarks in parallel (default: sequential to avoid interference)", ) parser.add_argument( "--requests", type=int, default=1000, help="Number of requests per endpoint (default: 1000)", ) parser.add_argument( "--timeout", type=int, default=60, help="Request timeout in seconds (default: 60)", ) parser.add_argument( "--runs", type=int, default=1, help="Number of benchmark runs to average (default: 1, recommended: 3-5 for accuracy)", ) parser.add_argument( "--no-warmup", action="store_true", help="Skip warm-up requests (not recommended)", ) parser.add_argument( "--max-concurrent", type=int, default=None, help="Maximum concurrent requests (default: unlimited - all at once). " "Useful for realistic load testing (e.g., --max-concurrent 100)", ) args = parser.parse_args() # Configuration from environment variables LITELLM_PROXY_URL = os.getenv("LITELLM_PROXY_URL") PROVIDER_URL = os.getenv("PROVIDER_URL") LITELLM_PROXY_API_KEY = os.getenv("LITELLM_PROXY_API_KEY", "") PROVIDER_API_KEY = os.getenv("PROVIDER_API_KEY", "") # Validate required environment variables if not LITELLM_PROXY_URL: print("Error: LITELLM_PROXY_URL environment variable is required") print(" Example: export LITELLM_PROXY_URL='https://your-proxy.com/chat/completions'") sys.exit(1) if not PROVIDER_URL: print("Error: PROVIDER_URL environment variable is required") print(" Example: export PROVIDER_URL='https://your-provider.com/v1/chat/completions'") sys.exit(1) # Headers for LiteLLM proxy proxy_headers = { "Content-Type": "application/json", } if LITELLM_PROXY_API_KEY: proxy_headers["Authorization"] = f"Bearer {LITELLM_PROXY_API_KEY}" else: print("Warning: LITELLM_PROXY_API_KEY not set, requests may fail if authentication is required") # Headers for direct provider provider_headers = { "Content-Type": "application/json", } if PROVIDER_API_KEY: provider_headers["Authorization"] = f"Bearer {PROVIDER_API_KEY}" else: print("Warning: PROVIDER_API_KEY not set, requests may fail if authentication is required") # Payload (same for both) payload = { "model": "db-openai-endpoint", # For proxy "messages": [ { "role": "user", "content": "Hello, how are you?" } ], "max_tokens": 100, "user": "new_user" } # For direct provider, might need different model name provider_payload = payload.copy() # provider_payload["model"] = "gpt-3.5-turbo" # Uncomment if needed num_requests = args.requests timeout_seconds = args.timeout print("="*60) print("LiteLLM Proxy vs Provider Benchmark") print("="*60) print(f"Configuration (from environment variables):") print(f" Proxy URL: {LITELLM_PROXY_URL}") print(f" Provider URL: {PROVIDER_URL}") print(f" Proxy API Key: {'Set' if LITELLM_PROXY_API_KEY else 'Not set (may cause auth errors)'}") print(f" Provider API Key: {'Set' if PROVIDER_API_KEY else 'Not set (may cause auth errors)'}") print(f" Requests: {num_requests}") print(f" Runs: {args.runs}") print(f" Max Concurrent: {args.max_concurrent if args.max_concurrent else 'Unlimited (all at once)'}") print(f" Timeout: {timeout_seconds}s") print(f" Warmup: {'Enabled' if not args.no_warmup else 'Disabled (not recommended)'}") print(f" Mode: {'Parallel (may affect results)' if args.parallel else 'Sequential (recommended)'}") if not args.max_concurrent: print(f"\nTip: Use --max-concurrent 100 for more realistic load testing") print(f" (prevents overwhelming the server with all requests at once)") if args.parallel: print(f"\nWARNING: Running benchmarks in parallel may affect results due to:") print(f" - Shared network bandwidth") print(f" - Provider endpoint receiving double load (via proxy + direct)") print(f" - Potential rate limiting issues") print(f" - Resource contention") # Run benchmarks multiple times if requested all_proxy_results = [] all_provider_results = [] warmup_enabled = not args.no_warmup if args.runs > 1: print(f"\nRunning {args.runs} benchmark runs for statistical accuracy...") print(f" Results will be averaged across all runs.\n") overall_start_time = time.perf_counter() # Initialize to satisfy type checker (will always be set in loop) proxy_results: Optional[BenchmarkResults] = None provider_results: Optional[BenchmarkResults] = None for run_num in range(1, args.runs + 1): if args.runs > 1: print(f"\n{'='*60}") print(f"Run {run_num}/{args.runs}") print(f"{'='*60}") if args.parallel: print(f"\nRunning both benchmarks in parallel...") proxy_results, provider_results = await asyncio.gather( benchmark_endpoint( LITELLM_PROXY_URL, proxy_headers, payload, num_requests, timeout_seconds, warmup=warmup_enabled and run_num == 1, # Only warmup on first run max_concurrent=args.max_concurrent, ), benchmark_endpoint( PROVIDER_URL, provider_headers, provider_payload, num_requests, timeout_seconds, warmup=warmup_enabled and run_num == 1, # Only warmup on first run max_concurrent=args.max_concurrent, ), ) else: print(f"\nRunning benchmarks sequentially (proxy first, then provider)...") if run_num == 1: print(f" This ensures accurate results without interference.\n") proxy_results = await benchmark_endpoint( LITELLM_PROXY_URL, proxy_headers, payload, num_requests, timeout_seconds, warmup=warmup_enabled and run_num == 1, # Only warmup on first run max_concurrent=args.max_concurrent, ) if run_num < args.runs or args.runs == 1: print(f"\nWaiting 3 seconds before starting provider benchmark...") await asyncio.sleep(3) # Longer pause to ensure clean separation provider_results = await benchmark_endpoint( PROVIDER_URL, provider_headers, provider_payload, num_requests, timeout_seconds, warmup=warmup_enabled and run_num == 1, # Only warmup on first run max_concurrent=args.max_concurrent, ) all_proxy_results.append(proxy_results) all_provider_results.append(provider_results) # Brief pause between runs if run_num < args.runs: print(f"\nWaiting 5 seconds before next run...") await asyncio.sleep(5) overall_benchmark_time = time.perf_counter() - overall_start_time print(f"\nAll benchmark runs completed in {overall_benchmark_time:.2f}s") # Aggregate results across multiple runs if args.runs > 1: final_proxy_results = aggregate_results(all_proxy_results) final_provider_results = aggregate_results(all_provider_results) print(f"\nAggregated results across {args.runs} runs:") else: # Use results from single run if proxy_results is None or provider_results is None: raise RuntimeError("Benchmark results not initialized") final_proxy_results = proxy_results final_provider_results = provider_results print(f"\nResults:") # Print individual results print_results("LiteLLM Proxy", final_proxy_results) print_results("Direct Provider", final_provider_results) # Print comparison compare_results(final_proxy_results, final_provider_results) # Show run-to-run variance if multiple runs if args.runs > 1: print_run_variance("LiteLLM Proxy", all_proxy_results) print_run_variance("Direct Provider", all_provider_results) print(f"\n{'='*60}") print("Benchmark complete!") print(f"{'='*60}\n") if __name__ == "__main__": try: asyncio.run(main()) except KeyboardInterrupt: print("\n\nBenchmark interrupted by user") sys.exit(1) except Exception as e: print(f"\n\nError running benchmark: {e}") import traceback traceback.print_exc() sys.exit(1)