From 57295cedef003dbf2012b8bb434c8562ab14ab2f Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 3 Nov 2025 17:22:19 -0800 Subject: [PATCH] [Feat] Add Azure AI Doc Intelligence OCR (#16219) * TestAzureDocumentIntelligenceOCR * add AZURE_DOCUMENT_INTELLIGENCE_API_VERSION * add AzureDocumentIntelligenceOCRConfig * add async_transform_ocr_response * use async transform * add AzureDocumentIntelligenceOCRConfig * add AzureDocumentIntelligenceOCRConfig * add AzureDocumentIntelligenceOCRConfig * add get_azure_ai_ocr_config * add azure_ai/doc-intelligence * add azure_ai/doc-intelligence * docs fix * docs fix * add azure doc intel * fix lint error --- .../providers/azure_document_intelligence.md | 408 ++++++++++ docs/my-website/docs/providers/azure_ocr.md | 2 +- docs/my-website/sidebars.js | 1 + litellm/constants.py | 6 + litellm/llms/azure_ai/ocr/__init__.py | 10 +- litellm/llms/azure_ai/ocr/common_utils.py | 53 ++ .../ocr/document_intelligence/__init__.py | 5 + .../document_intelligence/transformation.py | 697 ++++++++++++++++++ litellm/llms/base_llm/ocr/transformation.py | 30 + litellm/llms/custom_httpx/llm_http_handler.py | 6 +- ...odel_prices_and_context_window_backup.json | 27 + litellm/utils.py | 8 +- model_prices_and_context_window.json | 27 + provider_endpoints_support.json | 17 + .../test_ocr_azure_document_intelligence.py | 44 ++ 15 files changed, 1334 insertions(+), 7 deletions(-) create mode 100644 docs/my-website/docs/providers/azure_document_intelligence.md create mode 100644 litellm/llms/azure_ai/ocr/common_utils.py create mode 100644 litellm/llms/azure_ai/ocr/document_intelligence/__init__.py create mode 100644 litellm/llms/azure_ai/ocr/document_intelligence/transformation.py create mode 100644 tests/ocr_tests/test_ocr_azure_document_intelligence.py diff --git a/docs/my-website/docs/providers/azure_document_intelligence.md b/docs/my-website/docs/providers/azure_document_intelligence.md new file mode 100644 index 0000000000..edc3c616fa --- /dev/null +++ 
b/docs/my-website/docs/providers/azure_document_intelligence.md @@ -0,0 +1,408 @@ +# Azure Document Intelligence OCR + +## Overview + +| Property | Details | +|-------|-------| +| Description | Azure Document Intelligence (formerly Form Recognizer) provides advanced document analysis capabilities including text extraction, layout analysis, and structure recognition | +| Provider Route on LiteLLM | `azure_ai/doc-intelligence/` | +| Supported Operations | `/ocr` | +| Link to Provider Doc | [Azure Document Intelligence ↗](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/) + +Extract text and analyze document structure using Azure Document Intelligence's powerful prebuilt models. + +## Quick Start + +### **LiteLLM SDK** + +```python showLineNumbers title="SDK Usage" +import litellm +import os + +# Set environment variables +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +# OCR with PDF URL +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) + +# Access extracted text +for page in response.pages: + print(f"Page {page.index}:") + print(page.markdown) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-doc-intel + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-layout + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Start Proxy** +```bash +litellm --config proxy_config.yaml +``` + +**Call OCR via Proxy** +```bash showLineNumbers title="cURL Request" +curl -X POST http://localhost:4000/ocr \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "model": "azure-doc-intel", + 
"document": { + "type": "document_url", + "document_url": "https://arxiv.org/pdf/2201.04234" + } + }' +``` + +## How It Works + +Azure Document Intelligence uses an asynchronous API pattern. LiteLLM AI Gateway handles the request/response transformation and polling automatically. + +### Complete Flow Diagram + +```mermaid +sequenceDiagram + participant Client + box rgb(200, 220, 255) LiteLLM AI Gateway + participant LiteLLM + end + participant Azure as Azure Document Intelligence + + Client->>LiteLLM: POST /ocr (Mistral format) + Note over LiteLLM: Transform to Azure format + + LiteLLM->>Azure: POST :analyze + Azure-->>LiteLLM: 202 Accepted + polling URL + + Note over LiteLLM: Automatic Polling + loop Every 2-10 seconds + LiteLLM->>Azure: GET polling URL + Azure-->>LiteLLM: Status: running + end + + LiteLLM->>Azure: GET polling URL + Azure-->>LiteLLM: Status: succeeded + results + + Note over LiteLLM: Transform to Mistral format + LiteLLM-->>Client: OCR Response (Mistral format) +``` + +### What LiteLLM Does For You + +When you call `litellm.ocr()` via SDK or `/ocr` via Proxy: + +1. **Request Transformation**: Converts Mistral OCR format → Azure Document Intelligence format +2. **Submits Document**: Sends transformed request to Azure DI API +3. **Handles 202 Response**: Captures the `Operation-Location` URL from response headers +4. **Automatic Polling**: + - Polls the operation URL at intervals specified by `retry-after` header (default: 2 seconds) + - Continues until status is `succeeded` or `failed` + - Respects Azure's rate limiting via `retry-after` headers +5. **Response Transformation**: Converts Azure DI format → Mistral OCR format +6. 
**Returns Result**: Sends unified Mistral format response to client + +**Polling Configuration:** +- Default timeout: 120 seconds +- Configurable via `AZURE_OPERATION_POLLING_TIMEOUT` environment variable +- Uses sync (`time.sleep()`) or async (`await asyncio.sleep()`) based on call type + +:::info +**Typical processing time**: 2-10 seconds depending on document size and complexity +::: + +## Supported Models + +Azure Document Intelligence offers several prebuilt models optimized for different use cases: + +### prebuilt-layout (Recommended) + +Best for general document OCR with structure preservation. + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + +```python showLineNumbers title="Layout Model - SDK" +import litellm +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + + + + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-layout + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-layout + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Usage:** +```bash +curl -X POST http://localhost:4000/ocr \ + -H "Authorization: Bearer your-api-key" \ + -d '{"model": "azure-layout", "document": {"type": "document_url", "document_url": "https://example.com/doc.pdf"}}' +``` + + + + +**Features:** +- Text extraction with markdown formatting +- Table detection and extraction +- Document structure analysis +- Paragraph and section recognition + +**Pricing:** $10 per 1,000 pages + +### prebuilt-read + +Optimized for reading text from documents - fastest and most cost-effective. 
+ + + + +```python showLineNumbers title="Read Model - SDK" +import litellm +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-read", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + + + + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-read + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-read + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Usage:** +```bash +curl -X POST http://localhost:4000/ocr \ + -H "Authorization: Bearer your-api-key" \ + -d '{"model": "azure-read", "document": {"type": "document_url", "document_url": "https://example.com/doc.pdf"}}' +``` + + + + +**Features:** +- Fast text extraction +- Optimized for reading-heavy documents +- Basic structure recognition + +**Pricing:** $1.50 per 1,000 pages + +### prebuilt-document + +General-purpose document analysis with key-value pairs. 
+ + + + +```python showLineNumbers title="Document Model - SDK" +import litellm +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-document", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + + + + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-document + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-document + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Usage:** +```bash +curl -X POST http://localhost:4000/ocr \ + -H "Authorization: Bearer your-api-key" \ + -d '{"model": "azure-document", "document": {"type": "document_url", "document_url": "https://example.com/doc.pdf"}}' +``` + + + + +**Pricing:** $10 per 1,000 pages + +## Document Types + +Azure Document Intelligence supports various document formats. 
+ +### PDF Documents + +```python showLineNumbers title="PDF OCR" +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + +### Image Documents + +```python showLineNumbers title="Image OCR" +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "image_url", + "image_url": "https://example.com/image.png" + } +) +``` + +**Supported image formats:** JPEG, PNG, BMP, TIFF + +### Base64 Encoded Documents + +```python showLineNumbers title="Base64 PDF" +import base64 + +# Read and encode PDF +with open("document.pdf", "rb") as f: + pdf_base64 = base64.b64encode(f.read()).decode() + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": f"data:application/pdf;base64,{pdf_base64}" + } +) +``` + +## Response Format + +```python showLineNumbers title="Response Structure" +# Response has the following structure +response.pages # List of pages with extracted text +response.model # Model used +response.object # "ocr" +response.usage_info # Usage information (pages processed, document size) + +# Access page content +for page in response.pages: + print(f"Page {page.index}:") + print(page.markdown) + + # Page dimensions (in pixels) + if page.dimensions: + print(f"Width: {page.dimensions.width}px") + print(f"Height: {page.dimensions.height}px") +``` + +## Async Support + +```python showLineNumbers title="Async Usage" +import litellm +import asyncio + +async def process_document(): + response = await litellm.aocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } + ) + return response + +# Run async function +response = asyncio.run(process_document()) +``` + +## Cost Tracking + +LiteLLM automatically tracks costs for Azure Document Intelligence OCR: + +| Model | 
Cost per 1,000 Pages | +|-------|---------------------| +| prebuilt-read | $1.50 | +| prebuilt-layout | $10.00 | +| prebuilt-document | $10.00 | + +```python showLineNumbers title="View Cost" +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={"type": "document_url", "document_url": "https://..."} +) + +# Access cost information +print(f"Cost: ${response._hidden_params.get('response_cost', 0)}") +``` + +## Additional Resources + +- [Azure Document Intelligence Documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/) +- [Pricing Details](https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/) +- [Supported File Formats](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview) +- [LiteLLM OCR Documentation](https://docs.litellm.ai/docs/ocr) + diff --git a/docs/my-website/docs/providers/azure_ocr.md b/docs/my-website/docs/providers/azure_ocr.md index c93e995c43..5d79cc0533 100644 --- a/docs/my-website/docs/providers/azure_ocr.md +++ b/docs/my-website/docs/providers/azure_ocr.md @@ -1,4 +1,4 @@ -# Azure AI OCR +# Azure AI OCR (Mistral) ## Overview diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 84c1067824..ccba0bdb7c 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -455,6 +455,7 @@ const sidebars = { items: [ "providers/azure_ai", "providers/azure_ocr", + "providers/azure_document_intelligence", "providers/azure_ai_speech", "providers/azure_ai_img", "providers/azure_ai_vector_stores", diff --git a/litellm/constants.py b/litellm/constants.py index 372c838ce3..64630d5cb0 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -209,6 +209,12 @@ DEFAULT_POLLING_INTERVAL = float( os.getenv("DEFAULT_POLLING_INTERVAL", 0.03) ) # default polling interval for the scheduler AZURE_OPERATION_POLLING_TIMEOUT = int(os.getenv("AZURE_OPERATION_POLLING_TIMEOUT", 120)) 
def get_azure_ai_ocr_config(model: str) -> Optional["BaseOCRConfig"]:
    """
    Pick the OCR configuration that matches an Azure AI model name.

    Model names containing "doc-intelligence" or "documentintelligence" are
    routed to the Azure Document Intelligence config; every other name falls
    back to the Mistral-style Azure AI OCR config.

    Args:
        model: The model name (e.g., "azure_ai/doc-intelligence/prebuilt-read",
               "azure_ai/pixtral-12b-2409")

    Returns:
        A newly constructed OCR configuration instance for the model
    """
    # Both config classes are imported at call time, inside the function.
    from litellm.llms.azure_ai.ocr.document_intelligence.transformation import (
        AzureDocumentIntelligenceOCRConfig,
    )
    from litellm.llms.azure_ai.ocr.transformation import AzureAIOCRConfig

    document_intelligence_markers = ("doc-intelligence", "documentintelligence")
    is_document_intelligence = any(
        marker in model for marker in document_intelligence_markers
    )

    if not is_document_intelligence:
        # Anything that is not Document Intelligence uses the Mistral-based config
        verbose_logger.debug(f"Routing {model} to Azure AI (Mistral) OCR config")
        return AzureAIOCRConfig()

    verbose_logger.debug(
        f"Routing {model} to Azure Document Intelligence OCR config"
    )
    return AzureDocumentIntelligenceOCRConfig()
+ +Azure Document Intelligence (formerly Form Recognizer) provides advanced document analysis capabilities. +This implementation transforms between Mistral OCR format and Azure Document Intelligence API v4.0. + +Note: Azure Document Intelligence API is async - POST returns 202 Accepted with Operation-Location header. +The operation location must be polled until the analysis completes. +""" +import asyncio +import re +import time +from typing import Any, Dict, Optional + +import httpx + +from litellm._logging import verbose_logger +from litellm.constants import ( + AZURE_DOCUMENT_INTELLIGENCE_API_VERSION, + AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI, + AZURE_OPERATION_POLLING_TIMEOUT, +) +from litellm.llms.base_llm.ocr.transformation import ( + BaseOCRConfig, + DocumentType, + OCRPage, + OCRPageDimensions, + OCRRequestData, + OCRResponse, + OCRUsageInfo, +) +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler +from litellm.secret_managers.main import get_secret_str + + +class AzureDocumentIntelligenceOCRConfig(BaseOCRConfig): + """ + Azure Document Intelligence OCR transformation configuration. + + Supports Azure Document Intelligence v4.0 (2024-11-30) API. + Model route: azure_ai/doc-intelligence/ + + Supported models: + - prebuilt-layout: Extracts text with markdown, tables, and structure (closest to Mistral OCR) + - prebuilt-read: Basic text extraction optimized for reading + - prebuilt-document: General document analysis + + Reference: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/ + """ + + def __init__(self) -> None: + super().__init__() + + def get_supported_ocr_params(self, model: str) -> list: + """ + Get supported OCR parameters for Azure Document Intelligence. + + Azure DI has minimal optional parameters compared to Mistral OCR. + Most Mistral-specific params are ignored during transformation. 
def validate_environment(
    self,
    headers: Dict,
    model: str,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    litellm_params: Optional[dict] = None,
    **kwargs,
) -> Dict:
    """
    Check that credentials are available and build the request headers.

    The API key is sent in the Ocp-Apim-Subscription-Key header. Caller-supplied
    headers are merged last, so they win over the defaults set here.

    Args:
        headers: Extra headers supplied by the caller
        model: Model name (not used by this method)
        api_key: API key; read from AZURE_DOCUMENT_INTELLIGENCE_API_KEY when None
        api_base: Endpoint; read from AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT when None
        litellm_params: Not used by this method

    Returns:
        Header dict containing the subscription key, a JSON content type and
        the caller's headers

    Raises:
        ValueError: If the API key or the endpoint cannot be resolved
    """
    resolved_key = (
        api_key
        if api_key is not None
        else get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_API_KEY")
    )
    if resolved_key is None:
        raise ValueError(
            "Missing Azure Document Intelligence API Key - Set AZURE_DOCUMENT_INTELLIGENCE_API_KEY environment variable or pass api_key parameter"
        )

    # The endpoint is only checked for presence here; it is not part of the headers.
    resolved_base = (
        api_base
        if api_base is not None
        else get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    )
    if resolved_base is None:
        raise ValueError(
            "Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter"
        )

    merged_headers: Dict = {
        "Ocp-Apim-Subscription-Key": resolved_key,
        "Content-Type": "application/json",
    }
    merged_headers.update(headers)
    return merged_headers
+ + Format: {endpoint}/documentintelligence/documentModels/{modelId}:analyze?api-version=2024-11-30 + + Note: API version 2024-11-30 uses /documentintelligence/ path (not /formrecognizer/) + + Args: + api_base: Azure Document Intelligence endpoint (e.g., https://your-resource.cognitiveservices.azure.com) + model: Model ID (e.g., "prebuilt-layout", "prebuilt-read") + optional_params: Optional parameters + + Returns: Complete URL for Azure DI analyze endpoint + """ + if api_base is None: + raise ValueError( + "Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter" + ) + + # Ensure no trailing slash + api_base = api_base.rstrip("/") + + # Extract model ID from full model path if needed + # Model can be "prebuilt-layout" or "azure_ai/doc-intelligence/prebuilt-layout" + model_id = model + if "/" in model: + # Extract the last part after the last slash + model_id = model.split("/")[-1] + + # Azure Document Intelligence analyze endpoint + # Note: API version 2024-11-30+ uses /documentintelligence/ (not /formrecognizer/) + return f"{api_base}/documentintelligence/documentModels/{model_id}:analyze?api-version={AZURE_DOCUMENT_INTELLIGENCE_API_VERSION}" + + def _extract_base64_from_data_uri(self, data_uri: str) -> str: + """ + Extract base64 content from a data URI. + + Args: + data_uri: Data URI like "data:application/pdf;base64,..." + + Returns: + Base64 string without the data URI prefix + """ + # Match pattern: data:[][;base64], + match = re.match(r"data:([^;]+)(?:;base64)?,(.+)", data_uri) + if match: + return match.group(2) + return data_uri + + def transform_ocr_request( + self, + model: str, + document: DocumentType, + optional_params: dict, + headers: dict, + **kwargs, + ) -> OCRRequestData: + """ + Transform OCR request to Azure Document Intelligence format. 
+ + Mistral OCR format: + { + "document": { + "type": "document_url", + "document_url": "https://example.com/doc.pdf" + } + } + + Azure DI format: + { + "urlSource": "https://example.com/doc.pdf" + } + OR + { + "base64Source": "base64_encoded_content" + } + + Args: + model: Model name + document: Document dict from user (Mistral format) + optional_params: Already mapped optional parameters + headers: Request headers + + Returns: + OCRRequestData with JSON data + """ + verbose_logger.debug( + f"Azure Document Intelligence transform_ocr_request - model: {model}" + ) + + if not isinstance(document, dict): + raise ValueError(f"Expected document dict, got {type(document)}") + + # Extract document URL from Mistral format + doc_type = document.get("type") + document_url = None + + if doc_type == "document_url": + document_url = document.get("document_url", "") + elif doc_type == "image_url": + document_url = document.get("image_url", "") + else: + raise ValueError( + f"Invalid document type: {doc_type}. Must be 'document_url' or 'image_url'" + ) + + if not document_url: + raise ValueError("Document URL is required") + + # Build Azure DI request + data: Dict[str, Any] = {} + + # Check if it's a data URI (base64) + if document_url.startswith("data:"): + # Extract base64 content + base64_content = self._extract_base64_from_data_uri(document_url) + data["base64Source"] = base64_content + verbose_logger.debug("Using base64Source for Azure Document Intelligence") + else: + # Regular URL + data["urlSource"] = document_url + verbose_logger.debug("Using urlSource for Azure Document Intelligence") + + # Azure DI doesn't support most Mistral-specific params + # Ignore pages, include_image_base64, etc. + + return OCRRequestData(data=data, files=None) + + def _extract_page_markdown(self, page_data: Dict[str, Any]) -> str: + """ + Extract text from Azure DI page and format as markdown. + + Azure DI provides text in 'lines' array. We concatenate them with newlines. 
+ + Args: + page_data: Azure DI page object + + Returns: + Markdown-formatted text + """ + lines = page_data.get("lines", []) + if not lines: + return "" + + # Extract text content from each line + text_lines = [line.get("content", "") for line in lines] + + # Join with newlines to preserve structure + return "\n".join(text_lines) + + def _convert_dimensions( + self, width: float, height: float, unit: str + ) -> OCRPageDimensions: + """ + Convert Azure DI dimensions to pixels. + + Azure DI provides dimensions in inches. We convert to pixels using configured DPI. + + Args: + width: Width in specified unit + height: Height in specified unit + unit: Unit of measurement (e.g., "inch") + + Returns: + OCRPageDimensions with pixel values + """ + # Convert to pixels using configured DPI + dpi = AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI + if unit == "inch": + width_px = int(width * dpi) + height_px = int(height * dpi) + else: + # If unit is not inches, assume it's already in pixels + width_px = int(width) + height_px = int(height) + + return OCRPageDimensions(width=width_px, height=height_px, dpi=dpi) + + @staticmethod + def _check_timeout(start_time: float, timeout_secs: int) -> None: + """ + Check if operation has timed out. + + Args: + start_time: Start time of the operation + timeout_secs: Timeout duration in seconds + + Raises: + TimeoutError: If operation has exceeded timeout + """ + if time.time() - start_time > timeout_secs: + raise TimeoutError( + f"Azure Document Intelligence operation polling timed out after {timeout_secs} seconds" + ) + + @staticmethod + def _get_retry_after(response: httpx.Response) -> int: + """ + Get retry-after duration from response headers. 
@staticmethod
def _check_operation_status(response: httpx.Response) -> str:
    """
    Read the Azure DI operation status from a polling response.

    Only JSON decoding is guarded by the try block, so errors raised for a
    "failed" or unknown status reach the caller with their own message
    instead of being classified by searching the message text.

    Args:
        response: HTTP response from operation endpoint

    Returns:
        "succeeded" when the analysis is complete, "running" when the
        reported status is "running" or "notStarted"

    Raises:
        ValueError: If the body is not a JSON object, the operation failed,
            or the status is not one of the values handled here
    """
    try:
        result = response.json()
    except Exception as e:
        # Body was not valid JSON; keep the original error as the cause.
        raise ValueError(
            f"Failed to parse Azure DI operation response: {e}"
        ) from e

    if not isinstance(result, dict):
        raise ValueError(
            f"Failed to parse Azure DI operation response: expected a JSON object, got {type(result).__name__}"
        )

    status = result.get("status")
    verbose_logger.debug(f"Azure DI operation status: {status}")

    if status == "succeeded":
        return "succeeded"
    if status in ("running", "notStarted"):
        return "running"
    if status == "failed":
        # "error" may be absent or null; fall back to a generic message.
        error_msg = (result.get("error") or {}).get("message", "Unknown error")
        raise ValueError(
            f"Azure Document Intelligence analysis failed: {error_msg}"
        )
    raise ValueError(f"Unknown operation status: {status}")
+ + Args: + operation_url: The Operation-Location URL to poll + headers: Request headers (including auth) + timeout_secs: Total timeout in seconds + + Returns: + Final response with completed analysis + """ + from litellm.llms.custom_httpx.http_handler import _get_httpx_client + + client = _get_httpx_client() + start_time = time.time() + + verbose_logger.debug(f"Polling Azure DI operation: {operation_url}") + + while True: + self._check_timeout(start_time=start_time, timeout_secs=timeout_secs) + + # Poll the operation status + response = client.get(url=operation_url, headers=headers) + + # Check operation status + status = self._check_operation_status(response=response) + + if status == "succeeded": + return response + elif status == "running": + # Wait before polling again + retry_after = self._get_retry_after(response=response) + time.sleep(retry_after) + + async def _poll_operation_async( + self, + operation_url: str, + headers: Dict[str, str], + timeout_secs: int, + ) -> httpx.Response: + """ + Poll Azure Document Intelligence operation until completion (async). 
def transform_ocr_response(
    self,
    model: str,
    raw_response: httpx.Response,
    logging_obj: Any,
    **kwargs,
) -> OCRResponse:
    """
    Convert an Azure Document Intelligence response into Mistral OCR format.

    When the response is a 202 Accepted, the Operation-Location URL is polled
    synchronously (re-using the subscription key from the original request)
    and the final polling response is parsed instead.

    Each Azure page becomes one OCRPage: "pageNumber" (1-based) is turned into
    a 0-based index, the page text comes from _extract_page_markdown and the
    page size from _convert_dimensions.

    Args:
        model: Model name, echoed back in the result
        raw_response: Raw HTTP response from Azure DI (may be 202 Accepted)
        logging_obj: Logging object (not used by this method)

    Returns:
        OCRResponse in Mistral format

    Raises:
        ValueError: If a 202 response has no Operation-Location header or the
            final status is not "succeeded"
    """
    try:
        final_response = raw_response

        if raw_response.status_code == 202:
            verbose_logger.debug(
                "Azure DI returned 202 Accepted, polling operation..."
            )

            operation_url = raw_response.headers.get("Operation-Location")
            if not operation_url:
                raise ValueError(
                    "Azure Document Intelligence returned 202 but no Operation-Location header found"
                )

            # The polling request authenticates with the key sent on the original request
            subscription_key = raw_response.request.headers.get(
                "Ocp-Apim-Subscription-Key", ""
            )

            final_response = self._poll_operation_sync(
                operation_url=operation_url,
                headers={"Ocp-Apim-Subscription-Key": subscription_key},
                timeout_secs=AZURE_OPERATION_POLLING_TIMEOUT,
            )

        payload = final_response.json()
        status = payload.get("status")

        verbose_logger.debug(
            f"Azure Document Intelligence response status: {status}"
        )

        if status != "succeeded":
            raise ValueError(
                f"Azure Document Intelligence analysis failed with status: {status}"
            )

        source_pages = payload.get("analyzeResult", {}).get("pages", [])

        converted_pages = [
            OCRPage(
                # Azure page numbers are 1-based; the OCR format is 0-based
                index=source_page.get("pageNumber", 1) - 1,
                markdown=self._extract_page_markdown(source_page),
                dimensions=self._convert_dimensions(
                    width=source_page.get("width", 8.5),
                    height=source_page.get("height", 11),
                    unit=source_page.get("unit", "inch"),
                ),
            )
            for source_page in source_pages
        ]

        return OCRResponse(
            pages=converted_pages,
            model=model,
            usage_info=OCRUsageInfo(
                pages_processed=len(converted_pages), doc_size_bytes=None
            ),
            object="ocr",
        )

    except Exception as e:
        verbose_logger.error(
            f"Error parsing Azure Document Intelligence response: {e}"
        )
        raise e
+ ) + + # Get Operation-Location header + operation_url = raw_response.headers.get("Operation-Location") + if not operation_url: + raise ValueError( + "Azure Document Intelligence returned 202 but no Operation-Location header found" + ) + + # Get headers for polling (need auth) + poll_headers = { + "Ocp-Apim-Subscription-Key": raw_response.request.headers.get( + "Ocp-Apim-Subscription-Key", "" + ) + } + + # Get timeout from kwargs or use default + timeout_secs = AZURE_OPERATION_POLLING_TIMEOUT + + # Poll until operation completes (async) + raw_response = await self._poll_operation_async( + operation_url=operation_url, + headers=poll_headers, + timeout_secs=timeout_secs, + ) + + # Now parse the completed response + response_json = raw_response.json() + + verbose_logger.debug( + f"Azure Document Intelligence response status: {response_json.get('status')}" + ) + + # Check if request succeeded + status = response_json.get("status") + if status != "succeeded": + raise ValueError( + f"Azure Document Intelligence analysis failed with status: {status}" + ) + + # Extract analyze result + analyze_result = response_json.get("analyzeResult", {}) + azure_pages = analyze_result.get("pages", []) + + # Transform pages to Mistral format + mistral_pages = [] + for azure_page in azure_pages: + page_number = azure_page.get("pageNumber", 1) + index = page_number - 1 # Convert to 0-based index + + # Extract markdown text + markdown = self._extract_page_markdown(azure_page) + + # Convert dimensions + width = azure_page.get("width", 8.5) + height = azure_page.get("height", 11) + unit = azure_page.get("unit", "inch") + dimensions = self._convert_dimensions( + width=width, height=height, unit=unit + ) + + # Build OCR page + ocr_page = OCRPage( + index=index, markdown=markdown, dimensions=dimensions + ) + mistral_pages.append(ocr_page) + + # Build usage info + usage_info = OCRUsageInfo( + pages_processed=len(mistral_pages), doc_size_bytes=None + ) + + # Return Mistral OCR response + return 
OCRResponse( + pages=mistral_pages, + model=model, + usage_info=usage_info, + object="ocr", + ) + + except Exception as e: + verbose_logger.error( + f"Error parsing Azure Document Intelligence response (async): {e}" + ) + raise e + diff --git a/litellm/llms/base_llm/ocr/transformation.py b/litellm/llms/base_llm/ocr/transformation.py index c88783f97f..fb13332c46 100644 --- a/litellm/llms/base_llm/ocr/transformation.py +++ b/litellm/llms/base_llm/ocr/transformation.py @@ -198,6 +198,36 @@ class BaseOCRConfig: """ raise NotImplementedError("transform_ocr_response must be implemented by provider") + async def async_transform_ocr_response( + self, + model: str, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + **kwargs, + ) -> OCRResponse: + """ + Async transform provider-specific OCR response to standard format. + Optional method - providers can override if they need async transformations + (e.g., Azure Document Intelligence for async operation polling). + + Default implementation falls back to sync transform_ocr_response. 
async def async_transform_ocr_response(
    self,
    model: str,
    raw_response: httpx.Response,
    logging_obj: LiteLLMLoggingObj,
    **kwargs,
) -> OCRResponse:
    """
    Asynchronous counterpart of ``transform_ocr_response``.

    Providers whose response handling needs to await something (for example
    Azure Document Intelligence, which polls an operation) can override this.
    The base implementation awaits nothing and simply delegates to the
    synchronous ``transform_ocr_response``.

    Args:
        model: Model name
        raw_response: Raw HTTP response
        logging_obj: Logging object
        **kwargs: Forwarded unchanged to ``transform_ocr_response``.

    Returns:
        OCRResponse in standard format
    """
    # No async work by default: hand everything to the sync transform.
    sync_transform = self.transform_ocr_response
    ocr_response = sync_transform(
        model=model,
        raw_response=raw_response,
        logging_obj=logging_obj,
        **kwargs,
    )
    return ocr_response
"https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/" + }, + "azure_ai/doc-intelligence/prebuilt-document": { + "litellm_provider": "azure_ai", + "ocr_cost_per_page": 1e-2, + "mode": "ocr", + "supported_endpoints": [ + "/v1/ocr" + ], + "source": "https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/" + }, "azure_ai/MAI-DS-R1": { "input_cost_per_token": 1.35e-06, "litellm_provider": "azure_ai", diff --git a/litellm/utils.py b/litellm/utils.py index c18ecc4378..8c03e9d505 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7741,12 +7741,16 @@ class ProviderConfigManager: """ Get OCR configuration for a given provider. """ - from litellm.llms.azure_ai.ocr.transformation import AzureAIOCRConfig from litellm.llms.vertex_ai.ocr.transformation import VertexAIOCRConfig + # Special handling for Azure AI - distinguish between Mistral OCR and Document Intelligence + if provider == litellm.LlmProviders.AZURE_AI: + from litellm.llms.azure_ai.ocr.common_utils import get_azure_ai_ocr_config + + return get_azure_ai_ocr_config(model=model) + PROVIDER_TO_CONFIG_MAP = { litellm.LlmProviders.MISTRAL: MistralOCRConfig, - litellm.LlmProviders.AZURE_AI: AzureAIOCRConfig, litellm.LlmProviders.VERTEX_AI: VertexAIOCRConfig, } config_class = PROVIDER_TO_CONFIG_MAP.get(provider, None) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 86af6995c7..b1e672ca1b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -3387,6 +3387,33 @@ ], "source": "https://devblogs.microsoft.com/foundry/whats-new-in-azure-ai-foundry-august-2025/#mistral-document-ai-(ocr)-%E2%80%94-serverless-in-foundry" }, + "azure_ai/doc-intelligence/prebuilt-read": { + "litellm_provider": "azure_ai", + "ocr_cost_per_page": 1.5e-3, + "mode": "ocr", + "supported_endpoints": [ + "/v1/ocr" + ], + "source": "https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/" + }, + 
"azure_ai/doc-intelligence/prebuilt-layout": { + "litellm_provider": "azure_ai", + "ocr_cost_per_page": 1e-2, + "mode": "ocr", + "supported_endpoints": [ + "/v1/ocr" + ], + "source": "https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/" + }, + "azure_ai/doc-intelligence/prebuilt-document": { + "litellm_provider": "azure_ai", + "ocr_cost_per_page": 1e-2, + "mode": "ocr", + "supported_endpoints": [ + "/v1/ocr" + ], + "source": "https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/" + }, "azure_ai/MAI-DS-R1": { "input_cost_per_token": 1.35e-06, "litellm_provider": "azure_ai", diff --git a/provider_endpoints_support.json b/provider_endpoints_support.json index a1a3f5a282..e1aaa2d2ec 100644 --- a/provider_endpoints_support.json +++ b/provider_endpoints_support.json @@ -197,6 +197,23 @@ "ocr": true } }, + "azure_ai/doc-intelligence": { + "display_name": "Azure AI Document Intelligence (`azure_ai/doc-intelligence`)", + "url": "https://docs.litellm.ai/docs/providers/azure_document_intelligence", + "endpoints": { + "chat_completions": false, + "messages": false, + "responses": false, + "embeddings": false, + "image_generations": false, + "audio_transcriptions": false, + "audio_speech": false, + "moderations": false, + "batches": false, + "rerank": false, + "ocr": true + } + }, "azure_text": { "display_name": "Azure Text (`azure_text`)", "url": "https://docs.litellm.ai/docs/providers/azure", diff --git a/tests/ocr_tests/test_ocr_azure_document_intelligence.py b/tests/ocr_tests/test_ocr_azure_document_intelligence.py new file mode 100644 index 0000000000..9c1c9e134d --- /dev/null +++ b/tests/ocr_tests/test_ocr_azure_document_intelligence.py @@ -0,0 +1,44 @@ +""" +Test OCR functionality with Azure Document Intelligence API. + +Azure Document Intelligence provides advanced document analysis capabilities +using the v4.0 (2024-11-30) API. 
import os

import pytest

from base_ocr_unit_tests import BaseOCRTest


class TestAzureDocumentIntelligenceOCR(BaseOCRTest):
    """
    OCR test suite for the ``azure_ai/doc-intelligence/`` provider route.

    All test cases come from BaseOCRTest; this subclass only supplies the
    Azure Document Intelligence call arguments.
    """

    def get_base_ocr_call_args(self) -> dict:
        """
        Return the base OCR call args for Azure Document Intelligence.

        Uses the prebuilt-layout model. Skips the test when either the API key
        or the endpoint environment variable is missing or empty.
        """
        credentials = {
            "api_key": os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_API_KEY"),
            "api_base": os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"),
        }

        # Both values are required; an unset or empty variable skips the test.
        if not all(credentials.values()):
            pytest.skip(
                "AZURE_DOCUMENT_INTELLIGENCE_API_KEY and AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT "
                "environment variables are required for Azure Document Intelligence tests"
            )

        return {
            "model": "azure_ai/doc-intelligence/prebuilt-layout",
            **credentials,
        }