Files
litellm/tests/test_litellm/ocr/test_ocr_file_input.py
T
Noah Nistler d13508c1c5 Enable local file support for OCR (#22133)
* [Docs] Enable local file support

Implemented internal handling for converting file-type documents to the required format for OCR processing, ensuring seamless integration with various providers.

* Refactor OCR file handling and improve security checks

Removed deprecated MIME type mapping and file conversion functions, replacing them with updated implementations. Enhanced security by rejecting 'file' document types in JSON requests, ensuring file uploads are handled via multipart/form-data. Updated tests to reflect these changes and ensure proper functionality.

* Enhance MIME type validation in OCR processing

Added a regular expression check to validate MIME types in the convert_file_document_to_url_document function, raising a ValueError for invalid types. Updated tests to ensure proper error handling for unsupported MIME types.

* Enhance type safety in OCR file handling

Added type casting for the uploaded file in the _parse_multipart_form function to ensure proper handling of UploadFile instances. This change improves type safety and reduces potential runtime errors during file processing.

* Refactor MIME type handling in document uploads

Updated the MIME type extraction logic to strip parameters from the Content-Type header, ensuring only the base type is used. Added tests to verify that MIME parameters are correctly handled and stripped in various scenarios.

* Update OCR documentation for MIME type recommendations and remove unnecessary tips

Clarified the recommended usage of MIME types for raw bytes in document uploads. Simplified the documentation by removing the tip about multipart file uploads from tools like Postman, ensuring a more concise and focused guide.

* Enhance multipart form handling in OCR endpoints

Updated the _parse_multipart_form function to ignore both 'file' and 'document' fields during form parsing, ensuring that the document built from the uploaded file is not overridden. Added a new test to verify that injected document fields do not affect the constructed document, improving security and robustness of the file upload process.
2026-02-27 10:50:02 -08:00

465 lines
17 KiB
Python

"""
Tests for OCR file input support.
Tests that:
1. The SDK document parameter with type="file" correctly converts file paths,
file objects, and raw bytes to base64 data URIs before sending to providers.
2. The proxy _build_document_from_upload helper correctly handles uploaded file bytes.
3. The proxy rejects type="file" documents received via JSON (security guard).
4. The proxy returns user-friendly errors for invalid JSON bodies.
"""
import base64
import os
import tempfile
from io import BytesIO
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock
import orjson
import pytest
from litellm.ocr.main import convert_file_document_to_url_document, get_mime_type
class TestGetMimeType:
    """Checks for the filename-based MIME type lookup done by ``get_mime_type``."""

    def test_should_detect_pdf_mime_type(self):
        """A ``.pdf`` name maps to ``application/pdf``."""
        detected = get_mime_type("document.pdf")
        assert detected == "application/pdf"

    def test_should_detect_png_mime_type(self):
        """A ``.png`` name maps to ``image/png``."""
        detected = get_mime_type("image.png")
        assert detected == "image/png"

    def test_should_detect_jpg_mime_type(self):
        """A ``.jpg`` name maps to ``image/jpeg``."""
        detected = get_mime_type("photo.jpg")
        assert detected == "image/jpeg"

    def test_should_detect_jpeg_mime_type(self):
        """A ``.jpeg`` name maps to ``image/jpeg``."""
        detected = get_mime_type("photo.jpeg")
        assert detected == "image/jpeg"

    def test_should_detect_gif_mime_type(self):
        """A ``.gif`` name maps to ``image/gif``."""
        detected = get_mime_type("animation.gif")
        assert detected == "image/gif"

    def test_should_detect_webp_mime_type(self):
        """A ``.webp`` name maps to ``image/webp``."""
        detected = get_mime_type("image.webp")
        assert detected == "image/webp"

    def test_should_detect_tiff_mime_type(self):
        """A ``.tiff`` name maps to ``image/tiff``."""
        detected = get_mime_type("scan.tiff")
        assert detected == "image/tiff"

    def test_should_detect_tif_mime_type(self):
        """The short ``.tif`` spelling also maps to ``image/tiff``."""
        detected = get_mime_type("scan.tif")
        assert detected == "image/tiff"

    def test_should_detect_bmp_mime_type(self):
        """A ``.bmp`` name maps to ``image/bmp``."""
        detected = get_mime_type("bitmap.bmp")
        assert detected == "image/bmp"

    def test_should_be_case_insensitive(self):
        """Upper-case file names resolve to the same MIME types."""
        upper_pdf = get_mime_type("DOCUMENT.PDF")
        assert upper_pdf == "application/pdf"
        upper_png = get_mime_type("IMAGE.PNG")
        assert upper_png == "image/png"

    def test_should_fallback_for_unknown_extension(self):
        """An unrecognised extension still yields a string value."""
        assert isinstance(get_mime_type("file.xyz123"), str)
class TestConvertFileDocumentToUrlDocument:
    """Exercise ``convert_file_document_to_url_document`` with each input kind.

    The cases cover string paths, ``pathlib.Path`` objects, raw bytes,
    file-like objects, explicit ``mime_type`` values, and the error paths.
    """

    def test_should_convert_pdf_file_path_to_document_url(self):
        """A string path to a PDF yields type=document_url with a base64 data URI."""
        payload = b"%PDF-1.4 test content"
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            handle.write(payload)
            handle.flush()
        pdf_path = handle.name
        try:
            document = {"type": "file", "file": pdf_path}
            converted = convert_file_document_to_url_document(document)
            assert converted["type"] == "document_url"
            data_uri = converted["document_url"]
            assert data_uri.startswith("data:application/pdf;base64,")
            # The payload after the marker must decode back to the bytes written.
            encoded = converted["document_url"].split(";base64,")[1]
            assert base64.b64decode(encoded) == payload
        finally:
            os.unlink(pdf_path)

    def test_should_convert_image_file_path_to_image_url(self):
        """A string path to a PNG yields type=image_url with a base64 data URI."""
        payload = b"\x89PNG\r\n\x1a\n fake png content"
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
            handle.write(payload)
            handle.flush()
        png_path = handle.name
        try:
            document = {"type": "file", "file": png_path}
            converted = convert_file_document_to_url_document(document)
            assert converted["type"] == "image_url"
            data_uri = converted["image_url"]
            assert data_uri.startswith("data:image/png;base64,")
            encoded = converted["image_url"].split(";base64,")[1]
            assert base64.b64decode(encoded) == payload
        finally:
            os.unlink(png_path)

    def test_should_convert_pathlib_path(self):
        """A ``pathlib.Path`` is accepted in place of a string path."""
        payload = b"test pdf content"
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            handle.write(payload)
            handle.flush()
        pdf_path = Path(handle.name)
        try:
            document = {"type": "file", "file": pdf_path}
            converted = convert_file_document_to_url_document(document)
            assert converted["type"] == "document_url"
            data_uri = converted["document_url"]
            assert data_uri.startswith("data:application/pdf;base64,")
        finally:
            os.unlink(str(pdf_path))

    def test_should_convert_raw_bytes(self):
        """Raw bytes without a ``mime_type`` still produce a document_url data URI."""
        payload = b"raw bytes content"
        document = {"type": "file", "file": payload}
        converted = convert_file_document_to_url_document(document)
        assert converted["type"] == "document_url"
        assert "base64," in converted["document_url"]
        encoded = converted["document_url"].split(";base64,")[1]
        assert base64.b64decode(encoded) == payload

    def test_should_convert_raw_bytes_with_explicit_mime_type(self):
        """Raw bytes plus ``mime_type`` use that MIME type in the data URI."""
        payload = b"raw pdf content"
        document = {"type": "file", "file": payload, "mime_type": "application/pdf"}
        converted = convert_file_document_to_url_document(document)
        assert converted["type"] == "document_url"
        data_uri = converted["document_url"]
        assert data_uri.startswith("data:application/pdf;base64,")

    def test_should_convert_raw_bytes_with_image_mime_type(self):
        """Raw bytes with an image ``mime_type`` yield type=image_url."""
        payload = b"raw image content"
        document = {"type": "file", "file": payload, "mime_type": "image/jpeg"}
        converted = convert_file_document_to_url_document(document)
        assert converted["type"] == "image_url"
        data_uri = converted["image_url"]
        assert data_uri.startswith("data:image/jpeg;base64,")

    def test_should_convert_file_like_object(self):
        """A nameless ``BytesIO`` stream is converted to a document_url."""
        payload = b"file-like content"
        stream = BytesIO(payload)
        document = {"type": "file", "file": stream}
        converted = convert_file_document_to_url_document(document)
        assert converted["type"] == "document_url"
        assert "base64," in converted["document_url"]

    def test_should_convert_file_like_object_with_name(self):
        """A stream carrying a ``.name`` attribute gets its MIME type from that name."""
        payload = b"file-like png content"
        stream = BytesIO(payload)
        stream.name = "test_image.png"
        document = {"type": "file", "file": stream}
        converted = convert_file_document_to_url_document(document)
        assert converted["type"] == "image_url"
        data_uri = converted["image_url"]
        assert data_uri.startswith("data:image/png;base64,")

    def test_should_raise_error_for_missing_file_field(self):
        """A document without the 'file' key raises ValueError."""
        document = {"type": "file"}
        with pytest.raises(ValueError, match="must include a 'file' field"):
            convert_file_document_to_url_document(document)

    def test_should_raise_error_for_nonexistent_file_path(self):
        """A path that does not exist raises FileNotFoundError."""
        document = {"type": "file", "file": "/nonexistent/path/to/file.pdf"}
        with pytest.raises(FileNotFoundError, match="File not found"):
            convert_file_document_to_url_document(document)

    def test_should_raise_error_for_empty_file(self):
        """A zero-byte file on disk raises ValueError."""
        # Nothing is written, so the temporary file stays empty.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            empty_path = handle.name
        try:
            document = {"type": "file", "file": empty_path}
            with pytest.raises(ValueError, match="File is empty"):
                convert_file_document_to_url_document(document)
        finally:
            os.unlink(empty_path)

    def test_should_raise_error_for_unsupported_type(self):
        """A 'file' value of an unsupported type (here an int) raises ValueError."""
        document = {"type": "file", "file": 12345}
        with pytest.raises(ValueError, match="Unsupported file input type"):
            convert_file_document_to_url_document(document)

    def test_should_raise_error_for_invalid_mime_type(self):
        """A ``mime_type`` containing parameters and a newline is rejected."""
        payload = b"some content"
        document = {
            "type": "file",
            "file": payload,
            "mime_type": "text/html; charset=utf-8\nX-Injected: true",
        }
        with pytest.raises(ValueError, match="Invalid MIME type"):
            convert_file_document_to_url_document(document)

    def test_should_override_mime_type_for_file_path(self):
        """An explicit ``mime_type`` wins over the type implied by the file extension."""
        payload = b"some content"
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            handle.write(payload)
            handle.flush()
        pdf_path = handle.name
        try:
            document = {"type": "file", "file": pdf_path, "mime_type": "image/png"}
            converted = convert_file_document_to_url_document(document)
            assert converted["type"] == "image_url"
            data_uri = converted["image_url"]
            assert data_uri.startswith("data:image/png;base64,")
        finally:
            os.unlink(pdf_path)
class TestBuildDocumentFromUpload:
    """Checks for the proxy helper that turns uploaded file bytes into a document dict."""

    @pytest.fixture(autouse=True)
    def _import_helper(self):
        """Bind the proxy helper on the instance, or skip when it cannot be imported."""
        try:
            from litellm.proxy.ocr_endpoints.endpoints import (
                _build_document_from_upload,
            )
        except ImportError:
            pytest.skip("Proxy dependencies (fastapi/orjson) not installed")
        else:
            self._build = _build_document_from_upload

    def test_should_build_document_url_for_pdf(self):
        """A PDF upload becomes a document_url whose payload round-trips."""
        payload = b"%PDF-1.4 test content"
        built = self._build(
            file_content=payload,
            filename="document.pdf",
            content_type="application/pdf",
        )
        assert built["type"] == "document_url"
        data_uri = built["document_url"]
        assert data_uri.startswith("data:application/pdf;base64,")
        encoded = built["document_url"].split(";base64,")[1]
        assert base64.b64decode(encoded) == payload

    def test_should_build_image_url_for_png(self):
        """A PNG upload becomes an image_url data URI."""
        payload = b"\x89PNG fake png"
        built = self._build(
            file_content=payload,
            filename="screenshot.png",
            content_type="image/png",
        )
        assert built["type"] == "image_url"
        data_uri = built["image_url"]
        assert data_uri.startswith("data:image/png;base64,")

    def test_should_build_image_url_for_jpeg(self):
        """A JPEG upload becomes an image_url data URI."""
        payload = b"\xff\xd8\xff fake jpeg"
        built = self._build(
            file_content=payload,
            filename="photo.jpg",
            content_type="image/jpeg",
        )
        assert built["type"] == "image_url"
        data_uri = built["image_url"]
        assert data_uri.startswith("data:image/jpeg;base64,")

    def test_should_detect_mime_from_filename_when_content_type_is_octet_stream(self):
        """A generic octet-stream content type defers to the file name."""
        payload = b"pdf content"
        built = self._build(
            file_content=payload,
            filename="report.pdf",
            content_type="application/octet-stream",
        )
        assert built["type"] == "document_url"
        data_uri = built["document_url"]
        assert data_uri.startswith("data:application/pdf;base64,")

    def test_should_detect_mime_from_filename_when_content_type_is_none(self):
        """With no content type, the MIME type comes from the file name."""
        payload = b"png content"
        built = self._build(
            file_content=payload,
            filename="image.png",
            content_type=None,
        )
        assert built["type"] == "image_url"
        data_uri = built["image_url"]
        assert data_uri.startswith("data:image/png;base64,")

    def test_should_fallback_to_octet_stream_for_unknown(self):
        """With neither file name nor content type, octet-stream is used."""
        payload = b"unknown content"
        built = self._build(
            file_content=payload,
            filename=None,
            content_type=None,
        )
        assert built["type"] == "document_url"
        assert "application/octet-stream" in built["document_url"]

    def test_should_preserve_base64_content_correctly(self):
        """Bytes including NUL and 0xff survive the base64 round trip."""
        payload = b"Hello, World! \x00\x01\x02\xff"
        built = self._build(
            file_content=payload,
            filename="test.pdf",
            content_type="application/pdf",
        )
        encoded = built["document_url"].split(";base64,")[1]
        assert base64.b64decode(encoded) == payload

    def test_should_strip_mime_parameters_from_content_type(self):
        """A single Content-Type parameter (charset) is dropped from the data URI."""
        payload = b"%PDF-1.4 test"
        built = self._build(
            file_content=payload,
            filename="doc.pdf",
            content_type="application/pdf; charset=utf-8",
        )
        assert built["type"] == "document_url"
        data_uri = built["document_url"]
        assert data_uri.startswith("data:application/pdf;base64,")

    def test_should_strip_mime_parameters_with_multiple_params(self):
        """Several Content-Type parameters are all dropped from the data URI."""
        payload = b"image data"
        built = self._build(
            file_content=payload,
            filename="img.png",
            content_type="image/png; charset=utf-8; boundary=something",
        )
        assert built["type"] == "image_url"
        data_uri = built["image_url"]
        assert data_uri.startswith("data:image/png;base64,")
class TestProxySecurityGuard:
    """Request-parsing guards on the proxy OCR endpoints.

    Verifies that JSON bodies carrying a type='file' document are rejected,
    that invalid JSON surfaces as a ValueError, and that a 'document'
    multipart field cannot replace the document built from the uploaded file.
    """

    @pytest.fixture(autouse=True)
    def _import_helpers(self):
        """Bind the proxy parsing helpers on the instance, or skip when unavailable."""
        try:
            from litellm.proxy.ocr_endpoints.endpoints import (
                _parse_multipart_form,
                _parse_ocr_request,
            )
        except ImportError:
            pytest.skip("Proxy dependencies (fastapi/orjson) not installed")
        else:
            self._parse = _parse_ocr_request
            self._parse_multipart = _parse_multipart_form

    @staticmethod
    def _json_request(raw_body):
        """Return a mocked request with a JSON content type and ``raw_body`` as its body."""
        request = MagicMock()
        request.headers = {"content-type": "application/json"}
        request.body = AsyncMock(return_value=raw_body)
        request._form = None
        return request

    @pytest.mark.asyncio
    async def test_should_reject_file_type_document_in_json_body(self):
        """A JSON body with a type='file' document pointing at a server path is refused."""
        payload = {
            "model": "mistral/mistral-ocr-latest",
            "document": {"type": "file", "file": "/etc/passwd"},
        }
        request = self._json_request(orjson.dumps(payload))
        with pytest.raises(ValueError, match="not supported through the JSON API"):
            await self._parse(request)

    @pytest.mark.asyncio
    async def test_should_accept_document_url_type_in_json_body(self):
        """A JSON body with a type='document_url' document is parsed without error."""
        payload = {
            "model": "mistral/mistral-ocr-latest",
            "document": {
                "type": "document_url",
                "document_url": "https://example.com/doc.pdf",
            },
        }
        request = self._json_request(orjson.dumps(payload))
        parsed = await self._parse(request)
        assert parsed["document"]["type"] == "document_url"

    @pytest.mark.asyncio
    async def test_should_raise_on_invalid_json_body(self):
        """A body that is not valid JSON raises ValueError with a readable message."""
        request = self._json_request(b"not valid json{{{")
        with pytest.raises(ValueError, match="Invalid JSON in request body"):
            await self._parse(request)

    @pytest.mark.asyncio
    async def test_should_ignore_document_form_field_injection(self):
        """An injected 'document' form field does not replace the uploaded file's document."""
        from starlette.datastructures import UploadFile

        pdf_bytes = b"%PDF-1.4 legit content"
        uploaded = UploadFile(filename="legit.pdf", file=BytesIO(pdf_bytes))
        # The form also carries a hostile 'document' field next to the real upload.
        form_fields = {
            "file": uploaded,
            "model": "mistral/mistral-ocr-latest",
            "document": '{"type": "file", "file": "/etc/passwd"}',
        }
        request = MagicMock()
        request.headers = {"content-type": "multipart/form-data; boundary=---"}
        request.form = AsyncMock(return_value=form_fields)

        parsed = await self._parse_multipart(request)

        assert parsed["document"]["type"] == "document_url"
        data_uri = parsed["document"]["document_url"]
        assert data_uri.startswith("data:application/pdf;base64,")
        assert parsed["model"] == "mistral/mistral-ocr-latest"