diff --git a/litellm/translation/CLAUDE.md b/litellm/translation/CLAUDE.md index d1d1f2e11f..c29beeb581 100644 --- a/litellm/translation/CLAUDE.md +++ b/litellm/translation/CLAUDE.md @@ -148,6 +148,38 @@ translation/ │ │ └── claude.py # anthropic serializer/parsers re-exported; NO │ │ # response-format model spoof (v1 maps with the │ │ # real model); billing-header blocks fail closed +│ ├── compat_sdk/ # the wave-1a SDK-path openai-compat family in ONE +│ │ │ # subpackage (the google_genai "one family, +│ │ │ # parameterized" precedent): together_ai, +│ │ │ # cerebras, nvidia_nim, lm_studio, llamafile, +│ │ │ # lambda_ai, nebius, novita, wandb, +│ │ │ # featherless_ai, nscale, hyperbolic, volcengine. +│ │ │ # All ride v1's big openai elif into the SDK, so +│ │ │ # the body is openai_compat.assemble_body after +│ │ │ # per-provider gates; the response parser is +│ │ │ # openai_compat's verbatim (same live normalizer) +│ │ │ # and the {provider}/{wire_model} re-prefix is +│ │ │ # the SEAM's preset arm, never parser scope; +│ │ │ # streams are the "openai" chunk dialect (pinned +│ │ │ # per provider by wrapper replays). baseten is +│ │ │ # DELIBERATELY ABSENT: its streams ride a +│ │ │ # dedicated legacy wrapper branch +│ │ │ # (handle_baseten_chunk), so it stays a typed v1 +│ │ │ # fallback (canary-pinned). 
+│ │ ├── params.py # per-provider supported-list truths as pure gates +│ │ │ # (v1 RAISES-unless-drop_params on anything off +│ │ │ # the list); capability gates read deps over the +│ │ │ # LOAD-BEARING {provider}/{model} map keys +│ │ │ # (together_ai function calling, cerebras +│ │ │ # reasoning); nvidia_nim's static per-model table +│ │ ├── serialize.py # frozen CompatProfile per provider (mct rename / +│ │ │ # user emission / together's rf-text drop / +│ │ │ # cerebras reasoning_effort) -> gates -> +│ │ │ # openai_compat assemble_body -> deltas +│ │ └── guard.py # explicit stream:false (the SDK serializes the +│ │ # key; absent-vs-false is lost in the IR), then +│ │ # the shared openai guard with the full +│ │ # message-name fallback (nobody here strips names) │ └── xai/ # Grok over openai_compat (httpx path: NO model │ │ # prefix anywhere, transform_response is LIVE): │ ├── guard.py # web_search_options (v1's Responses-bridge reroute @@ -300,10 +332,13 @@ A behavior change ships as its own snapshot-diffed PR, never inside a port. 
## Current scope -OpenAI-chat-in to eleven providers out — `anthropic`, `bedrock_converse`, -`bedrock_invoke`, `openai_compat`, `vertex_ai` (gemini route), `gemini` -(AI Studio), `vertex_anthropic`, `azure`, `azure_ai`, -`azure_ai_anthropic`, `xai` — request, response, and stream translation, +OpenAI-chat-in to twenty-four providers out — `anthropic`, +`bedrock_converse`, `bedrock_invoke`, `openai_compat`, `vertex_ai` (gemini +route), `gemini` (AI Studio), `vertex_anthropic`, `azure`, `azure_ai`, +`azure_ai_anthropic`, `xai`, and the thirteen wave-1a compat_sdk providers +(`together_ai`, `cerebras`, `nvidia_nim`, `lm_studio`, `llamafile`, +`lambda_ai`, `nebius`, `novita`, `wandb`, `featherless_ai`, `nscale`, +`hyperbolic`, `volcengine`) — request, response, and stream translation, differential-green (anthropic: 46-shape corpus + responses + stream replays; bedrock and google: the characterization corpus per route + quirk corpora; openai: 17-shape request corpus + 17 typed-fallback rows + @@ -314,7 +349,12 @@ azure_ai: the Foundry override-set and no-spoof Claude-route corpora; xai: a 21-shape generated characterization corpus — provenance v1 in-process at HEAD, zero recorded vendor fixtures exist — two-sided over `tests/test_litellm/translation/characterization_xai/` plus 7 rows pinning -v1's UnsupportedParamsError raises and the line-seam stream replays), +v1's UnsupportedParamsError raises and the line-seam stream replays; +compat_sdk: per-provider generated corpora vs v1 in-process at HEAD +(`test_differential_compat_sdk_{request,response,stream}.py` over +`_compat_sdk_corpus.py` — served rows, UnsupportedParamsError raise rows, +preset-model re-prefix response rows, per-provider wrapper stream replays, +and supported-list mirror drift gates over every model-map row)), fail-closed everywhere else, with non-streaming flag-gated seams live in `completion()` for the anthropic, bedrock, and google routes (the openai/azure seam forks are integrator scope and NOT 
wired; the google @@ -389,7 +429,39 @@ definitions with `strict` keys below the function level (v1 deletes every depth), the openai guard's raw shapes, and the parse-level unknowns v1 passes through for grok (presence/frequency penalties on supported families, seed, logprobs, top_logprobs, logit_bias, n, stream_options, -web_search_options). The xai completion() fork is NOT wired (integrator +web_search_options). +Deliberate compat_sdk (wave-1a) fallback surfaces (each names the v1 +path): every IR-carried param a provider's supported list excludes — v1's +`_check_valid_arg` raises UnsupportedParamsError or drops under +drop_params, so the typed fallback serves v1's own behavior +(max_completion_tokens on nscale/hyperbolic — only hyperbolic has a rename +arm (dead code behind the list gate); tools/tool_choice/response_format on +featherless_ai, on volcengine (response_format only), on together_ai +models without the supports_function_calling map flag, and on +nvidia_nim's reduced static-table models; parallel_tool_calls on +cerebras/featherless_ai/nscale/hyperbolic/volcengine; reasoning_effort +everywhere except capability-flagged cerebras models; response_format on +base-list providers when the model is literally named gpt-4 / +gpt-3.5-turbo-16k); `user` wherever a provider's own list doesn't carry +it (v1's base list gates it on openai model-list membership and silently +drops it otherwise — only cerebras and hyperbolic emit it); +volcengine `thinking` (v1 packs the verbatim dict into extra_body for the +SDK to merge top-level); lm_studio's bare-`schema` response_format wrap +(non-canonical inbound shape, parse rejects it); explicit `stream: false` +(the SDK serializes the key; absent-vs-false is lost in the IR); the +openai guard's raw shapes with the FULL message-name fallback (no config +here strips names); and the parse-level unknowns (seed, penalties, +logprobs, n, logit_bias, stream_options, ...) regardless of whether a +given provider's list serves or raises on them.
The compat_sdk +completion() forks are NOT wired (integrator scope, like openai/azure/ +xai); when they land, the seam must pre-set +`ModelResponse(model=f"{provider}/{model}")` exactly like openai.py: +676-677 so `_to_model_response_openai`'s re-prefix arm reproduces +cdr:699-710 (pinned per provider by the preset-model differential rows), +and baseten must NEVER be added to the family without resolving its +dedicated legacy wrapper stream branch (handle_baseten_chunk — the +test_baseten_drop_canary evidence). +The xai completion() fork is NOT wired (integrator scope, like openai/azure); when it lands these are HARD OBLIGATIONS, not notes: the in-package `use_xai_oauth` guard arm is defense-in-depth ONLY and unreachable through `_raw_openai_body` (use_xai_oauth is a litellm diff --git a/litellm/translation/dispatch.py b/litellm/translation/dispatch.py index ecc15b72af..bbc0cc0542 100644 --- a/litellm/translation/dispatch.py +++ b/litellm/translation/dispatch.py @@ -32,6 +32,23 @@ Provider = Literal[ "gemini", "vertex_anthropic", "xai", + # wave-1a: the SDK-path openai-compat family (providers/compat_sdk). + # baseten is deliberately ABSENT: its streams ride a dedicated legacy + # CustomStreamWrapper branch (handle_baseten_chunk), not the openai + # dialect, so it stays a typed v1 fallback (wave1a-port.md). + "together_ai", + "cerebras", + "nvidia_nim", + "lm_studio", + "llamafile", + "lambda_ai", + "nebius", + "novita", + "wandb", + "featherless_ai", + "nscale", + "hyperbolic", + "volcengine", ] _SAME_FAMILY: frozenset[tuple[InboundSchema, Provider]] = frozenset( @@ -44,6 +61,10 @@ _SAME_FAMILY: frozenset[tuple[InboundSchema, Provider]] = frozenset( # xai is NOT same-family despite speaking openai-chat: v1's transform # touches the body (tools strict strip, non-user message name strip), # so a verbatim fast-path forward would diverge from v1. 
+ # The compat_sdk family is NOT same-family either: their param maps + # touch the body (mct -> max_tokens renames, supported-list raises, + # together's response_format pop), so a verbatim forward would + # diverge from v1's gates. } ) diff --git a/litellm/translation/engine/pipeline.py b/litellm/translation/engine/pipeline.py index eeb09e0b72..14b62a72bd 100644 --- a/litellm/translation/engine/pipeline.py +++ b/litellm/translation/engine/pipeline.py @@ -53,6 +53,24 @@ from ..providers.bedrock_invoke import parse_response as bedrock_invoke_parse_re from ..providers.bedrock_invoke import ( serialize_request as bedrock_invoke_serialize_request, ) +from ..providers.compat_sdk import ( + cerebras_serialize_request, + featherless_ai_serialize_request, + hyperbolic_serialize_request, + lambda_ai_serialize_request, + llamafile_serialize_request, + lm_studio_serialize_request, + nebius_serialize_request, + novita_serialize_request, + nscale_serialize_request, + nvidia_nim_serialize_request, + together_ai_serialize_request, + volcengine_serialize_request, + wandb_serialize_request, +) +from ..providers.compat_sdk import ( + unsupported_request_shapes as compat_sdk_unsupported_request_shapes, +) from ..providers.google_genai import parse_response as google_parse_response from ..providers.google_genai import ( serialize_request_studio as google_serialize_request_studio, @@ -101,6 +119,19 @@ _SERIALIZERS: Mapping[Provider, _Serializer] = MappingProxyType( "azure_ai": azure_ai_serialize_request, "azure_ai_anthropic": azure_ai_claude_serialize_request, "xai": xai_serialize_request, + "together_ai": together_ai_serialize_request, + "cerebras": cerebras_serialize_request, + "nvidia_nim": nvidia_nim_serialize_request, + "lm_studio": lm_studio_serialize_request, + "llamafile": llamafile_serialize_request, + "lambda_ai": lambda_ai_serialize_request, + "nebius": nebius_serialize_request, + "novita": novita_serialize_request, + "wandb": wandb_serialize_request, + "featherless_ai": 
featherless_ai_serialize_request, + "nscale": nscale_serialize_request, + "hyperbolic": hyperbolic_serialize_request, + "volcengine": volcengine_serialize_request, } ) @@ -117,6 +148,23 @@ _RESPONSE_PARSERS: Mapping[Provider, _ResponseParser] = MappingProxyType( "azure_ai": azure_ai_parse_response, "azure_ai_anthropic": azure_ai_claude_parse_response, "xai": xai_parse_response, + # compat_sdk family: the live v1 normalizer is the same + # convert_to_model_response_object the openai parser mirrors; the + # {provider}/{wire_model} re-prefix is the seam's preset arm + # (_to_model_response_openai), not parser scope. + "together_ai": openai_compat_parse_response, + "cerebras": openai_compat_parse_response, + "nvidia_nim": openai_compat_parse_response, + "lm_studio": openai_compat_parse_response, + "llamafile": openai_compat_parse_response, + "lambda_ai": openai_compat_parse_response, + "nebius": openai_compat_parse_response, + "novita": openai_compat_parse_response, + "wandb": openai_compat_parse_response, + "featherless_ai": openai_compat_parse_response, + "nscale": openai_compat_parse_response, + "hyperbolic": openai_compat_parse_response, + "volcengine": openai_compat_parse_response, } ) @@ -133,6 +181,21 @@ _RESPONSE_DIALECTS: Mapping[Provider, ResponseDialect] = MappingProxyType( "azure_ai": "openai", "azure_ai_anthropic": "anthropic", # genuine anthropic wire format "xai": "openai", # httpx path, same normalized wire-body ride + # compat_sdk family: SDK path, default openai wrapper arm (the + # per-provider stream replays pin that no dedicated branch fires) + "together_ai": "openai", + "cerebras": "openai", + "nvidia_nim": "openai", + "lm_studio": "openai", + "llamafile": "openai", + "lambda_ai": "openai", + "nebius": "openai", + "novita": "openai", + "wandb": "openai", + "featherless_ai": "openai", + "nscale": "openai", + "hyperbolic": "openai", + "volcengine": "openai", } ) @@ -151,6 +214,19 @@ _RAW_GUARDS: Mapping[Provider, _RawGuard] = MappingProxyType( 
"vertex_ai": google_unsupported_request_shapes, "gemini": google_unsupported_request_shapes, "xai": xai_unsupported_request_shapes, + "together_ai": compat_sdk_unsupported_request_shapes, + "cerebras": compat_sdk_unsupported_request_shapes, + "nvidia_nim": compat_sdk_unsupported_request_shapes, + "lm_studio": compat_sdk_unsupported_request_shapes, + "llamafile": compat_sdk_unsupported_request_shapes, + "lambda_ai": compat_sdk_unsupported_request_shapes, + "nebius": compat_sdk_unsupported_request_shapes, + "novita": compat_sdk_unsupported_request_shapes, + "wandb": compat_sdk_unsupported_request_shapes, + "featherless_ai": compat_sdk_unsupported_request_shapes, + "nscale": compat_sdk_unsupported_request_shapes, + "hyperbolic": compat_sdk_unsupported_request_shapes, + "volcengine": compat_sdk_unsupported_request_shapes, } ) diff --git a/litellm/translation/providers/compat_sdk/__init__.py b/litellm/translation/providers/compat_sdk/__init__.py new file mode 100644 index 0000000000..81ef51cd79 --- /dev/null +++ b/litellm/translation/providers/compat_sdk/__init__.py @@ -0,0 +1,35 @@ +from ..openai_compat.response import parse_response +from .guard import unsupported_request_shapes +from .serialize import ( + cerebras_serialize_request, + featherless_ai_serialize_request, + hyperbolic_serialize_request, + lambda_ai_serialize_request, + llamafile_serialize_request, + lm_studio_serialize_request, + nebius_serialize_request, + novita_serialize_request, + nscale_serialize_request, + nvidia_nim_serialize_request, + together_ai_serialize_request, + volcengine_serialize_request, + wandb_serialize_request, +) + +__all__ = ( + "cerebras_serialize_request", + "featherless_ai_serialize_request", + "hyperbolic_serialize_request", + "lambda_ai_serialize_request", + "llamafile_serialize_request", + "lm_studio_serialize_request", + "nebius_serialize_request", + "novita_serialize_request", + "nscale_serialize_request", + "nvidia_nim_serialize_request", + "parse_response", + 
"together_ai_serialize_request", + "unsupported_request_shapes", + "volcengine_serialize_request", + "wandb_serialize_request", +) diff --git a/litellm/translation/providers/compat_sdk/guard.py b/litellm/translation/providers/compat_sdk/guard.py new file mode 100644 index 0000000000..a63a2c7f40 --- /dev/null +++ b/litellm/translation/providers/compat_sdk/guard.py @@ -0,0 +1,33 @@ +"""Raw-shape fidelity guard for the SDK-path openai-compat family. + +One family-wide arm before the shared openai guard: an explicit +``stream: false``. On this path ``completion()`` forwards the caller's False +into ``get_optional_params`` (non-default against the ``None`` default), it +lands in optional_params, and the SDK serializes the key onto the wire — +while the IR cannot represent absent-vs-false (verified in-process at HEAD; +the same arm the azure and xai guards carry). + +The openai guard runs with its full message-``name`` fallback: none of the +family configs strips names (only xai does), so v1 forwards ``name`` +verbatim on every role. +""" + +from __future__ import annotations + +from collections.abc import Mapping + +from ...errors import TranslationError +from ..openai_compat.guard import ( + unsupported_request_shapes as openai_unsupported_request_shapes, +) + +_Raw = Mapping[str, object] + + +def unsupported_request_shapes(raw: _Raw) -> TranslationError | None: + if "stream" in raw and raw.get("stream") is False: + return TranslationError.of_unsupported( + "explicit stream: false (the SDK path serializes the key onto " + "the wire; absent-vs-false is lost in the IR)" + ) + return openai_unsupported_request_shapes(raw) diff --git a/litellm/translation/providers/compat_sdk/params.py b/litellm/translation/providers/compat_sdk/params.py new file mode 100644 index 0000000000..86235970c4 --- /dev/null +++ b/litellm/translation/providers/compat_sdk/params.py @@ -0,0 +1,326 @@ +"""Per-provider parameter gates for the SDK-path openai-compat family. 
+ +v1's gate for every provider here is ``_check_valid_arg`` over the provider +config's ``get_supported_openai_params``: an unsupported param RAISES +``UnsupportedParamsError`` unless ``drop_params``, in which case it is popped +BEFORE ``map_openai_params`` runs. v2 mirrors the SUPPORTED-LIST truth as +typed fallbacks (the v2-openai/xai precedent — never re-implement the +raise-vs-drop interplay): every IR-carried param a provider's list excludes +falls back so v1 serves its own raise or drop. Params outside the IR +(seed, penalties, logprobs, n, stream_options, ...) already fall back at the +inbound boundary and never reach these gates. + +The capability reads mirror v1's model-map lookups through +``deps.supports_capability`` over the ``{provider}/{model}`` map key — the +provider prefix is LOAD-BEARING (bare wire models have no model-map rows; +see the xai drift-gate note). Verified in-process at HEAD: +``together_ai/...`` rows answer ``supports_function_calling`` and +``cerebras/...`` rows answer ``supports_reasoning``; the bare keys are False +even for capable models. +""" + +from __future__ import annotations + +from collections.abc import Callable, Mapping +from types import MappingProxyType + +from ...deps import TranslationDeps +from ...ir import ChatRequest +from ..openai_compat.params import unsupported_response_format + +_Present = Callable[[ChatRequest], bool] + +# IR-carried params, checked in a stable order so fallback reasons are +# deterministic. ``max_tokens`` only counts as caller-sent when +# ``max_completion_tokens`` is absent: the inbound parse collapses mct into +# max_tokens and the raw guard rejects requests carrying both keys. +_CHECKS: tuple[tuple[str, _Present], ...] 
= ( + ( + "max_tokens", + lambda r: r.params.max_tokens.is_some() + and r.params.max_completion_tokens.is_none(), + ), + ("max_completion_tokens", lambda r: r.params.max_completion_tokens.is_some()), + ("temperature", lambda r: r.params.temperature.is_some()), + ("top_p", lambda r: r.params.top_p.is_some()), + ("top_k", lambda r: r.params.top_k.is_some()), + ("stream", lambda r: r.stream), + ("stop", lambda r: len(r.params.stop) > 0), + ("tools", lambda r: len(r.tools) > 0), + ("tool_choice", lambda r: r.tool_choice.is_some()), + ("parallel_tool_calls", lambda r: r.parallel_tool_calls.is_some()), + ("response_format", lambda r: r.response_format.is_some()), + ("user", lambda r: r.user.is_some()), + ("reasoning_effort", lambda r: r.reasoning_effort.is_some()), + ("thinking", lambda r: r.thinking.is_some()), +) + +_NO_NOTES: Mapping[str, str] = MappingProxyType({}) + + +def unsupported_against( + request: ChatRequest, + *, + provider: str, + allowed: frozenset[str], + notes: Mapping[str, str] = _NO_NOTES, +) -> str | None: + for key, present in _CHECKS: + if not present(request) or key in allowed: + continue + note = notes.get(key) + if note is not None: + return note + return ( + f"{key} on {provider}: outside v1's supported list; " + "get_optional_params raises UnsupportedParamsError " + "(or drops it under drop_params)" + ) + return None + + +def _user_note(provider: str) -> str: + return ( + f"user on {provider}: gated on litellm.open_ai_chat_completion_models " + "membership in v1's base supported list; v1 handles it" + ) + + +# OpenAIGPTConfig's base list restricted to IR-carried keys. ``user`` is +# deliberately absent (model-list gated in v1) and ``response_format`` rides +# the base list's gpt-4/gpt-3.5-turbo-16k name gate, applied per provider +# below for the configs that inherit the base list. 
+_BASE_LIST = frozenset( + { + "max_tokens", + "max_completion_tokens", + "temperature", + "top_p", + "stream", + "stop", + "tools", + "tool_choice", + "parallel_tool_calls", + "response_format", + } +) + +_FUNCTION_CALLING_KEYS = frozenset({"tools", "tool_choice", "response_format"}) + + +def base_list_unsupported( + request: ChatRequest, deps: TranslationDeps, provider: str +) -> str | None: + """llamafile / novita / lm_studio (plain base config) and lambda_ai / + nebius / wandb (base list + mct rename, applied in serialize).""" + return unsupported_against( + request, + provider=provider, + allowed=_BASE_LIST, + notes={"user": _user_note(provider)}, + ) or unsupported_response_format(request) + + +def supports_together_tools(model: str, deps: TranslationDeps) -> bool: + return deps.supports_capability(f"together_ai/{model}", "supports_function_calling") + + +def together_ai_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None: + """TogetherAIConfig removes tools/tool_choice/response_format from the + base list unless ``supports_function_calling(model, "together_ai")`` is + True (together_ai/chat.py); parallel_tool_calls stays supported either + way (v1 truth, not an oversight here).""" + allowed = ( + _BASE_LIST + if supports_together_tools(request.model, deps) + else _BASE_LIST - _FUNCTION_CALLING_KEYS + ) + return unsupported_against( + request, + provider="together_ai", + allowed=allowed, + notes={"user": _user_note("together_ai")}, + ) or unsupported_response_format(request) + + +_CEREBRAS_LIST = frozenset( + { + "max_tokens", + "max_completion_tokens", + "temperature", + "top_p", + "stream", + "stop", + "tools", + "tool_choice", + "response_format", + "user", + } +) + + +def supports_cerebras_reasoning(model: str, deps: TranslationDeps) -> bool: + return deps.supports_capability(f"cerebras/{model}", "supports_reasoning") + + +def cerebras_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None: + allowed = ( + _CEREBRAS_LIST 
| {"reasoning_effort"} + if supports_cerebras_reasoning(request.model, deps) + else _CEREBRAS_LIST + ) + return unsupported_against( + request, + provider="cerebras", + allowed=allowed, + notes={ + "reasoning_effort": ( + f"reasoning_effort on non-reasoning cerebras model {request.model} " + "(model-map supports_reasoning gate); v1 raises or drops it" + ) + }, + ) + + +# NvidiaNimConfig's static per-model allowlists (nvidia_nim/chat/ +# transformation.py), restricted to IR-carried keys; the drift gate re-derives +# them from the v1 config at HEAD. +_NVIDIA_GEMMA_MODELS = frozenset( + { + "google/recurrentgemma-2b", + "google/gemma-2-27b-it", + "google/gemma-2-9b-it", + "gemma-2-9b-it", + } +) +_NVIDIA_GEMMA_LIST = frozenset({"stream", "temperature", "top_p", "max_tokens", "stop"}) +_NVIDIA_NEMOTRON_INSTRUCT_LIST = frozenset( + {"stream", "temperature", "top_p", "max_tokens", "max_completion_tokens"} +) +_NVIDIA_REWARD_LIST = frozenset({"stream"}) +_NVIDIA_CODEGEMMA_LIST = frozenset( + {"stream", "temperature", "top_p", "max_tokens", "max_completion_tokens", "stop"} +) +_NVIDIA_DEFAULT_LIST = frozenset( + { + "stream", + "temperature", + "top_p", + "max_tokens", + "max_completion_tokens", + "stop", + "tools", + "tool_choice", + "parallel_tool_calls", + "response_format", + } +) + + +def nvidia_nim_allowed(model: str) -> frozenset[str]: + if model in _NVIDIA_GEMMA_MODELS: + return _NVIDIA_GEMMA_LIST + if model == "nvidia/nemotron-4-340b-instruct": + return _NVIDIA_NEMOTRON_INSTRUCT_LIST + if model == "nvidia/nemotron-4-340b-reward": + return _NVIDIA_REWARD_LIST + if model == "google/codegemma-1.1-7b": + return _NVIDIA_CODEGEMMA_LIST + return _NVIDIA_DEFAULT_LIST + + +def nvidia_nim_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None: + return unsupported_against( + request, provider="nvidia_nim", allowed=nvidia_nim_allowed(request.model) + ) + + +_FEATHERLESS_LIST = frozenset( + { + "max_tokens", + "max_completion_tokens", + "temperature", + 
"top_p", + "stream", + "stop", + } +) + + +def featherless_ai_unsupported( + request: ChatRequest, deps: TranslationDeps +) -> str | None: + """tools and tool_choice are outside FeatherlessAIConfig's supported + list, so ``_check_valid_arg`` raises before the map's tool_choice + auto/none arm can run — that arm is dead code (the xai R2 pattern, + verified in-process at HEAD).""" + return unsupported_against( + request, provider="featherless_ai", allowed=_FEATHERLESS_LIST + ) + + +_NSCALE_LIST = frozenset( + {"max_tokens", "temperature", "top_p", "stream", "stop", "response_format"} +) + + +def nscale_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None: + """NscaleConfig inherits the BASE map (no mct rename), and mct is outside + its supported list, so max_completion_tokens RAISES (verified at HEAD).""" + return unsupported_against(request, provider="nscale", allowed=_NSCALE_LIST) + + +_HYPERBOLIC_LIST = frozenset( + { + "max_tokens", + "temperature", + "top_p", + "stream", + "stop", + "tools", + "tool_choice", + "response_format", + "user", + } +) + + +def hyperbolic_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None: + """max_completion_tokens is outside HyperbolicChatConfig's own list, so + OpenAILikeChatConfig's rename arm is dead code — v1 raises (verified at + HEAD; the xai R2 trap again).""" + return unsupported_against(request, provider="hyperbolic", allowed=_HYPERBOLIC_LIST) + + +_VOLCENGINE_LIST = frozenset( + { + "max_tokens", + "max_completion_tokens", + "temperature", + "top_p", + "stream", + "stop", + "tools", + "tool_choice", + } +) + + +def volcengine_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None: + """response_format is OUTSIDE VolcEngineChatConfig's supported list + (v1 raises, verified at HEAD). 
``thinking`` IS supported in v1 but its + map packs the verbatim dict into ``extra_body`` for the SDK to merge + top-level — an unported crossing, so it falls back typed.""" + return unsupported_against( + request, + provider="volcengine", + allowed=_VOLCENGINE_LIST, + notes={ + "thinking": ( + "thinking on volcengine: v1 packs the verbatim dict into " + "extra_body (VolcEngineChatConfig.map_openai_params) and the " + "SDK merges it top-level; that crossing is unported, v1 " + "serves it" + ) + }, + ) diff --git a/litellm/translation/providers/compat_sdk/serialize.py b/litellm/translation/providers/compat_sdk/serialize.py new file mode 100644 index 0000000000..a6a622e2ef --- /dev/null +++ b/litellm/translation/providers/compat_sdk/serialize.py @@ -0,0 +1,229 @@ +"""Serializers for the SDK-path openai-compat family (wave 1a). + +Every provider here rides v1's big openai elif (main.py:2646-2667) into the +OpenAI SDK: ``get_optional_params`` runs the provider config's param gates, +``provider_config.transform_request`` (openai.py:727) runs the inherited +base five-touch assembly, and none of the family overrides it. The v2 body +is therefore ``openai_compat.assemble_body`` after the provider's gates, +plus at most three of the four mechanical deltas below, captured per +provider in a frozen ``CompatProfile``: + +- ``rename_max_completion_tokens``: the configs whose map renames mct -> + max_tokens (cerebras/nvidia_nim/nebius/wandb/featherless_ai and the + OpenAILike-based lambda_ai/volcengine). The IR already collapses mct into + ``max_tokens``, so the rename is emitting that collapsed key. +- ``emit_user``: providers whose own supported list carries ``user`` + unconditionally (cerebras, hyperbolic) — a typed fallback everywhere else. +- ``drop_text_response_format``: together_ai's map pops a verbatim + ``{"type": "text"}`` response_format. +- ``emit_reasoning_effort``: cerebras, gated per model in params.py before + emission ever happens.
+ +The response side needs NO per-provider code: the live v1 normalizer is the +same ``convert_to_model_response_object`` the openai_compat parser mirrors, +and the ``{provider}/{wire_model}`` model re-prefix (openai.py:676-677 + +cdr:699-710) is the SEAM's ``_to_model_response_openai`` preset arm, pinned +per provider by the differential's preset-model rows. Streams ride the +default openai wrapper arm -> the ``"openai"`` chunk dialect (baseten is the +one would-be member that does NOT, and is dropped from the wave for it). +""" + +from __future__ import annotations + +from collections.abc import Callable +from dataclasses import dataclass + +from expression import Error, Result + +from ...deps import TranslationDeps +from ...errors import TranslationError +from ...ir import Body, ChatRequest, PlainJson +from ..openai_compat.serialize import assemble_body +from . import params as p + +_SerializeResult = Result[Body, TranslationError] +_GateFn = Callable[[ChatRequest, TranslationDeps], str | None] + + +@dataclass(frozen=True) +class CompatProfile: + provider: str + unsupported: _GateFn + rename_max_completion_tokens: bool = False + emit_user: bool = False + drop_text_response_format: bool = False + emit_reasoning_effort: bool = False + + +def serialize_with_profile( + request: ChatRequest, deps: TranslationDeps, profile: CompatProfile +) -> _SerializeResult: + reason = profile.unsupported(request, deps) + if reason is not None: + return Error(TranslationError.of_unsupported(reason)) + return assemble_body(request).map(lambda body: _with_deltas(body, request, profile)) + + +def _with_deltas(body: Body, request: ChatRequest, profile: CompatProfile) -> Body: + if profile.rename_max_completion_tokens and "max_completion_tokens" in body: + collapsed = request.params.max_tokens.default_value(None) + body = { + **{k: v for k, v in body.items() if k != "max_completion_tokens"}, + "max_tokens": collapsed, + } + if profile.drop_text_response_format and 
body.get("response_format") == { + "type": "text" + }: + body = {k: v for k, v in body.items() if k != "response_format"} + extras: dict[str, PlainJson] = {} + user = request.user.default_value(None) if profile.emit_user else None + if user is not None: + extras = {**extras, "user": user} + effort = ( + request.reasoning_effort.default_value(None) + if profile.emit_reasoning_effort + else None + ) + if effort is not None: + extras = {**extras, "reasoning_effort": effort} + return {**body, **extras} if extras else body + + +def _base_list_gate(provider: str) -> _GateFn: + def gate(request: ChatRequest, deps: TranslationDeps) -> str | None: + return p.base_list_unsupported(request, deps, provider) + + return gate + + +TOGETHER_AI = CompatProfile( + provider="together_ai", + unsupported=p.together_ai_unsupported, + drop_text_response_format=True, +) +CEREBRAS = CompatProfile( + provider="cerebras", + unsupported=p.cerebras_unsupported, + rename_max_completion_tokens=True, + emit_user=True, + emit_reasoning_effort=True, +) +NVIDIA_NIM = CompatProfile( + provider="nvidia_nim", + unsupported=p.nvidia_nim_unsupported, + rename_max_completion_tokens=True, +) +LM_STUDIO = CompatProfile( + provider="lm_studio", unsupported=_base_list_gate("lm_studio") +) +LLAMAFILE = CompatProfile( + provider="llamafile", unsupported=_base_list_gate("llamafile") +) +LAMBDA_AI = CompatProfile( + provider="lambda_ai", + unsupported=_base_list_gate("lambda_ai"), + rename_max_completion_tokens=True, +) +NEBIUS = CompatProfile( + provider="nebius", + unsupported=_base_list_gate("nebius"), + rename_max_completion_tokens=True, +) +NOVITA = CompatProfile(provider="novita", unsupported=_base_list_gate("novita")) +WANDB = CompatProfile( + provider="wandb", + unsupported=_base_list_gate("wandb"), + rename_max_completion_tokens=True, +) +FEATHERLESS_AI = CompatProfile( + provider="featherless_ai", + unsupported=p.featherless_ai_unsupported, + rename_max_completion_tokens=True, +) +NSCALE = 
CompatProfile(provider="nscale", unsupported=p.nscale_unsupported) +HYPERBOLIC = CompatProfile( + provider="hyperbolic", unsupported=p.hyperbolic_unsupported, emit_user=True +) +VOLCENGINE = CompatProfile( + provider="volcengine", + unsupported=p.volcengine_unsupported, + rename_max_completion_tokens=True, +) + + +def together_ai_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, TOGETHER_AI) + + +def cerebras_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, CEREBRAS) + + +def nvidia_nim_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, NVIDIA_NIM) + + +def lm_studio_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, LM_STUDIO) + + +def llamafile_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, LLAMAFILE) + + +def lambda_ai_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, LAMBDA_AI) + + +def nebius_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, NEBIUS) + + +def novita_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, NOVITA) + + +def wandb_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, WANDB) + + +def featherless_ai_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, FEATHERLESS_AI) + + +def nscale_serialize_request( 
+ request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, NSCALE) + + +def hyperbolic_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, HYPERBOLIC) + + +def volcengine_serialize_request( + request: ChatRequest, deps: TranslationDeps +) -> _SerializeResult: + return serialize_with_profile(request, deps, VOLCENGINE)