mirror of
https://github.com/tiennm99/litellm.git
synced 2026-07-04 23:07:50 +00:00
feat(translation): port the 13 wave-1a SDK-path openai-compat providers via one compat_sdk family
together_ai, cerebras, nvidia_nim, lm_studio, llamafile, lambda_ai, nebius,
novita, wandb, featherless_ai, nscale, hyperbolic, volcengine. One family
subpackage (the google_genai one-family precedent): per-provider supported
list truths as pure gates (v1 raises UnsupportedParamsError unless
drop_params on anything off the list, so exclusions are typed fallbacks),
frozen CompatProfile deltas (mct rename, user emission, together's rf-text
drop, cerebras capability-gated reasoning_effort over the load-bearing
cerebras/{model} map key), openai_compat assemble_body for the body and
parse_response verbatim for responses; the {provider}/{wire_model}
re-prefix stays the seam's preset arm. Family raw guard adds the explicit
stream:false arm (the SDK serializes the key; absent-vs-false is lost in
the IR). baseten is dropped from the wave: its streams ride the dedicated
legacy handle_baseten_chunk wrapper branch, not the openai dialect, so it
stays an unregistered typed v1 fallback
This commit is contained in:
@@ -148,6 +148,38 @@ translation/
|
||||
│ │ └── claude.py # anthropic serializer/parsers re-exported; NO
|
||||
│ │ # response-format model spoof (v1 maps with the
|
||||
│ │ # real model); billing-header blocks fail closed
|
||||
│ ├── compat_sdk/ # the wave-1a SDK-path openai-compat family in ONE
|
||||
│ │ │ # subpackage (the google_genai "one family,
|
||||
│ │ │ # parameterized" precedent): together_ai,
|
||||
│ │ │ # cerebras, nvidia_nim, lm_studio, llamafile,
|
||||
│ │ │ # lambda_ai, nebius, novita, wandb,
|
||||
│ │ │ # featherless_ai, nscale, hyperbolic, volcengine.
|
||||
│ │ │ # All ride v1's big openai elif into the SDK, so
|
||||
│ │ │ # the body is openai_compat.assemble_body after
|
||||
│ │ │ # per-provider gates; the response parser is
|
||||
│ │ │ # openai_compat's verbatim (same live normalizer)
|
||||
│ │ │ # and the {provider}/{wire_model} re-prefix is
|
||||
│ │ │ # the SEAM's preset arm, never parser scope;
|
||||
│ │ │ # streams are the "openai" chunk dialect (pinned
|
||||
│ │ │ # per provider by wrapper replays). baseten is
|
||||
│ │ │ # DELIBERATELY ABSENT: its streams ride a
|
||||
│ │ │ # dedicated legacy wrapper branch
|
||||
│ │ │ # (handle_baseten_chunk), so it stays a typed v1
|
||||
│ │ │ # fallback (canary-pinned).
|
||||
│ │ ├── params.py # per-provider supported-list truths as pure gates
|
||||
│ │ │ # (v1 RAISES-unless-drop_params on anything off
|
||||
│ │ │ # the list); capability gates read deps over the
|
||||
│ │ │ # LOAD-BEARING {provider}/{model} map keys
|
||||
│ │ │ # (together_ai function calling, cerebras
|
||||
│ │ │ # reasoning); nvidia_nim's static per-model table
|
||||
│ │ ├── serialize.py # frozen CompatProfile per provider (mct rename /
|
||||
│ │ │ # user emission / together's rf-text drop /
|
||||
│ │ │ # cerebras reasoning_effort) -> gates ->
|
||||
│ │ │ # openai_compat assemble_body -> deltas
|
||||
│ │ └── guard.py # explicit stream:false (the SDK serializes the
|
||||
│ │ # key; absent-vs-false is lost in the IR), then
|
||||
│ │ # the shared openai guard with the full
|
||||
│ │ # message-name fallback (nobody here strips names)
|
||||
│ └── xai/ # Grok over openai_compat (httpx path: NO model
|
||||
│ │ # prefix anywhere, transform_response is LIVE):
|
||||
│ ├── guard.py # web_search_options (v1's Responses-bridge reroute
|
||||
@@ -300,10 +332,13 @@ A behavior change ships as its own snapshot-diffed PR, never inside a port.
|
||||
|
||||
## Current scope
|
||||
|
||||
OpenAI-chat-in to eleven providers out — `anthropic`, `bedrock_converse`,
|
||||
`bedrock_invoke`, `openai_compat`, `vertex_ai` (gemini route), `gemini`
|
||||
(AI Studio), `vertex_anthropic`, `azure`, `azure_ai`,
|
||||
`azure_ai_anthropic`, `xai` — request, response, and stream translation,
|
||||
OpenAI-chat-in to twenty-four providers out — `anthropic`,
|
||||
`bedrock_converse`, `bedrock_invoke`, `openai_compat`, `vertex_ai` (gemini
|
||||
route), `gemini` (AI Studio), `vertex_anthropic`, `azure`, `azure_ai`,
|
||||
`azure_ai_anthropic`, `xai`, and the thirteen wave-1a compat_sdk providers
|
||||
(`together_ai`, `cerebras`, `nvidia_nim`, `lm_studio`, `llamafile`,
|
||||
`lambda_ai`, `nebius`, `novita`, `wandb`, `featherless_ai`, `nscale`,
|
||||
`hyperbolic`, `volcengine`) — request, response, and stream translation,
|
||||
differential-green (anthropic: 46-shape corpus + responses + stream
|
||||
replays; bedrock and google: the characterization corpus per route + quirk
|
||||
corpora; openai: 17-shape request corpus + 17 typed-fallback rows +
|
||||
@@ -314,7 +349,12 @@ azure_ai: the Foundry override-set and no-spoof Claude-route corpora;
|
||||
xai: a 21-shape generated characterization corpus — provenance v1
|
||||
in-process at HEAD, zero recorded vendor fixtures exist — two-sided over
|
||||
`tests/test_litellm/translation/characterization_xai/` plus 7 rows pinning
|
||||
v1's UnsupportedParamsError raises and the line-seam stream replays),
|
||||
v1's UnsupportedParamsError raises and the line-seam stream replays;
|
||||
compat_sdk: per-provider generated corpora vs v1 in-process at HEAD
|
||||
(`test_differential_compat_sdk_{request,response,stream}.py` over
|
||||
`_compat_sdk_corpus.py` — served rows, UnsupportedParamsError raise rows,
|
||||
preset-model re-prefix response rows, per-provider wrapper stream replays,
|
||||
and supported-list mirror drift gates over every model-map row)),
|
||||
fail-closed everywhere else, with non-streaming flag-gated seams live in
|
||||
`completion()` for the anthropic, bedrock, and google routes (the
|
||||
openai/azure seam forks are integrator scope and NOT wired; the google
|
||||
@@ -389,7 +429,39 @@ definitions with `strict` keys below the function level (v1 deletes every
|
||||
depth), the openai guard's raw shapes, and the parse-level unknowns v1
|
||||
passes through for grok (presence/frequency penalties on supported
|
||||
families, seed, logprobs, top_logprobs, logit_bias, n, stream_options,
|
||||
web_search_options). The xai completion() fork is NOT wired (integrator
|
||||
web_search_options).
|
||||
Deliberate compat_sdk (wave-1a) fallback surfaces (each names the v1
|
||||
path): every IR-carried param a provider's supported list excludes — v1's
|
||||
`_check_valid_arg` raises UnsupportedParamsError or drops under
|
||||
drop_params, so the typed fallback serves v1's own behavior
|
||||
(max_completion_tokens on nscale/hyperbolic — both rename arms are dead
|
||||
code behind the list gate; tools/tool_choice/response_format on
|
||||
featherless_ai, on volcengine (response_format only), on together_ai
|
||||
models without the supports_function_calling map flag, and on
|
||||
nvidia_nim's reduced static-table models; parallel_tool_calls on
|
||||
cerebras/featherless_ai/nscale/hyperbolic/volcengine; reasoning_effort
|
||||
everywhere except capability-flagged cerebras models; response_format on
|
||||
base-list providers when the model is literally named gpt-4 /
|
||||
gpt-3.5-turbo-16k); `user` wherever a provider's own list doesn't carry
|
||||
it (v1's base list gates it on openai model-list membership and silently
|
||||
drops it otherwise — only cerebras and hyperbolic emit it);
|
||||
volcengine `thinking` (v1 packs the verbatim dict into extra_body for the
|
||||
SDK to merge top-level); lm_studio's bare-`schema` response_format wrap
|
||||
(non-canonical inbound shape, parse rejects it); explicit `stream: false`
|
||||
(the SDK serializes the key; absent-vs-false is lost in the IR); the
|
||||
openai guard's raw shapes with the FULL message-name fallback (no config
|
||||
here strips names); and the parse-level unknowns (seed, penalties,
|
||||
logprobs, n, logit_bias, stream_options, ...) regardless of whether a
|
||||
given provider's list serves or raises on them. The compat_sdk
|
||||
completion() forks are NOT wired (integrator scope, like openai/azure/
|
||||
xai); when they land, the seam must pre-set
|
||||
`ModelResponse(model=f"{provider}/{model}")` exactly like openai.py:
|
||||
676-677 so `_to_model_response_openai`'s re-prefix arm reproduces
|
||||
cdr:699-710 (pinned per provider by the preset-model differential rows),
|
||||
and baseten must NEVER be added to the family without resolving its
|
||||
dedicated legacy wrapper stream branch (handle_baseten_chunk — the
|
||||
test_baseten_drop_canary evidence).
|
||||
The xai completion() fork is NOT wired (integrator
|
||||
scope, like openai/azure); when it lands these are HARD OBLIGATIONS, not
|
||||
notes: the in-package `use_xai_oauth` guard arm is defense-in-depth ONLY
|
||||
and unreachable through `_raw_openai_body` (use_xai_oauth is a litellm
|
||||
|
||||
@@ -32,6 +32,23 @@ Provider = Literal[
|
||||
"gemini",
|
||||
"vertex_anthropic",
|
||||
"xai",
|
||||
# wave-1a: the SDK-path openai-compat family (providers/compat_sdk).
|
||||
# baseten is deliberately ABSENT: its streams ride a dedicated legacy
|
||||
# CustomStreamWrapper branch (handle_baseten_chunk), not the openai
|
||||
# dialect, so it stays a typed v1 fallback (wave1a-port.md).
|
||||
"together_ai",
|
||||
"cerebras",
|
||||
"nvidia_nim",
|
||||
"lm_studio",
|
||||
"llamafile",
|
||||
"lambda_ai",
|
||||
"nebius",
|
||||
"novita",
|
||||
"wandb",
|
||||
"featherless_ai",
|
||||
"nscale",
|
||||
"hyperbolic",
|
||||
"volcengine",
|
||||
]
|
||||
|
||||
_SAME_FAMILY: frozenset[tuple[InboundSchema, Provider]] = frozenset(
|
||||
@@ -44,6 +61,10 @@ _SAME_FAMILY: frozenset[tuple[InboundSchema, Provider]] = frozenset(
|
||||
# xai is NOT same-family despite speaking openai-chat: v1's transform
|
||||
# touches the body (tools strict strip, non-user message name strip),
|
||||
# so a verbatim fast-path forward would diverge from v1.
|
||||
# The compat_sdk family is NOT same-family either: their param maps
|
||||
# touch the body (mct -> max_tokens renames, supported-list raises,
|
||||
# together's response_format pop), so a verbatim forward would
|
||||
# diverge from v1's gates.
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -53,6 +53,24 @@ from ..providers.bedrock_invoke import parse_response as bedrock_invoke_parse_re
|
||||
from ..providers.bedrock_invoke import (
|
||||
serialize_request as bedrock_invoke_serialize_request,
|
||||
)
|
||||
from ..providers.compat_sdk import (
|
||||
cerebras_serialize_request,
|
||||
featherless_ai_serialize_request,
|
||||
hyperbolic_serialize_request,
|
||||
lambda_ai_serialize_request,
|
||||
llamafile_serialize_request,
|
||||
lm_studio_serialize_request,
|
||||
nebius_serialize_request,
|
||||
novita_serialize_request,
|
||||
nscale_serialize_request,
|
||||
nvidia_nim_serialize_request,
|
||||
together_ai_serialize_request,
|
||||
volcengine_serialize_request,
|
||||
wandb_serialize_request,
|
||||
)
|
||||
from ..providers.compat_sdk import (
|
||||
unsupported_request_shapes as compat_sdk_unsupported_request_shapes,
|
||||
)
|
||||
from ..providers.google_genai import parse_response as google_parse_response
|
||||
from ..providers.google_genai import (
|
||||
serialize_request_studio as google_serialize_request_studio,
|
||||
@@ -101,6 +119,19 @@ _SERIALIZERS: Mapping[Provider, _Serializer] = MappingProxyType(
|
||||
"azure_ai": azure_ai_serialize_request,
|
||||
"azure_ai_anthropic": azure_ai_claude_serialize_request,
|
||||
"xai": xai_serialize_request,
|
||||
"together_ai": together_ai_serialize_request,
|
||||
"cerebras": cerebras_serialize_request,
|
||||
"nvidia_nim": nvidia_nim_serialize_request,
|
||||
"lm_studio": lm_studio_serialize_request,
|
||||
"llamafile": llamafile_serialize_request,
|
||||
"lambda_ai": lambda_ai_serialize_request,
|
||||
"nebius": nebius_serialize_request,
|
||||
"novita": novita_serialize_request,
|
||||
"wandb": wandb_serialize_request,
|
||||
"featherless_ai": featherless_ai_serialize_request,
|
||||
"nscale": nscale_serialize_request,
|
||||
"hyperbolic": hyperbolic_serialize_request,
|
||||
"volcengine": volcengine_serialize_request,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -117,6 +148,23 @@ _RESPONSE_PARSERS: Mapping[Provider, _ResponseParser] = MappingProxyType(
|
||||
"azure_ai": azure_ai_parse_response,
|
||||
"azure_ai_anthropic": azure_ai_claude_parse_response,
|
||||
"xai": xai_parse_response,
|
||||
# compat_sdk family: the live v1 normalizer is the same
|
||||
# convert_to_model_response_object the openai parser mirrors; the
|
||||
# {provider}/{wire_model} re-prefix is the seam's preset arm
|
||||
# (_to_model_response_openai), not parser scope.
|
||||
"together_ai": openai_compat_parse_response,
|
||||
"cerebras": openai_compat_parse_response,
|
||||
"nvidia_nim": openai_compat_parse_response,
|
||||
"lm_studio": openai_compat_parse_response,
|
||||
"llamafile": openai_compat_parse_response,
|
||||
"lambda_ai": openai_compat_parse_response,
|
||||
"nebius": openai_compat_parse_response,
|
||||
"novita": openai_compat_parse_response,
|
||||
"wandb": openai_compat_parse_response,
|
||||
"featherless_ai": openai_compat_parse_response,
|
||||
"nscale": openai_compat_parse_response,
|
||||
"hyperbolic": openai_compat_parse_response,
|
||||
"volcengine": openai_compat_parse_response,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -133,6 +181,21 @@ _RESPONSE_DIALECTS: Mapping[Provider, ResponseDialect] = MappingProxyType(
|
||||
"azure_ai": "openai",
|
||||
"azure_ai_anthropic": "anthropic", # genuine anthropic wire format
|
||||
"xai": "openai", # httpx path, same normalized wire-body ride
|
||||
# compat_sdk family: SDK path, default openai wrapper arm (the
|
||||
# per-provider stream replays pin that no dedicated branch fires)
|
||||
"together_ai": "openai",
|
||||
"cerebras": "openai",
|
||||
"nvidia_nim": "openai",
|
||||
"lm_studio": "openai",
|
||||
"llamafile": "openai",
|
||||
"lambda_ai": "openai",
|
||||
"nebius": "openai",
|
||||
"novita": "openai",
|
||||
"wandb": "openai",
|
||||
"featherless_ai": "openai",
|
||||
"nscale": "openai",
|
||||
"hyperbolic": "openai",
|
||||
"volcengine": "openai",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -151,6 +214,19 @@ _RAW_GUARDS: Mapping[Provider, _RawGuard] = MappingProxyType(
|
||||
"vertex_ai": google_unsupported_request_shapes,
|
||||
"gemini": google_unsupported_request_shapes,
|
||||
"xai": xai_unsupported_request_shapes,
|
||||
"together_ai": compat_sdk_unsupported_request_shapes,
|
||||
"cerebras": compat_sdk_unsupported_request_shapes,
|
||||
"nvidia_nim": compat_sdk_unsupported_request_shapes,
|
||||
"lm_studio": compat_sdk_unsupported_request_shapes,
|
||||
"llamafile": compat_sdk_unsupported_request_shapes,
|
||||
"lambda_ai": compat_sdk_unsupported_request_shapes,
|
||||
"nebius": compat_sdk_unsupported_request_shapes,
|
||||
"novita": compat_sdk_unsupported_request_shapes,
|
||||
"wandb": compat_sdk_unsupported_request_shapes,
|
||||
"featherless_ai": compat_sdk_unsupported_request_shapes,
|
||||
"nscale": compat_sdk_unsupported_request_shapes,
|
||||
"hyperbolic": compat_sdk_unsupported_request_shapes,
|
||||
"volcengine": compat_sdk_unsupported_request_shapes,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
from ..openai_compat.response import parse_response
|
||||
from .guard import unsupported_request_shapes
|
||||
from .serialize import (
|
||||
cerebras_serialize_request,
|
||||
featherless_ai_serialize_request,
|
||||
hyperbolic_serialize_request,
|
||||
lambda_ai_serialize_request,
|
||||
llamafile_serialize_request,
|
||||
lm_studio_serialize_request,
|
||||
nebius_serialize_request,
|
||||
novita_serialize_request,
|
||||
nscale_serialize_request,
|
||||
nvidia_nim_serialize_request,
|
||||
together_ai_serialize_request,
|
||||
volcengine_serialize_request,
|
||||
wandb_serialize_request,
|
||||
)
|
||||
|
||||
__all__ = (
|
||||
"cerebras_serialize_request",
|
||||
"featherless_ai_serialize_request",
|
||||
"hyperbolic_serialize_request",
|
||||
"lambda_ai_serialize_request",
|
||||
"llamafile_serialize_request",
|
||||
"lm_studio_serialize_request",
|
||||
"nebius_serialize_request",
|
||||
"novita_serialize_request",
|
||||
"nscale_serialize_request",
|
||||
"nvidia_nim_serialize_request",
|
||||
"parse_response",
|
||||
"together_ai_serialize_request",
|
||||
"unsupported_request_shapes",
|
||||
"volcengine_serialize_request",
|
||||
"wandb_serialize_request",
|
||||
)
|
||||
@@ -0,0 +1,33 @@
|
||||
"""Raw-shape fidelity guard for the SDK-path openai-compat family.
|
||||
|
||||
One family-wide arm before the shared openai guard: an explicit
|
||||
``stream: false``. On this path ``completion()`` forwards the caller's False
|
||||
into ``get_optional_params`` (non-default against the ``None`` default), it
|
||||
lands in optional_params, and the SDK serializes the key onto the wire —
|
||||
while the IR cannot represent absent-vs-false (verified in-process at HEAD;
|
||||
the same arm the azure and xai guards carry).
|
||||
|
||||
The openai guard runs with its full message-``name`` fallback: none of the
|
||||
family configs strips names (only xai does), so v1 forwards ``name``
|
||||
verbatim on every role.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Mapping
|
||||
|
||||
from ...errors import TranslationError
|
||||
from ..openai_compat.guard import (
|
||||
unsupported_request_shapes as openai_unsupported_request_shapes,
|
||||
)
|
||||
|
||||
_Raw = Mapping[str, object]
|
||||
|
||||
|
||||
def unsupported_request_shapes(raw: _Raw) -> TranslationError | None:
    """Return a typed fallback for raw shapes this family cannot serve.

    An explicit ``stream: false`` is refused up front: the SDK path puts
    the key on the wire, and the IR cannot tell an absent key from a false
    one. Every other request is handed to the shared openai guard
    unchanged.

    Args:
        raw: The caller's raw request mapping.

    Returns:
        A ``TranslationError`` describing the unsupported shape, or
        whatever the shared openai guard returns for ``raw``.
    """
    # ``.get`` yields None for a missing key, so only a literal False
    # (never an absent key or True) takes the fallback arm.
    stream_flag = raw.get("stream")
    if stream_flag is not False:
        return openai_unsupported_request_shapes(raw)
    return TranslationError.of_unsupported(
        "explicit stream: false (the SDK path serializes the key onto "
        "the wire; absent-vs-false is lost in the IR)"
    )
|
||||
@@ -0,0 +1,326 @@
|
||||
"""Per-provider parameter gates for the SDK-path openai-compat family.
|
||||
|
||||
v1's gate for every provider here is ``_check_valid_arg`` over the provider
|
||||
config's ``get_supported_openai_params``: an unsupported param RAISES
|
||||
``UnsupportedParamsError`` unless ``drop_params``, in which case it is popped
|
||||
BEFORE ``map_openai_params`` runs. v2 mirrors the SUPPORTED-LIST truth as
|
||||
typed fallbacks (the v2-openai/xai precedent — never re-implement the
|
||||
raise-vs-drop interplay): every IR-carried param a provider's list excludes
|
||||
falls back so v1 serves its own raise or drop. Params outside the IR
|
||||
(seed, penalties, logprobs, n, stream_options, ...) already fall back at the
|
||||
inbound boundary and never reach these gates.
|
||||
|
||||
The capability reads mirror v1's model-map lookups through
|
||||
``deps.supports_capability`` over the ``{provider}/{model}`` map key — the
|
||||
provider prefix is LOAD-BEARING (bare wire models have no model-map rows;
|
||||
see the xai drift-gate note). Verified in-process at HEAD:
|
||||
``together_ai/...`` rows answer ``supports_function_calling`` and
|
||||
``cerebras/...`` rows answer ``supports_reasoning``; the bare keys are False
|
||||
even for capable models.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable, Mapping
|
||||
from types import MappingProxyType
|
||||
|
||||
from ...deps import TranslationDeps
|
||||
from ...ir import ChatRequest
|
||||
from ..openai_compat.params import unsupported_response_format
|
||||
|
||||
_Present = Callable[[ChatRequest], bool]
|
||||
|
||||
def _sent_max_tokens(req: ChatRequest) -> bool:
    """Tell whether the caller sent ``max_tokens`` itself.

    The inbound parse collapses max_completion_tokens into max_tokens and
    the raw guard rejects requests carrying both keys, so ``max_tokens``
    only counts as caller-sent when ``max_completion_tokens`` is absent.
    """
    limits = req.params
    return limits.max_tokens.is_some() and limits.max_completion_tokens.is_none()


# One (wire key, presence predicate) pair per IR-carried param. The order is
# fixed so that the first offending key -- and therefore the fallback
# reason -- is deterministic.
_CHECKS: tuple[tuple[str, _Present], ...] = (
    ("max_tokens", _sent_max_tokens),
    (
        "max_completion_tokens",
        lambda req: req.params.max_completion_tokens.is_some(),
    ),
    ("temperature", lambda req: req.params.temperature.is_some()),
    ("top_p", lambda req: req.params.top_p.is_some()),
    ("top_k", lambda req: req.params.top_k.is_some()),
    ("stream", lambda req: req.stream),
    ("stop", lambda req: len(req.params.stop) > 0),
    ("tools", lambda req: len(req.tools) > 0),
    ("tool_choice", lambda req: req.tool_choice.is_some()),
    ("parallel_tool_calls", lambda req: req.parallel_tool_calls.is_some()),
    ("response_format", lambda req: req.response_format.is_some()),
    ("user", lambda req: req.user.is_some()),
    ("reasoning_effort", lambda req: req.reasoning_effort.is_some()),
    ("thinking", lambda req: req.thinking.is_some()),
)
|
||||
|
||||
_NO_NOTES: Mapping[str, str] = MappingProxyType({})
|
||||
|
||||
|
||||
def unsupported_against(
    request: ChatRequest,
    *,
    provider: str,
    allowed: frozenset[str],
    notes: Mapping[str, str] = _NO_NOTES,
) -> str | None:
    """Name the first caller-sent param that ``allowed`` does not cover.

    Args:
        request: The parsed request whose params are inspected.
        provider: Provider name interpolated into the generic reason.
        allowed: Param keys the provider's supported list carries.
        notes: Optional per-key reasons that replace the generic one.

    Returns:
        The fallback reason for the first offending key in ``_CHECKS``
        order, or ``None`` when every present param is allowed.
    """
    offending = next(
        (
            key
            for key, present in _CHECKS
            if present(request) and key not in allowed
        ),
        None,
    )
    if offending is None:
        return None
    specific = notes.get(offending)
    if specific is not None:
        return specific
    return (
        f"{offending} on {provider}: outside v1's supported list; "
        "get_optional_params raises UnsupportedParamsError "
        "(or drops it under drop_params)"
    )
|
||||
|
||||
|
||||
def _user_note(provider: str) -> str:
    """Build the fallback reason for a ``user`` param on ``provider``."""
    subject = f"user on {provider}: gated on litellm.open_ai_chat_completion_models "
    return subject + "membership in v1's base supported list; v1 handles it"
|
||||
|
||||
|
||||
# The IR-carried subset of OpenAIGPTConfig's base supported list. ``user`` is
# left out on purpose (v1 gates it on model-list membership), and
# ``response_format`` is additionally subject to the base list's
# gpt-4/gpt-3.5-turbo-16k name gate, which the base-list providers apply
# further down.
_BASE_LIST = frozenset(
    (
        "max_tokens",
        "max_completion_tokens",
        "temperature",
        "top_p",
        "stream",
        "stop",
        "tools",
        "tool_choice",
        "parallel_tool_calls",
        "response_format",
    )
)

# The keys removed from the base list for together_ai models without
# function-calling support.
_FUNCTION_CALLING_KEYS = frozenset(("tools", "tool_choice", "response_format"))
|
||||
|
||||
|
||||
def base_list_unsupported(
    request: ChatRequest, deps: TranslationDeps, provider: str
) -> str | None:
    """Gate for the providers that inherit the base supported list.

    Covers llamafile / novita / lm_studio (plain base config) and
    lambda_ai / nebius / wandb (base list + mct rename, applied in
    serialize). ``deps`` is accepted for signature parity with the other
    gates and is not read here.

    Returns:
        The supported-list fallback reason when there is one, otherwise
        the result of the shared response_format gate.
    """
    list_reason = unsupported_against(
        request,
        provider=provider,
        allowed=_BASE_LIST,
        notes={"user": _user_note(provider)},
    )
    if list_reason:
        return list_reason
    return unsupported_response_format(request)
|
||||
|
||||
|
||||
def supports_together_tools(model: str, deps: TranslationDeps) -> bool:
    """Ask ``deps`` whether ``together_ai/{model}`` supports function calling.

    The lookup key carries the provider prefix; the bare wire model is not
    what gets queried.
    """
    map_key = f"together_ai/{model}"
    return deps.supports_capability(map_key, "supports_function_calling")
|
||||
|
||||
|
||||
def together_ai_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None:
    """Gate for together_ai.

    TogetherAIConfig removes tools/tool_choice/response_format from the
    base list unless ``supports_function_calling(model, "together_ai")`` is
    True (together_ai/chat.py); parallel_tool_calls stays supported either
    way (v1 truth, not an oversight here).

    Returns:
        The supported-list fallback reason when there is one, otherwise
        the result of the shared response_format gate.
    """
    allowed = _BASE_LIST
    if not supports_together_tools(request.model, deps):
        allowed = allowed - _FUNCTION_CALLING_KEYS
    list_reason = unsupported_against(
        request,
        provider="together_ai",
        allowed=allowed,
        notes={"user": _user_note("together_ai")},
    )
    if list_reason:
        return list_reason
    return unsupported_response_format(request)
|
||||
|
||||
|
||||
# IR-carried params on the cerebras supported list; ``user`` is carried
# and ``parallel_tool_calls`` is not.
_CEREBRAS_LIST = frozenset(
    (
        "max_tokens",
        "max_completion_tokens",
        "temperature",
        "top_p",
        "stream",
        "stop",
        "tools",
        "tool_choice",
        "response_format",
        "user",
    )
)
|
||||
|
||||
|
||||
def supports_cerebras_reasoning(model: str, deps: TranslationDeps) -> bool:
    """Ask ``deps`` whether ``cerebras/{model}`` carries ``supports_reasoning``.

    The lookup key carries the provider prefix; the bare wire model is not
    what gets queried.
    """
    map_key = f"cerebras/{model}"
    return deps.supports_capability(map_key, "supports_reasoning")
|
||||
|
||||
|
||||
def cerebras_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None:
    """Gate for cerebras.

    ``reasoning_effort`` joins the allowed set only when the model passes
    the ``supports_reasoning`` capability check; otherwise it falls back
    with a model-specific reason.

    Returns:
        The fallback reason for the first unsupported param, or ``None``.
    """
    model = request.model
    allowed = _CEREBRAS_LIST
    if supports_cerebras_reasoning(model, deps):
        allowed = allowed | {"reasoning_effort"}
    effort_note = (
        f"reasoning_effort on non-reasoning cerebras model {model} "
        "(model-map supports_reasoning gate); v1 raises or drops it"
    )
    return unsupported_against(
        request,
        provider="cerebras",
        allowed=allowed,
        notes={"reasoning_effort": effort_note},
    )
|
||||
|
||||
|
||||
# Static per-model allowlists mirrored from NvidiaNimConfig (nvidia_nim/chat/
# transformation.py), narrowed to the keys the IR carries. A drift gate
# re-derives them from the v1 config at HEAD.
_NVIDIA_GEMMA_MODELS = frozenset(
    (
        "google/recurrentgemma-2b",
        "google/gemma-2-27b-it",
        "google/gemma-2-9b-it",
        "gemma-2-9b-it",
    )
)
_NVIDIA_GEMMA_LIST = frozenset(("stream", "temperature", "top_p", "max_tokens", "stop"))
_NVIDIA_NEMOTRON_INSTRUCT_LIST = frozenset(
    ("stream", "temperature", "top_p", "max_tokens", "max_completion_tokens")
)
_NVIDIA_REWARD_LIST = frozenset(("stream",))
_NVIDIA_CODEGEMMA_LIST = frozenset(
    ("stream", "temperature", "top_p", "max_tokens", "max_completion_tokens", "stop")
)
_NVIDIA_DEFAULT_LIST = frozenset(
    (
        "stream",
        "temperature",
        "top_p",
        "max_tokens",
        "max_completion_tokens",
        "stop",
        "tools",
        "tool_choice",
        "parallel_tool_calls",
        "response_format",
    )
)

# Exact-name dispatch table: every model with a reduced allowlist maps to it;
# anything not listed here gets the default list.
_NVIDIA_MODEL_LISTS: Mapping[str, frozenset[str]] = MappingProxyType(
    {
        **dict.fromkeys(_NVIDIA_GEMMA_MODELS, _NVIDIA_GEMMA_LIST),
        "nvidia/nemotron-4-340b-instruct": _NVIDIA_NEMOTRON_INSTRUCT_LIST,
        "nvidia/nemotron-4-340b-reward": _NVIDIA_REWARD_LIST,
        "google/codegemma-1.1-7b": _NVIDIA_CODEGEMMA_LIST,
    }
)


def nvidia_nim_allowed(model: str) -> frozenset[str]:
    """Return the supported-param allowlist for an nvidia_nim ``model``.

    Matching is by exact model name; unknown models get the default list.
    """
    return _NVIDIA_MODEL_LISTS.get(model, _NVIDIA_DEFAULT_LIST)
|
||||
|
||||
|
||||
def nvidia_nim_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None:
    """Gate for nvidia_nim, using the static per-model allowlist.

    ``deps`` is accepted for signature parity with the other gates and is
    not read here.
    """
    allowed = nvidia_nim_allowed(request.model)
    return unsupported_against(request, provider="nvidia_nim", allowed=allowed)
|
||||
|
||||
|
||||
# IR-carried params on the featherless_ai supported list (no tool keys).
_FEATHERLESS_LIST = frozenset(
    (
        "max_tokens",
        "max_completion_tokens",
        "temperature",
        "top_p",
        "stream",
        "stop",
    )
)


def featherless_ai_unsupported(
    request: ChatRequest, deps: TranslationDeps
) -> str | None:
    """Gate for featherless_ai.

    tools and tool_choice are outside FeatherlessAIConfig's supported
    list, so ``_check_valid_arg`` raises before the map's tool_choice
    auto/none arm can run -- that arm is dead code (the xai R2 pattern,
    verified in-process at HEAD). ``deps`` is not read here.
    """
    provider = "featherless_ai"
    return unsupported_against(request, provider=provider, allowed=_FEATHERLESS_LIST)
|
||||
|
||||
|
||||
# IR-carried params on the nscale supported list (no max_completion_tokens).
_NSCALE_LIST = frozenset(
    ("max_tokens", "temperature", "top_p", "stream", "stop", "response_format")
)


def nscale_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None:
    """Gate for nscale.

    NscaleConfig inherits the BASE map (no mct rename), and mct is outside
    its supported list, so max_completion_tokens RAISES (verified at HEAD).
    ``deps`` is not read here.
    """
    provider = "nscale"
    return unsupported_against(request, provider=provider, allowed=_NSCALE_LIST)
|
||||
|
||||
|
||||
# IR-carried params on the hyperbolic supported list; ``user`` is carried
# and ``max_completion_tokens`` is not.
_HYPERBOLIC_LIST = frozenset(
    (
        "max_tokens",
        "temperature",
        "top_p",
        "stream",
        "stop",
        "tools",
        "tool_choice",
        "response_format",
        "user",
    )
)


def hyperbolic_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None:
    """Gate for hyperbolic.

    max_completion_tokens is outside HyperbolicChatConfig's own list, so
    OpenAILikeChatConfig's rename arm is dead code -- v1 raises (verified at
    HEAD; the xai R2 trap again). ``deps`` is not read here.
    """
    provider = "hyperbolic"
    return unsupported_against(request, provider=provider, allowed=_HYPERBOLIC_LIST)
|
||||
|
||||
|
||||
# IR-carried params this gate lets through for volcengine. ``thinking`` is
# deliberately not here even though v1 supports it.
_VOLCENGINE_LIST = frozenset(
    (
        "max_tokens",
        "max_completion_tokens",
        "temperature",
        "top_p",
        "stream",
        "stop",
        "tools",
        "tool_choice",
    )
)


def volcengine_unsupported(request: ChatRequest, deps: TranslationDeps) -> str | None:
    """Gate for volcengine.

    response_format is OUTSIDE VolcEngineChatConfig's supported list
    (v1 raises, verified at HEAD). ``thinking`` IS supported in v1 but its
    map packs the verbatim dict into ``extra_body`` for the SDK to merge
    top-level -- an unported crossing, so it falls back typed with its own
    reason. ``deps`` is not read here.
    """
    thinking_note = (
        "thinking on volcengine: v1 packs the verbatim dict into "
        "extra_body (VolcEngineChatConfig.map_openai_params) and the "
        "SDK merges it top-level; that crossing is unported, v1 "
        "serves it"
    )
    return unsupported_against(
        request,
        provider="volcengine",
        allowed=_VOLCENGINE_LIST,
        notes={"thinking": thinking_note},
    )
|
||||
@@ -0,0 +1,229 @@
|
||||
"""Serializers for the SDK-path openai-compat family (wave 1a).
|
||||
|
||||
Every provider here rides v1's big openai elif (main.py:2646-2667) into the
|
||||
OpenAI SDK: ``get_optional_params`` runs the provider config's param gates,
|
||||
``provider_config.transform_request`` (openai.py:727) runs the inherited
|
||||
base five-touch assembly, and none of the family overrides it. The v2 body
|
||||
is therefore ``openai_compat.assemble_body`` after the provider's gates,
|
||||
plus the mechanical deltas below (four flags, at most three set on any one provider), captured per provider in a frozen
|
||||
``CompatProfile``:
|
||||
|
||||
- ``rename_max_completion_tokens``: the configs whose map renames mct ->
|
||||
max_tokens (cerebras/nvidia_nim/nebius/wandb/featherless_ai and the
|
||||
OpenAILike-based lambda_ai/volcengine). The IR already collapses mct into
|
||||
``max_tokens``, so the rename is emitting that collapsed key.
|
||||
- ``emit_user``: providers whose own supported list carries ``user``
|
||||
unconditionally (cerebras, hyperbolic) — a typed fallback everywhere else.
|
||||
- ``drop_text_response_format``: together_ai's map pops a verbatim
|
||||
``{"type": "text"}`` response_format.
|
||||
- ``emit_reasoning_effort``: cerebras, gated per model in params.py before
|
||||
emission ever happens.
|
||||
|
||||
The response side needs NO per-provider code: the live v1 normalizer is the
|
||||
same ``convert_to_model_response_object`` the openai_compat parser mirrors,
|
||||
and the ``{provider}/{wire_model}`` model re-prefix (openai.py:676-677 +
|
||||
cdr:699-710) is the SEAM's ``_to_model_response_openai`` preset arm, pinned
|
||||
per provider by the differential's preset-model rows. Streams ride the
|
||||
default openai wrapper arm -> the ``"openai"`` chunk dialect (baseten is the
|
||||
one would-be member that does NOT, and is dropped from the wave for it).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
from expression import Error, Result
|
||||
|
||||
from ...deps import TranslationDeps
|
||||
from ...errors import TranslationError
|
||||
from ...ir import Body, ChatRequest, PlainJson
|
||||
from ..openai_compat.serialize import assemble_body
|
||||
from . import params as p
|
||||
|
||||
_SerializeResult = Result[Body, TranslationError]
|
||||
_GateFn = Callable[[ChatRequest, TranslationDeps], str | None]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CompatProfile:
    """Immutable description of one provider's deviations from the shared body.

    ``unsupported`` is the provider's gate; every boolean flag switches on one
    mechanical delta that ``_with_deltas`` applies to the assembled body. All
    flags default to off.
    """

    # Provider identifier, e.g. "together_ai".
    provider: str
    # Gate returning the reason a request is unsupported, or None to proceed.
    unsupported: _GateFn
    # Replace a "max_completion_tokens" body key with "max_tokens".
    rename_max_completion_tokens: bool = False
    # Add the request's "user" value to the body when one is present.
    emit_user: bool = False
    # Remove a response_format that is exactly {"type": "text"}.
    drop_text_response_format: bool = False
    # Add the request's "reasoning_effort" value to the body when present.
    emit_reasoning_effort: bool = False
|
||||
|
||||
|
||||
def serialize_with_profile(
    request: ChatRequest, deps: TranslationDeps, profile: CompatProfile
) -> _SerializeResult:
    """Gate ``request`` with ``profile`` and build its provider body.

    The profile's gate runs first. If it reports a reason, the result is an
    ``Error`` wrapping ``TranslationError.of_unsupported(reason)`` and no body
    is assembled. Otherwise the shared ``assemble_body`` output is mapped
    through ``_with_deltas`` so the profile's deltas are applied on success.
    """
    rejection = profile.unsupported(request, deps)
    if rejection is None:

        def apply_deltas(assembled: Body) -> Body:
            # Bind request/profile so the mapper only needs the body.
            return _with_deltas(assembled, request, profile)

        return assemble_body(request).map(apply_deltas)
    return Error(TranslationError.of_unsupported(rejection))
|
||||
|
||||
|
||||
def _with_deltas(body: Body, request: ChatRequest, profile: CompatProfile) -> Body:
    """Apply the deltas enabled on ``profile`` to an assembled ``body``.

    The input mapping is never modified: each delta that fires produces a new
    dict, and when no delta fires the original ``body`` object is returned.

    Deltas, in order:
      1. rename ``max_completion_tokens`` to ``max_tokens`` (value taken from
         ``request.params.max_tokens``),
      2. drop a ``response_format`` equal to ``{"type": "text"}``,
      3. add ``user`` and then ``reasoning_effort`` from the request when set.
    """
    result = body

    if profile.rename_max_completion_tokens and "max_completion_tokens" in result:
        collapsed = request.params.max_tokens.default_value(None)
        renamed = dict(result)
        del renamed["max_completion_tokens"]
        # Assigning keeps an existing "max_tokens" in place, else appends it.
        renamed["max_tokens"] = collapsed
        result = renamed

    if profile.drop_text_response_format:
        if result.get("response_format") == {"type": "text"}:
            result = {
                key: value
                for key, value in result.items()
                if key != "response_format"
            }

    additions: dict[str, PlainJson] = {}
    if profile.emit_user:
        user = request.user.default_value(None)
        if user is not None:
            additions["user"] = user
    if profile.emit_reasoning_effort:
        effort = request.reasoning_effort.default_value(None)
        if effort is not None:
            additions["reasoning_effort"] = effort

    if not additions:
        return result
    return {**result, **additions}
|
||||
|
||||
|
||||
def _base_list_gate(provider: str) -> _GateFn:
    """Return a gate that defers to ``p.base_list_unsupported`` for ``provider``.

    The returned callable takes ``(request, deps)`` and forwards them together
    with the captured ``provider`` name.
    """
    return lambda request, deps: p.base_list_unsupported(request, deps, provider)
|
||||
|
||||
|
||||
# One frozen profile per provider in the family. Flags that are not listed keep
# their CompatProfile default (off).

# Provider-specific gate; drops a {"type": "text"} response_format.
TOGETHER_AI = CompatProfile(
    provider="together_ai",
    unsupported=p.together_ai_unsupported,
    drop_text_response_format=True,
)

# Provider-specific gate; renames mct, emits user and reasoning_effort.
CEREBRAS = CompatProfile(
    provider="cerebras",
    unsupported=p.cerebras_unsupported,
    rename_max_completion_tokens=True,
    emit_user=True,
    emit_reasoning_effort=True,
)

# Provider-specific gate; renames mct.
NVIDIA_NIM = CompatProfile(
    provider="nvidia_nim",
    unsupported=p.nvidia_nim_unsupported,
    rename_max_completion_tokens=True,
)

# Base-list gate, no deltas.
LM_STUDIO = CompatProfile(
    provider="lm_studio",
    unsupported=_base_list_gate("lm_studio"),
)

# Base-list gate, no deltas.
LLAMAFILE = CompatProfile(
    provider="llamafile",
    unsupported=_base_list_gate("llamafile"),
)

# Base-list gate; renames mct.
LAMBDA_AI = CompatProfile(
    provider="lambda_ai",
    unsupported=_base_list_gate("lambda_ai"),
    rename_max_completion_tokens=True,
)

# Base-list gate; renames mct.
NEBIUS = CompatProfile(
    provider="nebius",
    unsupported=_base_list_gate("nebius"),
    rename_max_completion_tokens=True,
)

# Base-list gate, no deltas.
NOVITA = CompatProfile(
    provider="novita",
    unsupported=_base_list_gate("novita"),
)

# Base-list gate; renames mct.
WANDB = CompatProfile(
    provider="wandb",
    unsupported=_base_list_gate("wandb"),
    rename_max_completion_tokens=True,
)

# Provider-specific gate; renames mct.
FEATHERLESS_AI = CompatProfile(
    provider="featherless_ai",
    unsupported=p.featherless_ai_unsupported,
    rename_max_completion_tokens=True,
)

# Provider-specific gate, no deltas.
NSCALE = CompatProfile(
    provider="nscale",
    unsupported=p.nscale_unsupported,
)

# Provider-specific gate; emits user.
HYPERBOLIC = CompatProfile(
    provider="hyperbolic",
    unsupported=p.hyperbolic_unsupported,
    emit_user=True,
)

# Provider-specific gate; renames mct.
VOLCENGINE = CompatProfile(
    provider="volcengine",
    unsupported=p.volcengine_unsupported,
    rename_max_completion_tokens=True,
)
|
||||
|
||||
|
||||
def together_ai_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for together_ai via the ``TOGETHER_AI`` profile."""
    return serialize_with_profile(request, deps, profile=TOGETHER_AI)
|
||||
|
||||
|
||||
def cerebras_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for cerebras via the ``CEREBRAS`` profile."""
    return serialize_with_profile(request, deps, profile=CEREBRAS)
|
||||
|
||||
|
||||
def nvidia_nim_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for nvidia_nim via the ``NVIDIA_NIM`` profile."""
    return serialize_with_profile(request, deps, profile=NVIDIA_NIM)
|
||||
|
||||
|
||||
def lm_studio_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for lm_studio via the ``LM_STUDIO`` profile."""
    return serialize_with_profile(request, deps, profile=LM_STUDIO)
|
||||
|
||||
|
||||
def llamafile_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for llamafile via the ``LLAMAFILE`` profile."""
    return serialize_with_profile(request, deps, profile=LLAMAFILE)
|
||||
|
||||
|
||||
def lambda_ai_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for lambda_ai via the ``LAMBDA_AI`` profile."""
    return serialize_with_profile(request, deps, profile=LAMBDA_AI)
|
||||
|
||||
|
||||
def nebius_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for nebius via the ``NEBIUS`` profile."""
    return serialize_with_profile(request, deps, profile=NEBIUS)
|
||||
|
||||
|
||||
def novita_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for novita via the ``NOVITA`` profile."""
    return serialize_with_profile(request, deps, profile=NOVITA)
|
||||
|
||||
|
||||
def wandb_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for wandb via the ``WANDB`` profile."""
    return serialize_with_profile(request, deps, profile=WANDB)
|
||||
|
||||
|
||||
def featherless_ai_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for featherless_ai via the ``FEATHERLESS_AI`` profile."""
    return serialize_with_profile(request, deps, profile=FEATHERLESS_AI)
|
||||
|
||||
|
||||
def nscale_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for nscale via the ``NSCALE`` profile."""
    return serialize_with_profile(request, deps, profile=NSCALE)
|
||||
|
||||
|
||||
def hyperbolic_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for hyperbolic via the ``HYPERBOLIC`` profile."""
    return serialize_with_profile(request, deps, profile=HYPERBOLIC)
|
||||
|
||||
|
||||
def volcengine_serialize_request(
    request: ChatRequest,
    deps: TranslationDeps,
) -> _SerializeResult:
    """Serialize ``request`` for volcengine via the ``VOLCENGINE`` profile."""
    return serialize_with_profile(request, deps, profile=VOLCENGINE)
|
||||
Reference in New Issue
Block a user