feat(audio_transcription): add NVIDIA Riva STT provider (#27185)

* feat(audio_transcription): add NVIDIA Riva STT provider Adds nvidia_riva as a new audio transcription provider, supporting both NVCF-hosted and self-hosted Riva ASR deployments via gRPC streaming. - Auto-resamples input audio to 16 kHz mono LINEAR_PCM (soundfile + numpy, audioread fallback) so callers can send any common format. - Maps OpenAI params: language (en -> en-US), response_format (text/json/ verbose_json), timestamp_granularities=["word"] -> enable_word_time_offsets, word offsets converted ms -> s for verbose_json. - Auth: NVCF when nvcf_function_id is set (SSL on by default), self-hosted otherwise (SSL off by default), with explicit use_ssl override. - gRPC errors wrapped via NvidiaRivaException -> litellm exception classes. - Optional deps gated behind [stt-nvidia-riva] extra (nvidia-riva-client, soundfile, audioread, numpy). Co-authored-by: Cursor <cursoragent@cursor.com> * fix(nvidia_riva): address PR review feedback - handler: forward call-level `timeout` to streaming_response_generator (kwarg-detected via inspect for older riva-client compat) so a stalled Riva server cannot block the caller indefinitely. - audio_utils: spill bytes to a tempfile before audioread.audio_open; most audioread backends (FFmpeg, GStreamer) require a real filesystem path and previously raised TypeError on BytesIO, breaking the mp3/m4a fallback path. - audio_utils: prefer soxr / scipy.signal.resample_poly for resampling (anti-aliased polyphase) when installed, falling back to linear only as a last resort. Avoids aliasing on 44.1/48 kHz -> 16 kHz downsamples. - transformation: bare `es` now maps to es-ES (Castilian) instead of es-US, matching BCP-47 conventions. 
Co-authored-by: Cursor <cursoragent@cursor.com> * chore: trigger CI re-run [stabilize loop 1/3] * Update litellm/llms/nvidia_riva/audio_transcription/transformation.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * chore: trigger CI re-run [stabilize loop 1/3] * fix code qa * fix lint * fix mypy * fix mypy * Fix NVIDIA Riva ASR service lookup * Fix NVIDIA Riva transcription payload logging --------- Co-authored-by: Cursor <cursoragent@cursor.com> Co-authored-by: oss-pr-review-agent-shin[bot] <281797381+oss-pr-review-agent-shin[bot]@users.noreply.github.com> Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Co-authored-by: mateo-berri <277851410+mateo-berri@users.noreply.github.com>
2026-08-02 06:22:48 +00:00 · 2026-05-05 17:17:51 -07:00
parent 454ce5073f
commit e912e6d4ff
20 changed files with 2120 additions and 2 deletions
@@ -586,6 +586,7 @@ anyscale_models: Set = set()
 cerebras_models: Set = set()
 galadriel_models: Set = set()
 nvidia_nim_models: Set = set()
+nvidia_riva_models: Set = set()
 sambanova_models: Set = set()
 sambanova_embedding_models: Set = set()
 novita_models: Set = set()
@@ -812,6 +813,8 @@ def add_known_models(model_cost_map: Optional[Dict] = None):
            galadriel_models.add(key)
        elif value.get("litellm_provider") == "nvidia_nim":
            nvidia_nim_models.add(key)
+        elif value.get("litellm_provider") == "nvidia_riva":
+            nvidia_riva_models.add(key)
        elif value.get("litellm_provider") == "sambanova":
            sambanova_models.add(key)
        elif value.get("litellm_provider") == "sambanova-embedding-models":
@@ -971,6 +974,7 @@ model_list = list(
    | cerebras_models
    | galadriel_models
    | nvidia_nim_models
+    | nvidia_riva_models
    | sambanova_models
    | azure_text_models
    | novita_models
@@ -1067,6 +1071,7 @@ models_by_provider: dict = {
    "cerebras": cerebras_models,
    "galadriel": galadriel_models,
    "nvidia_nim": nvidia_nim_models,
+    "nvidia_riva": nvidia_riva_models,
    "sambanova": sambanova_models | sambanova_embedding_models,
    "novita": novita_models,
    "nebius": nebius_models | nebius_embedding_models,
@@ -1618,6 +1623,9 @@ if TYPE_CHECKING:
    from .llms.deepgram.audio_transcription.transformation import (
        DeepgramAudioTranscriptionConfig as DeepgramAudioTranscriptionConfig,
    )
+    from .llms.nvidia_riva.audio_transcription.transformation import (
+        NvidiaRivaAudioTranscriptionConfig as NvidiaRivaAudioTranscriptionConfig,
+    )
    from .llms.topaz.image_variations.transformation import (
        TopazImageVariationConfig as TopazImageVariationConfig,
    )
@@ -621,6 +621,18 @@ def _get_openai_compatible_provider_info(  # noqa: PLR0915
            or "https://integrate.api.nvidia.com/v1"
        )  # type: ignore
        dynamic_api_key = api_key or get_secret_str("NVIDIA_NIM_API_KEY")
+    elif custom_llm_provider == "nvidia_riva":
+        # NVIDIA Riva is gRPC-based; api_base must be a host:port like
+        # `grpc.nvcf.nvidia.com:443` or `localhost:50051`. There is no
+        # public-default endpoint, so we do not fill one in here.
+        api_base = api_base or get_secret_str("NVIDIA_RIVA_API_BASE")  # type: ignore
+        # Fall back to NVIDIA_NIM_API_KEY because users running both NVCF
+        # services typically reuse the same nvapi-* key.
+        dynamic_api_key = (
+            api_key
+            or get_secret_str("NVIDIA_RIVA_API_KEY")
+            or get_secret_str("NVIDIA_NIM_API_KEY")
+        )
    elif custom_llm_provider == "cerebras":
        api_base = (
            api_base or get_secret("CEREBRAS_API_BASE") or "https://api.cerebras.ai/v1"
@@ -0,0 +1,232 @@
+"""
+Audio resampling utilities for the NVIDIA Riva STT provider.
+
+We intentionally avoid a hard dependency on ``ffmpeg`` so this works in
+slim Python environments. Format coverage:
+
+- ``soundfile`` handles wav / flac / ogg out of the box (libsndfile).
+- ``audioread`` is tried for everything ``soundfile`` cannot decode (mp3,
+  m4a, mp4, webm, ...). This is a soft optional dependency.
+
+If neither library can decode the input we raise a clear error instructing
+the caller to convert the audio upstream.
+"""
+
+import io
+import os
+import tempfile
+from dataclasses import dataclass
+from typing import Any, Tuple, cast
+
+from litellm.llms.nvidia_riva.audio_transcription.transformation import (
+    RIVA_TARGET_NUM_CHANNELS,
+    RIVA_TARGET_SAMPLE_RATE_HZ,
+)
+from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException
+
+# Keep this as Any: the module intentionally avoids importing numpy at module
+# import time (optional dependency), and project-wide mypy config evaluates this
+# file in contexts where conditional type aliases can degrade to "FloatArray?".
+FloatArray = Any
+
+
+_INSTALL_HINT = (
+    "Install Riva STT extras to enable automatic audio resampling: "
+    "`pip install 'litellm[stt-nvidia-riva]'`"
+)
+
+
+@dataclass
+class ResampledAudio:
+    """Result of converting caller-supplied audio for streaming to Riva."""
+
+    # Raw sample bytes; ``resample_to_riva_pcm`` fills this with int16
+    # little-endian PCM.
+    pcm_bytes: bytes
+    # Length of the converted audio in seconds.
+    duration_seconds: float
+    # Sample rate of ``pcm_bytes`` in Hz.
+    sample_rate_hz: int
+    # Channel count of ``pcm_bytes``.
+    num_channels: int
+
+
+def resample_to_riva_pcm(file_bytes: bytes) -> ResampledAudio:
+    """
+    Convert encoded audio into the PCM stream Riva expects.
+
+    The input is decoded, downmixed to a single channel, resampled to
+    ``RIVA_TARGET_SAMPLE_RATE_HZ`` when the source rate differs, and packed
+    as int16 little-endian samples. The returned duration is derived from
+    the number of output samples.
+
+    Raises:
+        NvidiaRivaException: (500) when numpy is not installed. Decode
+            failures raised by ``_decode_to_float32`` propagate unchanged.
+    """
+    try:
+        import numpy as np  # type: ignore
+    except ImportError as e:
+        raise NvidiaRivaException(
+            status_code=500,
+            message=f"numpy is required for Riva audio resampling. {_INSTALL_HINT}",
+        ) from e
+
+    decoded, source_rate = _decode_to_float32(file_bytes)
+
+    if decoded.ndim == 2:
+        # (n_samples, n_channels): average real multi-channel audio, or just
+        # unwrap the single column.
+        decoded = decoded.mean(axis=1) if decoded.shape[1] > 1 else decoded[:, 0]
+
+    mono = np.asarray(decoded, dtype=np.float32).ravel()
+
+    if source_rate != RIVA_TARGET_SAMPLE_RATE_HZ:
+        mono = _resample(mono, source_rate, RIVA_TARGET_SAMPLE_RATE_HZ)
+
+    # Limit to [-1, 1] in place, then scale to the int16 range.
+    np.clip(mono, -1.0, 1.0, out=mono)
+    pcm = (mono * 32767.0).astype("<i2")
+
+    return ResampledAudio(
+        pcm_bytes=pcm.tobytes(),
+        duration_seconds=float(pcm.size) / float(RIVA_TARGET_SAMPLE_RATE_HZ),
+        sample_rate_hz=RIVA_TARGET_SAMPLE_RATE_HZ,
+        num_channels=RIVA_TARGET_NUM_CHANNELS,
+    )
+
+
+def _decode_to_float32(file_bytes: bytes) -> Tuple["FloatArray", int]:
+    """
+    Decode arbitrary audio bytes into a float32 array plus its sample rate.
+
+    The array is shaped ``(n_samples,)`` for mono input or
+    ``(n_samples, n_channels)`` for multi-channel input.
+
+    ``soundfile`` is tried first (wav/flac/ogg). If it is not installed or
+    cannot decode the input, ``audioread`` is used for compressed formats.
+
+    Raises:
+        NvidiaRivaException: (400) when neither library can decode the
+            input, or when decoding succeeds but yields no samples.
+    """
+    import numpy as np  # type: ignore
+
+    sf_error: Exception | None = None
+    try:
+        import soundfile as sf  # type: ignore
+
+        with io.BytesIO(file_bytes) as buf:
+            data, source_rate = sf.read(buf, dtype="float32", always_2d=False)
+    except Exception as e:
+        # Either soundfile is not installed (ImportError) or it raised
+        # RuntimeError / LibsndfileError for a format it cannot decode
+        # (mp3 on older libsndfile, m4a, webm, ...). Both fall through to
+        # audioread.
+        sf_error = e
+    else:
+        # Reject empty audio here too, so both decode paths agree instead of
+        # one of them silently streaming nothing to Riva.
+        if data.size == 0:
+            raise NvidiaRivaException(
+                status_code=400,
+                message="Audio decode produced no samples.",
+            )
+        return cast("FloatArray", data), int(source_rate)
+
+    try:
+        import audioread  # type: ignore
+    except ImportError as e:
+        raise NvidiaRivaException(
+            status_code=400,
+            message=(
+                "Could not decode audio for Riva STT. Install audio extras "
+                f"(`pip install 'litellm[stt-nvidia-riva]'`) or convert your "
+                f"audio to wav/flac/ogg before calling the API. "
+                f"Underlying error: {sf_error}"
+            ),
+        ) from e
+
+    # audioread backends (FFmpeg subprocess, GStreamer, Core Audio) require a
+    # filesystem path, so spill the bytes to a temp file. mkstemp is portable
+    # to Windows where re-opening a NamedTemporaryFile is not allowed.
+    fd, tmp_path = tempfile.mkstemp(suffix=".audio")
+    try:
+        with os.fdopen(fd, "wb") as tmp_file:
+            tmp_file.write(file_bytes)
+        try:
+            with audioread.audio_open(tmp_path) as src:
+                source_rate = int(src.samplerate)
+                channels = int(src.channels)
+                # audioread yields 16-bit little-endian PCM buffers; pin the
+                # byte order so big-endian hosts decode them correctly.
+                chunks = [np.frombuffer(buf, dtype="<i2") for buf in src]
+                if not chunks:
+                    raise NvidiaRivaException(
+                        status_code=400,
+                        message="Audio decode produced no samples.",
+                    )
+                interleaved = np.concatenate(chunks).astype(np.float32) / 32768.0
+                if channels > 1:
+                    # De-interleave into (n_samples, n_channels).
+                    interleaved = interleaved.reshape(-1, channels)
+                return cast("FloatArray", interleaved), source_rate
+        except NvidiaRivaException:
+            raise
+        except Exception as e:
+            raise NvidiaRivaException(
+                status_code=400,
+                message=(
+                    "Could not decode audio for Riva STT. Convert your audio to "
+                    f"wav/flac/ogg before calling the API. Underlying error: {e}"
+                ),
+            ) from e
+    finally:
+        # Best-effort cleanup; a leftover temp file must not mask the result
+        # or the original error.
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+
+
+def _resample(
+    samples: "FloatArray", source_rate: int, target_rate: int
+) -> "FloatArray":
+    """
+    Convert mono float32 ``samples`` from ``source_rate`` to ``target_rate``.
+
+    Backends are tried in order: ``soxr``, then ``scipy.signal.resample_poly``
+    (both anti-aliased, which matters when downsampling 44.1/48 kHz audio to
+    16 kHz), and finally plain linear interpolation when neither package is
+    installed. Input is returned untouched when the rates already match or
+    there are no samples.
+    """
+    import numpy as np  # type: ignore
+
+    if source_rate == target_rate or samples.size == 0:
+        return samples
+
+    try:
+        import soxr  # type: ignore
+
+        converted = soxr.resample(samples, source_rate, target_rate)
+        return cast("FloatArray", np.asarray(converted, dtype=np.float32))
+    except ImportError:
+        pass
+
+    try:
+        import math
+
+        from scipy.signal import resample_poly  # type: ignore
+
+        # Reduce the rate ratio to lowest terms for the polyphase filter.
+        divisor = math.gcd(int(source_rate), int(target_rate))
+        converted = resample_poly(
+            samples, int(target_rate) // divisor, int(source_rate) // divisor
+        )
+        return cast("FloatArray", np.asarray(converted, dtype=np.float32))
+    except ImportError:
+        pass
+
+    return _linear_resample(samples, source_rate, target_rate)
+
+
+def _linear_resample(
+    samples: "FloatArray", source_rate: int, target_rate: int
+) -> "FloatArray":
+    """
+    Resample mono ``samples`` by linear interpolation (no anti-aliasing).
+
+    The output has ``round(len(samples) * target_rate / source_rate)``
+    samples, with a minimum of one for non-empty input. Empty input yields
+    an empty float32 array.
+    """
+    import numpy as np  # type: ignore
+
+    if samples.size == 0:
+        return samples.astype(np.float32)
+
+    duration = samples.size / float(source_rate)
+    # Very short clips used to be returned at their original length, i.e.
+    # still at the source rate. Clamp to one sample instead so the output
+    # length always follows the target rate.
+    target_length = max(1, int(round(duration * target_rate)))
+
+    # Fractional source positions, spread evenly over the input.
+    src_indices = np.linspace(0, samples.size - 1, num=target_length, dtype=np.float64)
+    left = np.floor(src_indices).astype(np.int64)
+    right = np.minimum(left + 1, samples.size - 1)
+    frac = (src_indices - left).astype(np.float32)
+
+    return ((1.0 - frac) * samples[left] + frac * samples[right]).astype(np.float32)
@@ -0,0 +1,444 @@
+"""
+NVIDIA Riva STT handler.
+
+This module bridges litellm's transcription dispatch to NVIDIA Riva's gRPC
+streaming ASR API. We do *not* go through ``base_llm_http_handler`` because
+Riva is gRPC-only: HTTP-shaped abstractions (``httpx.Response``,
+``api_base/v1/...`` URLs, multipart bodies) do not apply.
+
+The handler is intentionally a thin orchestration layer:
+
+1. Resample the inbound audio to 16 kHz mono LINEAR_PCM (Riva's required
+   wire format).
+2. Build ``RecognitionConfig`` / ``StreamingRecognitionConfig`` protobufs
+   from the structured dict produced by
+   :class:`NvidiaRivaAudioTranscriptionConfig`.
+3. Construct ``riva.client.Auth`` honoring NVCF (function-id metadata + TLS)
+   vs self-hosted (any host:port, optional TLS) modes.
+4. Stream the audio through Riva's ``streaming_response_generator`` and
+   aggregate ``is_final`` results into a single transcript.
+5. Return a normalized ``TranscriptionResponse`` with ``duration`` exposed
+   on ``_hidden_params`` so cost calculation works.
+
+``riva-client`` is imported lazily so ``litellm`` core remains usable
+without the optional STT extras installed.
+"""
+
+import asyncio
+import inspect
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+from litellm.litellm_core_utils.audio_utils.utils import (
+    get_audio_file_name,
+    process_audio_file,
+)
+from litellm.llms.nvidia_riva.audio_transcription.audio_utils import (
+    resample_to_riva_pcm,
+)
+from litellm.llms.nvidia_riva.audio_transcription.transformation import (
+    NvidiaRivaAudioTranscriptionConfig,
+    RIVA_TARGET_NUM_CHANNELS,
+    RIVA_TARGET_SAMPLE_RATE_HZ,
+)
+from litellm.llms.nvidia_riva.common_utils import (
+    NvidiaRivaException,
+    grpc_error_to_litellm_exception,
+)
+from litellm.types.utils import FileTypes, TranscriptionResponse
+from litellm.utils import convert_to_model_response_object
+
+if TYPE_CHECKING:
+    from litellm.litellm_core_utils.litellm_logging import (
+        Logging as LiteLLMLoggingObj,
+    )
+
+# Stream audio to Riva in ~100 ms slices (1600 samples at 16 kHz). Matches
+# NVIDIA's recommended chunk size for streaming ASR — small enough for
+# responsive endpointing, large enough to keep per-RPC overhead low.
+_DEFAULT_CHUNK_SAMPLES = 1600
+_DEFAULT_CHUNK_BYTES = _DEFAULT_CHUNK_SAMPLES * 2  # int16 = 2 bytes/sample
+
+
+_RIVA_INSTALL_HINT = (
+    "NVIDIA Riva client is not installed. "
+    "Install with `pip install 'litellm[stt-nvidia-riva]'`."
+)
+
+
+class NvidiaRivaAudioTranscription:
+    """Sync + async entry point for Riva ASR."""
+
+    def audio_transcriptions(
+        self,
+        model: str,
+        audio_file: FileTypes,
+        optional_params: dict,
+        litellm_params: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        logging_obj: "LiteLLMLoggingObj",
+        api_key: Optional[str],
+        api_base: Optional[str],
+        atranscription: bool = False,
+        provider_config: Optional[NvidiaRivaAudioTranscriptionConfig] = None,
+    ):
+        """
+        Dispatch a transcription request to the sync or async code path.
+
+        Returns the ``TranscriptionResponse`` directly for synchronous calls.
+        When ``atranscription`` is true, returns the coroutine produced by
+        :meth:`async_audio_transcriptions` for the caller to await. A default
+        ``NvidiaRivaAudioTranscriptionConfig`` is created when
+        ``provider_config`` is not supplied.
+        """
+        config = (
+            provider_config
+            if provider_config is not None
+            else NvidiaRivaAudioTranscriptionConfig()
+        )
+
+        # Arguments shared by both execution paths.
+        shared_kwargs: Dict[str, Any] = {
+            "model": model,
+            "audio_file": audio_file,
+            "optional_params": optional_params,
+            "litellm_params": litellm_params,
+            "model_response": model_response,
+            "timeout": timeout,
+            "logging_obj": logging_obj,
+            "api_key": api_key,
+            "api_base": api_base,
+            "provider_config": config,
+        }
+
+        if not atranscription:
+            return self._run_sync(atranscription=atranscription, **shared_kwargs)
+        return self.async_audio_transcriptions(**shared_kwargs)
+
+    async def async_audio_transcriptions(
+        self,
+        model: str,
+        audio_file: FileTypes,
+        optional_params: dict,
+        litellm_params: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        logging_obj: "LiteLLMLoggingObj",
+        api_key: Optional[str],
+        api_base: Optional[str],
+        provider_config: Optional[NvidiaRivaAudioTranscriptionConfig] = None,
+    ) -> TranscriptionResponse:
+        """
+        Async variant of the transcription call.
+
+        ``riva-client`` only offers a synchronous streaming generator, so the
+        blocking :meth:`_run_sync` call is handed to a worker thread and
+        awaited, leaving the event loop free.
+        """
+        run_kwargs: Dict[str, Any] = {
+            "model": model,
+            "audio_file": audio_file,
+            "optional_params": optional_params,
+            "litellm_params": litellm_params,
+            "model_response": model_response,
+            "timeout": timeout,
+            "logging_obj": logging_obj,
+            "api_key": api_key,
+            "api_base": api_base,
+            "provider_config": provider_config or NvidiaRivaAudioTranscriptionConfig(),
+            "atranscription": True,
+        }
+        return await asyncio.to_thread(self._run_sync, **run_kwargs)
+
+    def _run_sync(
+        self,
+        model: str,
+        audio_file: FileTypes,
+        optional_params: dict,
+        litellm_params: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        logging_obj: "LiteLLMLoggingObj",
+        api_key: Optional[str],
+        api_base: Optional[str],
+        provider_config: NvidiaRivaAudioTranscriptionConfig,
+        atranscription: bool = False,
+    ) -> TranscriptionResponse:
+        """
+        Run one blocking transcription against Riva and return the response.
+
+        Resamples the audio, builds the recognition config, streams the PCM
+        chunks through the Riva ASR service and converts the collected final
+        results into a ``TranscriptionResponse``. ``atranscription`` is only
+        recorded in the pre-call log here.
+
+        Raises:
+            NvidiaRivaException: when ``api_base`` is missing (400) or the
+                provider config returns a non-dict request payload (500).
+            Exception: any other error raised while streaming is converted
+                by ``grpc_error_to_litellm_exception`` and re-raised.
+        """
+        if not api_base:
+            raise NvidiaRivaException(
+                status_code=400,
+                message=(
+                    "NVIDIA Riva requires `api_base` (host:port for the gRPC "
+                    "endpoint, e.g. `grpc.nvcf.nvidia.com:443` or "
+                    "`localhost:50051`). Set it in litellm_params or via "
+                    "NVIDIA_RIVA_API_BASE."
+                ),
+            )
+
+        # Normalize the caller's file input, then convert it to the PCM
+        # stream that will be sent over gRPC.
+        processed = process_audio_file(audio_file)
+        resampled = resample_to_riva_pcm(processed.file_content)
+
+        request_payload = provider_config.transform_audio_transcription_request(
+            model=model,
+            audio_file=audio_file,
+            optional_params=optional_params,
+            litellm_params={
+                **litellm_params,
+                "api_base": api_base,
+                "api_key": api_key,
+            },
+        ).data
+        if not isinstance(request_payload, dict):
+            raise NvidiaRivaException(
+                status_code=500,
+                message="NvidiaRivaAudioTranscriptionConfig produced an unexpected request payload type.",
+            )
+
+        recognition_config_dict: Dict[str, Any] = request_payload["recognition_config"]
+        # The wire format is fixed by our resampler; override anything stale
+        # the caller passed in so the gRPC config matches the bytes we send.
+        recognition_config_dict["sample_rate_hertz"] = RIVA_TARGET_SAMPLE_RATE_HZ
+        recognition_config_dict["audio_channel_count"] = RIVA_TARGET_NUM_CHANNELS
+        recognition_config_dict["encoding"] = "LINEAR_PCM"
+
+        # Response shaping options; the format falls back to "json".
+        response_format = request_payload.get("response_format") or "json"
+        timestamp_granularities = request_payload.get("timestamp_granularities")
+
+        riva_module, riva_asr_module = _import_riva()
+        auth_obj = self._construct_auth(
+            riva_module=riva_module,
+            api_base=api_base,
+            api_key=api_key,
+            optional_params=optional_params,
+        )
+
+        recognition_config = self._build_recognition_config_proto(
+            riva_asr_module=riva_asr_module,
+            recognition_config_dict=recognition_config_dict,
+        )
+        # Interim results are disabled: only final results are requested.
+        streaming_config = riva_asr_module.StreamingRecognitionConfig(
+            config=recognition_config, interim_results=False
+        )
+
+        logging_obj.pre_call(
+            input=None,
+            api_key=api_key,
+            additional_args={
+                "api_base": api_base,
+                "atranscription": atranscription,
+                "complete_input_dict": {
+                    "recognition_config": recognition_config_dict,
+                    # Log only whether a function id was supplied, not its
+                    # value.
+                    "nvcf_function_id_set": bool(
+                        optional_params.get("nvcf_function_id")
+                    ),
+                    "use_ssl": optional_params.get("use_ssl"),
+                },
+            },
+        )
+
+        try:
+            asr_service = riva_module.ASRService(auth_obj)
+            audio_chunks = self._iter_audio_chunks(resampled.pcm_bytes)
+            stream_kwargs: Dict[str, Any] = {
+                "audio_chunks": audio_chunks,
+                "streaming_config": streaming_config,
+            }
+            # Forward the deadline so the stream cannot block forever if the
+            # server stalls. Older riva-client versions do not accept a
+            # ``timeout`` kwarg, so pass it only when supported.
+            if timeout is not None and self._supports_timeout_kwarg(
+                asr_service.streaming_response_generator
+            ):
+                stream_kwargs["timeout"] = float(timeout)
+            stream = asr_service.streaming_response_generator(**stream_kwargs)
+            final_results = self._collect_final_results(stream)
+        except NvidiaRivaException:
+            # Already in the provider's exception type; pass through as is.
+            raise
+        except Exception as e:
+            raise grpc_error_to_litellm_exception(e) from e
+
+        transcription = NvidiaRivaAudioTranscriptionConfig.build_transcription_response(
+            final_results=final_results,
+            response_format=response_format,
+            duration_seconds=resampled.duration_seconds,
+            timestamp_granularities=timestamp_granularities,
+        )
+
+        stringified_response = dict(transcription)
+
+        logging_obj.post_call(
+            input=get_audio_file_name(audio_file),
+            api_key=api_key,
+            additional_args={"complete_input_dict": recognition_config_dict},
+            original_response=stringified_response,
+        )
+
+        # Passed to ``convert_to_model_response_object`` below; carries the
+        # audio duration measured from the resampled PCM.
+        hidden_params = {
+            "model": model,
+            "custom_llm_provider": "nvidia_riva",
+            "audio_transcription_duration": resampled.duration_seconds,
+        }
+
+        final_response: TranscriptionResponse = convert_to_model_response_object(  # type: ignore
+            response_object=stringified_response,
+            model_response_object=model_response,
+            hidden_params=hidden_params,
+            response_type="audio_transcription",
+        )
+
+        return final_response
+
+    def _construct_auth(
+        self,
+        riva_module: Any,
+        api_base: str,
+        api_key: Optional[str],
+        optional_params: dict,
+    ) -> Any:
+        """
+        Build a ``riva.client.Auth`` object.
+
+        - When ``nvcf_function_id`` is provided we attach the NVCF
+          ``function-id`` metadata and default ``use_ssl`` to True (NVCF
+          endpoints are TLS-only).
+        - Otherwise (self-hosted) we default ``use_ssl`` to False but still
+          honor an explicit override — self-hosted Riva behind an ingress
+          with TLS termination is a real deployment topology.
+        - A bearer ``authorization`` entry is attached whenever ``api_key``
+          is set, in either mode.
+
+        ``use_ssl`` may arrive as a string (e.g. from form-encoded request
+        fields), so ``"false"``, ``"0"``, ``"no"``, ``"off"`` and the empty
+        string are read as False instead of being treated as truthy.
+
+        Args:
+            riva_module: The imported ``riva.client`` module.
+            api_base: ``host:port`` of the gRPC endpoint.
+            api_key: Optional bearer token.
+            optional_params: Request params; ``nvcf_function_id`` and
+                ``use_ssl`` are read from it.
+
+        Returns:
+            The constructed ``Auth`` instance.
+        """
+        nvcf_function_id = optional_params.get("nvcf_function_id")
+        use_ssl_override = optional_params.get("use_ssl")
+
+        if use_ssl_override is None:
+            use_ssl = bool(nvcf_function_id)
+        elif isinstance(use_ssl_override, str):
+            # bool("false") is True, so spell out the falsy strings.
+            use_ssl = use_ssl_override.strip().lower() not in (
+                "",
+                "0",
+                "false",
+                "no",
+                "off",
+            )
+        else:
+            use_ssl = bool(use_ssl_override)
+
+        metadata: List[Tuple[str, str]] = []
+        if nvcf_function_id:
+            metadata.append(("function-id", str(nvcf_function_id)))
+        if api_key:
+            metadata.append(("authorization", f"Bearer {api_key}"))
+
+        try:
+            return riva_module.Auth(
+                uri=api_base, use_ssl=use_ssl, metadata_args=metadata
+            )
+        except TypeError:
+            # Older riva-client signatures used positional-only args.
+            return riva_module.Auth(None, use_ssl, api_base, metadata)
+
+    def _build_recognition_config_proto(
+        self, riva_asr_module: Any, recognition_config_dict: Dict[str, Any]
+    ):
+        """
+        Turn the recognition-config dict into a ``RecognitionConfig`` proto.
+
+        ``sample_rate_hertz``, ``language_code`` and ``audio_channel_count``
+        are required keys; every other field has a default. An unrecognized
+        encoding name falls back to ``LINEAR_PCM``. A non-empty
+        ``endpointing_config`` dict is applied on a best-effort basis.
+        """
+        settings = recognition_config_dict
+
+        encoding_name = (settings.get("encoding") or "LINEAR_PCM").upper()
+        encoding_enum = getattr(
+            riva_asr_module.AudioEncoding,
+            encoding_name,
+            riva_asr_module.AudioEncoding.LINEAR_PCM,
+        )
+
+        proto_fields: Dict[str, Any] = {
+            "encoding": encoding_enum,
+            "sample_rate_hertz": int(settings["sample_rate_hertz"]),
+            "language_code": settings["language_code"],
+            "audio_channel_count": int(settings["audio_channel_count"]),
+            "enable_automatic_punctuation": bool(
+                settings.get("enable_automatic_punctuation", True)
+            ),
+            "enable_word_time_offsets": bool(
+                settings.get("enable_word_time_offsets", False)
+            ),
+            "max_alternatives": int(settings.get("max_alternatives", 1)),
+            "model": settings.get("model", "") or "",
+            "verbatim_transcripts": bool(settings.get("verbatim_transcripts", False)),
+            "profanity_filter": bool(settings.get("profanity_filter", False)),
+        }
+        config = riva_asr_module.RecognitionConfig(**proto_fields)
+
+        endpointing = settings.get("endpointing_config")
+        if not (isinstance(endpointing, dict) and endpointing):
+            return config
+
+        try:
+            config.endpointing_config.CopyFrom(
+                riva_asr_module.EndpointingConfig(**endpointing)
+            )
+        except Exception:
+            # An unknown EndpointingConfig field (older Riva server) must not
+            # fail the whole request; keep Riva's defaults instead.
+            pass
+
+        return config
+
+    @staticmethod
+    def _supports_timeout_kwarg(callable_obj: Any) -> bool:
+        """
+        Report whether ``callable_obj`` can be called with ``timeout=...``.
+
+        True when its signature names a ``timeout`` parameter or accepts
+        ``**kwargs``. False when the signature cannot be inspected.
+        """
+        try:
+            parameters = inspect.signature(callable_obj).parameters
+        except (TypeError, ValueError):
+            return False
+
+        if "timeout" in parameters:
+            return True
+        for parameter in parameters.values():
+            if parameter.kind == inspect.Parameter.VAR_KEYWORD:
+                return True
+        return False
+
+    @staticmethod
+    def _iter_audio_chunks(pcm_bytes: bytes):
+        """
+        Yield ``pcm_bytes`` in consecutive ``_DEFAULT_CHUNK_BYTES`` slices.
+
+        The last slice may be shorter; empty input yields nothing.
+        """
+        # Offsets stop before len(pcm_bytes), so every slice is non-empty.
+        offsets = range(0, len(pcm_bytes), _DEFAULT_CHUNK_BYTES)
+        yield from (
+            pcm_bytes[start : start + _DEFAULT_CHUNK_BYTES] for start in offsets
+        )
+
+    @staticmethod
+    def _collect_final_results(stream) -> List[Dict[str, Any]]:
+        """
+        Drain the response stream and keep only final recognition results.
+
+        Responses without ``results``, results that are not ``is_final`` and
+        results without alternatives are skipped. For each remaining result
+        the first alternative is converted to
+        ``{"transcript": str, "words": [{"word", "start_time_ms",
+        "end_time_ms"}, ...]}``.
+        """
+        collected: List[Dict[str, Any]] = []
+        for response in stream:
+            for result in getattr(response, "results", None) or []:
+                if not getattr(result, "is_final", False):
+                    continue
+                alternatives = getattr(result, "alternatives", None) or []
+                if not alternatives:
+                    continue
+
+                best = alternatives[0]
+                word_entries = [
+                    {
+                        "word": getattr(item, "word", ""),
+                        "start_time_ms": int(getattr(item, "start_time", 0) or 0),
+                        "end_time_ms": int(getattr(item, "end_time", 0) or 0),
+                    }
+                    for item in (getattr(best, "words", None) or [])
+                ]
+                collected.append(
+                    {
+                        "transcript": getattr(best, "transcript", "") or "",
+                        "words": word_entries,
+                    }
+                )
+        return collected
+
+
+def _import_riva():
+    """
+    Import the Riva client lazily and locate the ASR message classes.
+
+    Returns ``(riva_client, riva_asr_module)``. The second item is
+    ``riva.client`` itself when it exposes ``RecognitionConfig``, otherwise
+    ``riva.client.proto.riva_asr_pb2``.
+
+    Raises:
+        NvidiaRivaException: (500, with an install hint) when either import
+            fails.
+    """
+    try:
+        import riva.client as riva_client  # type: ignore
+    except ImportError as e:
+        raise NvidiaRivaException(status_code=500, message=_RIVA_INSTALL_HINT) from e
+
+    # Preferred: the SDK package exposes the message classes directly.
+    if hasattr(riva_client, "RecognitionConfig"):
+        return riva_client, riva_client
+
+    # Otherwise fall back to the generated proto module.
+    try:
+        import riva.client.proto.riva_asr_pb2 as riva_asr_pb2  # type: ignore
+    except ImportError as e:
+        raise NvidiaRivaException(
+            status_code=500, message=_RIVA_INSTALL_HINT
+        ) from e
+
+    return riva_client, riva_asr_pb2
@@ -0,0 +1,284 @@
+"""
+Translates from OpenAI's `/v1/audio/transcriptions` to NVIDIA Riva's gRPC
+streaming recognize API.
+
+Riva is gRPC-only, so unlike most providers in this directory the request
+"transformation" produced here is a structured dict consumed directly by the
+gRPC handler (rather than HTTP form-data). The handler builds Riva
+``RecognitionConfig`` / ``StreamingRecognitionConfig`` protobufs from this
+dict at call time.
+
+Reference: https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-overview.html
+"""
+
+from typing import Any, Dict, List, Optional, Union
+
+from httpx import Headers, Response
+
+from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.openai import (
+    AllMessageValues,
+    OpenAIAudioTranscriptionOptionalParams,
+)
+from litellm.types.utils import FileTypes, TranscriptionResponse
+
+from ...base_llm.audio_transcription.transformation import (
+    AudioTranscriptionRequestData,
+    BaseAudioTranscriptionConfig,
+)
+from ..common_utils import NvidiaRivaException
+
+# Riva expects a fixed wire format for the audio chunks we stream in.
+RIVA_TARGET_SAMPLE_RATE_HZ = 16000  # 16 kHz
+RIVA_TARGET_NUM_CHANNELS = 1  # mono
+RIVA_TARGET_ENCODING = "LINEAR_PCM"  # default for the config's "encoding" field
+
+
+class NvidiaRivaAudioTranscriptionConfig(BaseAudioTranscriptionConfig):
+    """
+    Config for NVIDIA Riva ASR (gRPC).
+
+    Supports both NVCF-hosted (``api_base=grpc.nvcf.nvidia.com:443`` +
+    ``nvcf_function_id``) and self-hosted deployments (any ``host:port``,
+    optional TLS via ``use_ssl``).
+    """
+
+    def get_supported_openai_params(
+        self, model: str
+    ) -> List[OpenAIAudioTranscriptionOptionalParams]:
+        # Riva natively understands language + word timestamps.
+        # `response_format` is honored at response-shaping time in the handler.
+        return ["language", "response_format", "timestamp_granularities"]
+
+    def map_openai_params(
+        self,
+        non_default_params: dict,
+        optional_params: dict,
+        model: str,
+        drop_params: bool,
+    ) -> dict:
+        """Map OpenAI params onto Riva option names; other keys pass through."""
+        for key, value in non_default_params.items():
+            if value is None:
+                continue
+            if key == "language":
+                optional_params["language_code"] = self._normalize_language_code(value)
+            elif key == "timestamp_granularities":
+                # OpenAI accepts ["word"], ["segment"], or both. Riva only
+                # natively exposes word timing, so we toggle it on whenever
+                # "word" is requested. The list is also stored verbatim so the
+                # response builder knows whether to emit word timestamps.
+                if isinstance(value, (list, tuple)) and "word" in value:
+                    optional_params["enable_word_time_offsets"] = True
+                optional_params["timestamp_granularities"] = value
+            elif key == "response_format":
+                # Stored verbatim; consumed by build_transcription_response.
+                optional_params["response_format"] = value
+            else:
+                optional_params[key] = value
+
+        return optional_params
+
+    def get_error_class(
+        self, error_message: str, status_code: int, headers: Union[dict, Headers]
+    ) -> BaseLLMException:
+        return NvidiaRivaException(
+            message=error_message, status_code=status_code, headers=headers
+        )
+
+    def transform_audio_transcription_request(
+        self,
+        model: str,
+        audio_file: FileTypes,
+        optional_params: dict,
+        litellm_params: dict,
+    ) -> AudioTranscriptionRequestData:
+        """
+        Build a structured dict that the gRPC handler consumes. We do *not*
+        construct protobufs here, so this module remains importable without
+        ``nvidia-riva-client`` being installed (matching how other providers
+        defer SDK imports to handler-call time).
+        """
+        recognition_config = self._build_recognition_config_dict(
+            model=model,
+            optional_params=optional_params,
+        )
+
+        endpointing_config = self._build_endpointing_config_dict(optional_params)
+        if endpointing_config is not None:
+            recognition_config["endpointing_config"] = endpointing_config
+
+        request_payload: Dict[str, Any] = {
+            "recognition_config": recognition_config,
+            "response_format": optional_params.get("response_format") or "json",
+            "timestamp_granularities": optional_params.get("timestamp_granularities"),
+        }
+
+        return AudioTranscriptionRequestData(data=request_payload, files=None)
+
+    def transform_audio_transcription_response(
+        self,
+        raw_response: Response,
+    ) -> TranscriptionResponse:
+        # Not used: Riva responses come from a gRPC stream, not an httpx
+        # response. The handler calls build_transcription_response directly.
+        raise NotImplementedError(
+            "NvidiaRivaAudioTranscriptionConfig.transform_audio_transcription_response "
+            "is not used. The handler builds the TranscriptionResponse directly "
+            "from Riva's gRPC streaming results."
+        )
+
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        litellm_params: dict,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ) -> dict:
+        # gRPC auth is constructed in the handler, not via HTTP headers.
+        return headers
+
+    def _build_recognition_config_dict(
+        self, model: str, optional_params: dict
+    ) -> Dict[str, Any]:
+        """
+        Build the Riva ``RecognitionConfig`` shape as a plain dict.
+
+        ``model`` is intentionally left empty when the user has not supplied
+        ``riva_model_name``. Riva auto-selects the right deployment from
+        ``language_code`` + ``sample_rate_hertz``. NVIDIA's internal
+        deployment names (e.g. ``parakeet-1.1b-en-US-asr-streaming-...``)
+        change across NIM versions, regions, and self-hosted builds, so
+        hardcoding any name here would break unpredictably.
+        """
+        return {
+            "language_code": optional_params.get("language_code") or "en-US",
+            "sample_rate_hertz": optional_params.get(
+                "sample_rate_hertz", RIVA_TARGET_SAMPLE_RATE_HZ
+            ),
+            "encoding": optional_params.get("encoding", RIVA_TARGET_ENCODING),
+            "audio_channel_count": optional_params.get(
+                "audio_channel_count", RIVA_TARGET_NUM_CHANNELS
+            ),
+            "enable_automatic_punctuation": optional_params.get(
+                "enable_automatic_punctuation", True
+            ),
+            "enable_word_time_offsets": bool(
+                optional_params.get("enable_word_time_offsets", False)
+            ),
+            "max_alternatives": optional_params.get("max_alternatives", 1),
+            "model": optional_params.get("riva_model_name", ""),
+            "verbatim_transcripts": optional_params.get("verbatim_transcripts", False),
+            "profanity_filter": optional_params.get("profanity_filter", False),
+        }
+
+    def _build_endpointing_config_dict(
+        self, optional_params: dict
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Translate an OpenAI-style ``chunking_strategy`` into Riva's
+        ``EndpointingConfig`` shape, or pass through an explicit
+        ``endpointing_config`` dict.
+
+        Returns ``None`` when neither is provided so Riva uses its built-in
+        VAD defaults.
+        """
+        explicit = optional_params.get("endpointing_config")
+        if isinstance(explicit, dict):
+            return dict(explicit)
+
+        chunking = optional_params.get("chunking_strategy")
+        if chunking in (None, "auto"):
+            return None
+
+        if isinstance(chunking, dict) and chunking.get("type") == "server_vad":
+            config: Dict[str, Any] = {}
+            if "threshold" in chunking:
+                threshold = float(chunking["threshold"])
+                config["start_threshold"] = threshold
+                config["stop_threshold"] = threshold
+            if "silence_duration_ms" in chunking:
+                config["stop_history"] = int(chunking["silence_duration_ms"])
+            if "prefix_padding_ms" in chunking:
+                config["stop_history_eou"] = int(chunking["prefix_padding_ms"])
+            return config or None
+
+        return None
+
+    @staticmethod
+    def _normalize_language_code(language: str) -> str:
+        """
+        Normalize an OpenAI-style language (``en``, ``en_US``) to the BCP-47 tag
+        Riva requires; blank input gives ``en-US``, unknown bare codes pass through.
+        """
+        if not isinstance(language, str) or not language.strip():
+            return "en-US"
+        language = language.strip().replace("_", "-")
+        if "-" in language:
+            return language
+        bare_to_bcp47 = {
+            "en": "en-US",
+            "es": "es-ES",
+            "de": "de-DE",
+            "fr": "fr-FR",
+            "it": "it-IT",
+            "pt": "pt-BR",
+            "ja": "ja-JP",
+            "ko": "ko-KR",
+            "zh": "zh-CN",
+            "ru": "ru-RU",
+            "hi": "hi-IN",
+            "ar": "ar-SA",
+        }
+        return bare_to_bcp47.get(language.lower(), language)
+
+    @staticmethod
+    def build_transcription_response(
+        final_results: List[Dict[str, Any]],
+        response_format: str,
+        duration_seconds: Optional[float],
+        timestamp_granularities: Optional[List[str]],
+    ) -> TranscriptionResponse:
+        """
+        Aggregate a list of normalized "final result" dicts into a
+        ``TranscriptionResponse`` shaped for the requested ``response_format``.
+
+        Each entry in ``final_results`` is expected to look like::
+
+            {
+                "transcript": str,
+                "words": [{"word": str, "start_time_ms": int, "end_time_ms": int}, ...],
+            }
+
+        which the handler produces by walking the gRPC stream and keeping
+        only ``result.is_final`` entries (empty/non-final chunks are
+        ignored).
+        """
+        full_transcript = "".join(
+            (item.get("transcript") or "") for item in final_results
+        ).strip()
+
+        response = TranscriptionResponse(text=full_transcript)
+        response["task"] = "transcribe"
+
+        if response_format == "verbose_json":
+            words: List[Dict[str, Any]] = []
+            if timestamp_granularities and "word" in timestamp_granularities:
+                for item in final_results:
+                    for word in item.get("words", []) or []:
+                        words.append(
+                            {
+                                "word": word.get("word", ""),
+                                "start": float(word.get("start_time_ms") or 0) / 1000.0,
+                                "end": float(word.get("end_time_ms") or 0) / 1000.0,
+                            }
+                        )
+            if words:
+                response["words"] = words
+            if duration_seconds is not None:
+                response["duration"] = duration_seconds
+
+        return response
@@ -0,0 +1,92 @@
+"""
+Common utilities and exceptions for the NVIDIA Riva STT provider
+"""
+
+from typing import Any, Optional
+
+from litellm.llms.base_llm.chat.transformation import BaseLLMException
+
+
+class NvidiaRivaException(BaseLLMException):
+    """
+    Provider-specific error type for the NVIDIA Riva (gRPC) integration.
+
+    Riva speaks gRPC rather than HTTP, so ``status_code`` carries the
+    HTTP-equivalent of the gRPC ``StatusCode`` when one is available. This
+    lets litellm's existing status-based error classifiers
+    (RateLimitError, AuthenticationError, etc.) keep working. The class
+    adds no behavior of its own on top of ``BaseLLMException``.
+    """
+
+
+# Mapping from grpc.StatusCode.name -> equivalent HTTP status code; names not
+# listed here fall back to 500 at lookup time. Kept as a plain dict (rather
+# than importing grpc enums) so this module is importable without grpc.
+_GRPC_STATUS_CODE_TO_HTTP: dict = {
+    "OK": 200,
+    "CANCELLED": 499,  # non-standard "client closed request" code
+    "UNKNOWN": 500,
+    "INVALID_ARGUMENT": 400,
+    "DEADLINE_EXCEEDED": 504,
+    "NOT_FOUND": 404,
+    "ALREADY_EXISTS": 409,
+    "PERMISSION_DENIED": 403,
+    "RESOURCE_EXHAUSTED": 429,
+    "FAILED_PRECONDITION": 400,
+    "ABORTED": 409,
+    "OUT_OF_RANGE": 400,
+    "UNIMPLEMENTED": 501,
+    "INTERNAL": 500,
+    "UNAVAILABLE": 503,
+    "DATA_LOSS": 500,
+    "UNAUTHENTICATED": 401,
+}
+
+
+def _extract_grpc_status_name(error: Any) -> Optional[str]:
+    """
+    Return the gRPC ``StatusCode`` name carried by ``error``, or ``None``.
+
+    Only a callable ``error.code()`` is consulted (``grpc.RpcError`` style);
+    if it raises, or its result has no string ``.name``, the answer is ``None``.
+    """
+    get_code = getattr(error, "code", None)
+    if not callable(get_code):
+        return None
+    try:
+        status = get_code()
+    except Exception:
+        # Best-effort: a misbehaving ``code()`` must not mask the real error.
+        return None
+    status_name = getattr(status, "name", None)
+    return status_name if isinstance(status_name, str) else None
+
+
+def _extract_grpc_details(error: Any) -> Optional[str]:
+    """Return the non-empty string from ``error.details()``, else ``None``."""
+    get_details = getattr(error, "details", None)
+    if not callable(get_details):
+        return None
+    try:
+        text = get_details()
+    except Exception:
+        # Best-effort: a failing ``details()`` must not hide the original error.
+        return None
+    return text if isinstance(text, str) and text else None
+
+
+def grpc_error_to_litellm_exception(error: Exception) -> NvidiaRivaException:
+    """
+    Wrap ``error`` in a ``NvidiaRivaException`` whose ``status_code`` is the
+    HTTP equivalent of its gRPC status (500 when the status is unknown).
+    """
+    grpc_name = _extract_grpc_status_name(error)
+    # Prefer the gRPC details string; fall back to the exception text.
+    reason = _extract_grpc_details(error) or str(error)
+    if grpc_name:
+        text = f"NVIDIA Riva gRPC error ({grpc_name}): {reason}"
+    else:
+        text = f"NVIDIA Riva error: {reason}"
+    return NvidiaRivaException(
+        status_code=_GRPC_STATUS_CODE_TO_HTTP.get(grpc_name or "", 500), message=text
+    )
@@ -211,6 +211,12 @@ from .llms.oobabooga.chat import oobabooga
 from .llms.openai.completion.handler import OpenAITextCompletion
 from .llms.openai.image_variations.handler import OpenAIImageVariationsHandler
 from .llms.openai.openai import OpenAIChatCompletion
+from .llms.nvidia_riva.audio_transcription.handler import (
+    NvidiaRivaAudioTranscription,
+)
+from .llms.nvidia_riva.audio_transcription.transformation import (
+    NvidiaRivaAudioTranscriptionConfig,
+)
 from .llms.openai.transcriptions.handler import OpenAIAudioTranscription
 from .llms.openai_like.chat.handler import OpenAILikeChatHandler
 from .llms.openai_like.embedding.handler import OpenAILikeEmbeddingHandler
@@ -266,6 +272,7 @@ from .types.utils import (
 openai_chat_completions = OpenAIChatCompletion()
 openai_text_completions = OpenAITextCompletion()
 openai_audio_transcriptions = OpenAIAudioTranscription()
+nvidia_riva_audio_transcriptions = NvidiaRivaAudioTranscription()
 openai_image_variations = OpenAIImageVariationsHandler()
 groq_chat_completions = GroqChatCompletion()
 sap_gen_ai_hub_chat_completions = GenAIHubOrchestration()
@@ -6605,6 +6612,26 @@ def transcription(
            litellm_params=litellm_params_dict,
            shared_session=shared_session,
        )
+    elif custom_llm_provider == "nvidia_riva":
+        # NVIDIA Riva is gRPC-based, not HTTP. It has its own dedicated handler
+        # rather than going through base_llm_http_handler.
+        response = nvidia_riva_audio_transcriptions.audio_transcriptions(
+            model=model,
+            audio_file=file,
+            optional_params=optional_params,
+            litellm_params=litellm_params_dict,
+            model_response=model_response,
+            atranscription=atranscription,
+            timeout=timeout,
+            logging_obj=litellm_logging_obj,
+            api_base=api_base,
+            api_key=api_key,
+            provider_config=(
+                provider_config
+                if isinstance(provider_config, NvidiaRivaAudioTranscriptionConfig)
+                else None
+            ),
+        )
    elif provider_config is not None:
        response = base_llm_http_handler.audio_transcriptions(
            model=model,
@@ -3247,6 +3247,7 @@ class LlmProviders(str, Enum):
    A2A = "a2a"
    GIGACHAT = "gigachat"
    NVIDIA_NIM = "nvidia_nim"
+    NVIDIA_RIVA = "nvidia_riva"
    CEREBRAS = "cerebras"
    AI21_CHAT = "ai21_chat"
    VOLCENGINE = "volcengine"
@@ -8545,6 +8545,12 @@ class ProviderConfigManager:
            )

            return MistralAudioTranscriptionConfig()
+        elif litellm.LlmProviders.NVIDIA_RIVA == provider:
+            from litellm.llms.nvidia_riva.audio_transcription.transformation import (
+                NvidiaRivaAudioTranscriptionConfig,
+            )
+
+            return NvidiaRivaAudioTranscriptionConfig()
        return None

    @staticmethod
@@ -1610,6 +1610,22 @@
        "interactions": true
      }
    },
+    "nvidia_riva": {
+      "display_name": "Nvidia Riva (`nvidia_riva`)",
+      "url": "https://docs.litellm.ai/docs/providers/nvidia_riva",
+      "endpoints": {
+        "chat_completions": false,
+        "messages": false,
+        "responses": false,
+        "embeddings": false,
+        "image_generations": false,
+        "audio_transcriptions": true,
+        "audio_speech": false,
+        "moderations": false,
+        "batches": false,
+        "rerank": false
+      }
+    },
    "oci": {
      "display_name": "OCI (`oci`)",
      "url": "https://docs.litellm.ai/docs/providers/oci",
@@ -86,6 +86,14 @@ grpc = [
    # Newest non-yanked release older than the 30-day cutoff.
    "grpcio==1.78.0",
 ]
+stt-nvidia-riva = [
+    # NVIDIA Riva STT provider (gRPC). These are imported lazily inside the
+    # provider handler so litellm core remains usable without them.
+    "nvidia-riva-client>=2.15.0",
+    "soundfile>=0.12.1",
+    "audioread>=3.0.1",
+    "numpy>=1.26.0",
+]
 google = ["google-cloud-aiplatform==1.133.0"]
 proxy-runtime = [
    # Historically bundled in the proxy Docker images via requirements.txt.
@@ -126,6 +126,7 @@ sentry_sdk: >=2.21.0 # Unknown license
 cryptography: >=43.0.1 # Unknown license
 tzdata: >=2025.1 # Unknown license
 urllib3: >=2.0.0 # MIT license - https://github.com/urllib3/urllib3
+audioread: >=3.0.1 # MIT license manually verified - https://github.com/beetbox/audioread
 python-dotenv: >=1.0.0 # Unknown license
 tiktoken: >=0.8.0 # Unknown license
 click: >=8.1.7 # Unknown license
@@ -0,0 +1,130 @@
+"""
+Tests for the NVIDIA Riva audio resampling utility.
+
+The resampler turns arbitrary inbound audio (mp3/wav/m4a/...) into the wire
+format Riva's gRPC ASR expects: 16 kHz mono LINEAR_PCM (int16 LE).
+"""
+
+import io
+import os
+import sys
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+import soundfile as sf
+
+sys.path.insert(0, os.path.abspath("../../../../.."))
+
+from litellm.llms.nvidia_riva.audio_transcription.audio_utils import (
+    resample_to_riva_pcm,
+)
+from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException
+
+
+def _wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes:
+    buf = io.BytesIO()  # in-memory target, so no temp file is needed
+    sf.write(buf, samples, sample_rate, format="WAV", subtype="PCM_16")
+    return buf.getvalue()
+
+
+def test_resample_24khz_stereo_to_16khz_mono_int16():
+    """A 1 s 24 kHz stereo WAV must come back as 1 s of 16 kHz mono int16."""
+    source_rate = 24000
+    seconds = 1.0
+    frame_count = int(source_rate * seconds)
+    timeline = np.linspace(0, seconds, frame_count, endpoint=False)
+    # Distinct tones per channel (440 Hz left, 660 Hz right) at half amplitude.
+    channels = [0.5 * np.sin(2 * np.pi * hz * timeline) for hz in (440.0, 660.0)]
+    stereo = np.stack(channels, axis=1).astype(np.float32)
+
+    wav_in = _wav_bytes(stereo, source_rate)
+
+    result = resample_to_riva_pcm(wav_in)
+
+    assert result.sample_rate_hz == 16000
+    assert result.num_channels == 1
+    # Two bytes per int16 sample.
+    assert len(result.pcm_bytes) == int(round(seconds * 16000)) * 2
+    assert result.duration_seconds == pytest.approx(seconds, abs=0.005)
+
+
+def test_resample_16khz_mono_passes_through_int16_bytes_match_length():
+    """Audio already at 16 kHz mono keeps its sample count (1 s in, 1 s out)."""
+    rate = frames = 16000
+    phase = np.linspace(0, 2 * np.pi * 200, frames)
+    tone = (0.1 * np.sin(phase)).astype(np.float32)
+
+    result = resample_to_riva_pcm(_wav_bytes(tone, rate))
+
+    assert result.sample_rate_hz == 16000
+    assert len(result.pcm_bytes) == frames * 2
+    assert result.duration_seconds == pytest.approx(1.0, abs=0.001)
+
+
+def test_resample_preserves_int16_clip_range() -> None:
+    sample_rate = 16000
+    samples = np.array([2.0, -2.0, 0.0, 1.0], dtype=np.float32)  # beyond full scale
+    wav_in = _wav_bytes(samples, sample_rate)
+
+    resampled = resample_to_riva_pcm(wav_in)
+
+    decoded = np.frombuffer(resampled.pcm_bytes, dtype="<i2")  # little-endian int16
+    # Anything outside [-1, 1] should clip to int16 boundary.
+    assert decoded.max() <= 32767  # NOTE(review): always true for int16 data
+    assert decoded.min() >= -32767
+
+
+def test_unknown_format_raises_clear_error():
+    """Undecodable input must raise NvidiaRivaException mentioning Riva STT."""
+    garbage = b"\x00\x01\x02\x03"  # four arbitrary bytes, not decodable audio
+    with pytest.raises(NvidiaRivaException) as caught:
+        resample_to_riva_pcm(garbage)
+    assert "Riva STT" in caught.value.message
+
+
+def test_audioread_fallback_writes_to_tempfile_path(monkeypatch):
+    """
+    The audioread fallback handles compressed formats (mp3, m4a, ...). Most
+    audioread backends call into a subprocess (FFmpeg, GStreamer) and
+    require a real filesystem path — passing a BytesIO blows up with a
+    TypeError in subprocess.Popen. This test would have caught that bug:
+    we assert ``audio_open`` is called with a string path that points at a
+    file containing exactly the input bytes.
+    """
+    payload = b"\xff\xfbfake-mp3-bytes-not-actually-decodable"
+    seen_paths = []  # paths handed to fake_audio_open, in call order
+
+    class FakeAudioSource:
+        samplerate = 22050  # deliberately not the 16 kHz the test expects back
+        channels = 1
+
+        def __iter__(self):
+            yield np.array([0, 0, 0, 0], dtype=np.int16).tobytes()  # one silent buffer
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *args):
+            return False  # never suppress exceptions
+
+    def fake_audio_open(path):
+        assert isinstance(path, str), "audioread requires a filesystem path"
+        seen_paths.append(path)
+        with open(path, "rb") as fh:
+            assert fh.read() == payload
+        return FakeAudioSource()
+
+    fake_audioread = SimpleNamespace(audio_open=fake_audio_open)
+    monkeypatch.setitem(sys.modules, "audioread", fake_audioread)
+
+    fake_sf = MagicMock()
+    fake_sf.read.side_effect = RuntimeError("libsndfile cannot decode mp3")
+    monkeypatch.setitem(sys.modules, "soundfile", fake_sf)  # sf.read now always fails
+
+    resampled = resample_to_riva_pcm(payload)
+    assert resampled.sample_rate_hz == 16000
+    assert seen_paths and seen_paths[0].endswith(".audio")
+    # Tempfile must be cleaned up after decode.
+    assert not os.path.exists(seen_paths[0])
@@ -0,0 +1,419 @@
+"""
+End-to-end-ish tests for NvidiaRivaAudioTranscription.
+
+We mock ``riva.client`` so the test does not need the real gRPC SDK or a
+running Riva server. The mock also lets us assert how Auth metadata is
+constructed (NVCF vs self-hosted) and how the streaming generator output
+is aggregated.
+"""
+
+import asyncio
+import io
+import os
+import sys
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+import soundfile as sf
+
+sys.path.insert(0, os.path.abspath("../../../../.."))
+
+from litellm.llms.nvidia_riva.audio_transcription import handler as handler_mod
+from litellm.llms.nvidia_riva.audio_transcription.handler import (
+    NvidiaRivaAudioTranscription,
+)
+from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException
+from litellm.types.utils import TranscriptionResponse
+
+
+def _make_wav_bytes(seconds: float = 1.0, sample_rate: int = 16000) -> bytes:
+    """Return a quiet sine tone encoded as a 16-bit PCM mono WAV, in memory."""
+    frame_count = int(sample_rate * seconds)
+    phase = np.linspace(0, 2 * np.pi * 220 * seconds, frame_count)
+    tone = (0.05 * np.sin(phase)).astype(np.float32)
+    sink = io.BytesIO()
+    sf.write(sink, tone, sample_rate, format="WAV", subtype="PCM_16")
+    return sink.getvalue()
+
+
+def _fake_word(word: str, start_ms: int, end_ms: int) -> SimpleNamespace:
+    return SimpleNamespace(word=word, start_time=start_ms, end_time=end_ms)  # ms
+
+
+def _fake_alternative(transcript: str, words=None) -> SimpleNamespace:
+    return SimpleNamespace(transcript=transcript, words=words or [])  # None -> []
+
+
+def _fake_result(is_final: bool, alternatives) -> SimpleNamespace:
+    return SimpleNamespace(is_final=is_final, alternatives=alternatives)
+
+
+def _fake_response(results) -> SimpleNamespace:
+    return SimpleNamespace(results=results)  # one item of the fake response stream
+
+
+@pytest.fixture
+def mock_riva(monkeypatch):
+    """
+    Stand-ins for the bits of ``riva.client`` the handler touches:
+    - ``Auth`` (constructor)
+    - ``ASRService`` with ``streaming_response_generator``
+    - ``RecognitionConfig``, ``StreamingRecognitionConfig``, ``EndpointingConfig``
+    - ``AudioEncoding`` namespace with ``LINEAR_PCM``
+    """
+    auth_calls = {}  # filled in by FakeAuth.__init__; read back by tests
+
+    class FakeAuth:
+        def __init__(self, *args, **kwargs):
+            # Support both keyword and positional Auth constructors.
+            if kwargs:
+                auth_calls["uri"] = kwargs.get("uri")
+                auth_calls["use_ssl"] = kwargs.get("use_ssl")
+                auth_calls["metadata_args"] = kwargs.get("metadata_args")
+            else:
+                # positional: (None, use_ssl, uri, metadata)
+                auth_calls["use_ssl"] = args[1] if len(args) > 1 else None
+                auth_calls["uri"] = args[2] if len(args) > 2 else None
+                auth_calls["metadata_args"] = args[3] if len(args) > 3 else None
+
+    class FakeRecognitionConfig:
+        def __init__(self, **kwargs):
+            self._kwargs = kwargs
+            self.endpointing_config = SimpleNamespace(CopyFrom=lambda _: None)
+
+    class FakeStreamingRecognitionConfig:
+        def __init__(self, config, interim_results):
+            self.config = config
+            self.interim_results = interim_results
+
+    class FakeEndpointingConfig:
+        def __init__(self, **kwargs):
+            self._kwargs = kwargs
+
+    class FakeAudioEncoding:
+        LINEAR_PCM = "LINEAR_PCM"
+
+    streaming_responses_holder = {"value": []}  # tests set the stream to replay
+
+    class FakeASRService:
+        def __init__(self, auth):
+            self.auth = auth
+
+        def streaming_response_generator(self, audio_chunks, streaming_config):
+            # Drain audio_chunks generator so we exercise the chunking path.
+            list(audio_chunks)
+            yield from streaming_responses_holder["value"]
+
+    fake_riva_client = SimpleNamespace(
+        Auth=FakeAuth,
+        ASRService=FakeASRService,
+        RecognitionConfig=FakeRecognitionConfig,
+        StreamingRecognitionConfig=FakeStreamingRecognitionConfig,
+        EndpointingConfig=FakeEndpointingConfig,
+        AudioEncoding=FakeAudioEncoding,
+    )
+
+    def fake_import_riva():
+        return fake_riva_client, fake_riva_client  # same object fills both slots
+
+    monkeypatch.setattr(handler_mod, "_import_riva", fake_import_riva)
+
+    return SimpleNamespace(
+        auth_calls=auth_calls,
+        responses=streaming_responses_holder,
+        client=fake_riva_client,
+    )
+
+
+@pytest.fixture
+def logging_obj() -> MagicMock:
+    return MagicMock()  # records calls; one test inspects ``pre_call``
+
+
+def test_sync_path_aggregates_only_final_results(mock_riva, logging_obj) -> None:
+    mock_riva.responses["value"] = [
+        # Empty heartbeat chunk: ignore.
+        _fake_response(results=[]),
+        # Interim chunk (not final): ignore.
+        _fake_response(
+            results=[
+                _fake_result(
+                    is_final=False, alternatives=[_fake_alternative("partial...")]
+                )
+            ]
+        ),
+        # Two final chunks aggregated.
+        _fake_response(
+            results=[
+                _fake_result(
+                    is_final=True,
+                    alternatives=[
+                        _fake_alternative(
+                            "Hello,",
+                            words=[_fake_word("Hello,", 0, 320)],
+                        )
+                    ],
+                )
+            ]
+        ),
+        _fake_response(
+            results=[
+                _fake_result(
+                    is_final=True,
+                    alternatives=[
+                        _fake_alternative(
+                            " world.",
+                            words=[_fake_word("world.", 480, 870)],
+                        )
+                    ],
+                )
+            ]
+        ),
+    ]
+
+    impl = NvidiaRivaAudioTranscription()
+    response: TranscriptionResponse = impl.audio_transcriptions(
+        model="nvidia/parakeet-ctc-1_1b-asr",
+        audio_file=_make_wav_bytes(),
+        optional_params={
+            "language_code": "en-US",
+            "enable_word_time_offsets": True,
+            "response_format": "verbose_json",
+            "timestamp_granularities": ["word"],
+        },
+        litellm_params={},
+        model_response=TranscriptionResponse(),
+        timeout=60,
+        logging_obj=logging_obj,
+        api_key="nvapi-xxx",
+        api_base="grpc.nvcf.nvidia.com:443",
+    )
+
+    assert response.text == "Hello, world."  # "Hello," + " world.", no separator
+    # duration is propagated from the resampler.
+    assert response._hidden_params["audio_transcription_duration"] == pytest.approx(
+        1.0, abs=0.05
+    )
+    # word timestamps converted from ms to seconds.
+    words = response["words"]
+    assert words[0]["start"] == pytest.approx(0.0)  # 0 ms
+    assert words[1]["end"] == pytest.approx(0.87)  # 870 ms
+    assert (
+        logging_obj.pre_call.call_args.kwargs["additional_args"]["atranscription"]
+        is False  # this test drives the synchronous path
+    )
+
+
+def test_auth_nvcf_defaults_use_ssl_and_attaches_function_id(mock_riva, logging_obj):
+    mock_riva.responses["value"] = [
+        _fake_response(
+            results=[
+                _fake_result(
+                    is_final=True,
+                    alternatives=[_fake_alternative("ok")],
+                )
+            ]
+        )
+    ]
+    impl = NvidiaRivaAudioTranscription()
+    impl.audio_transcriptions(
+        model="m",
+        audio_file=_make_wav_bytes(),
+        optional_params={
+            "nvcf_function_id": "abc-123",  # NVCF function id under test
+            "language_code": "en-US",
+        },
+        litellm_params={},
+        model_response=TranscriptionResponse(),
+        timeout=60,
+        logging_obj=logging_obj,
+        api_key="nvapi-xxx",
+        api_base="grpc.nvcf.nvidia.com:443",
+    )
+
+    assert mock_riva.auth_calls["uri"] == "grpc.nvcf.nvidia.com:443"
+    assert mock_riva.auth_calls["use_ssl"] is True  # no explicit use_ssl was passed
+    metadata = dict(mock_riva.auth_calls["metadata_args"])  # key/value pairs -> dict
+    assert metadata["function-id"] == "abc-123"
+    assert metadata["authorization"] == "Bearer nvapi-xxx"
+
+
+def test_auth_self_hosted_defaults_no_ssl_and_no_function_id(mock_riva, logging_obj):
+    mock_riva.responses["value"] = [
+        _fake_response(
+            results=[
+                _fake_result(is_final=True, alternatives=[_fake_alternative("ok")])
+            ]
+        )
+    ]
+    impl = NvidiaRivaAudioTranscription()
+    impl.audio_transcriptions(
+        model="m",
+        audio_file=_make_wav_bytes(),
+        optional_params={"language_code": "en-US"},  # no nvcf_function_id, no use_ssl
+        litellm_params={},
+        model_response=TranscriptionResponse(),
+        timeout=60,
+        logging_obj=logging_obj,
+        api_key=None,  # no credentials supplied
+        api_base="localhost:50051",
+    )
+
+    assert mock_riva.auth_calls["uri"] == "localhost:50051"
+    assert mock_riva.auth_calls["use_ssl"] is False
+    metadata = dict(mock_riva.auth_calls["metadata_args"])  # key/value pairs -> dict
+    # No function-id, no authorization metadata.
+    assert "function-id" not in metadata
+    assert "authorization" not in metadata
+
+
+def test_explicit_use_ssl_override_wins(mock_riva, logging_obj):
+    """
+    An explicit ``use_ssl=True`` in ``optional_params`` must reach the auth
+    call even when no NVCF function id is supplied.
+
+    This covers a real deployment topology: a self-hosted Riva server
+    sitting behind an ingress that terminates TLS, where SSL is needed
+    without any NVCF credentials.
+    """
+    ok_result = _fake_result(is_final=True, alternatives=[_fake_alternative("ok")])
+    mock_riva.responses["value"] = [_fake_response(results=[ok_result])]
+    # No nvcf_function_id and no api_key: only the explicit flag asks for SSL.
+    request_params = {"use_ssl": True, "language_code": "en-US"}
+    handler = NvidiaRivaAudioTranscription()
+    handler.audio_transcriptions(
+        model="m",
+        audio_file=_make_wav_bytes(),
+        optional_params=request_params,
+        litellm_params={},
+        model_response=TranscriptionResponse(),
+        timeout=60,
+        logging_obj=logging_obj,
+        api_key=None,
+        api_base="riva.internal.company.com:443",
+    )
+
+    assert mock_riva.auth_calls["use_ssl"] is True
+
+
+def test_missing_api_base_raises_clear_error(mock_riva, logging_obj):
+    impl = NvidiaRivaAudioTranscription()
+    with pytest.raises(NvidiaRivaException) as excinfo:
+        impl.audio_transcriptions(
+            model="m",
+            audio_file=_make_wav_bytes(),
+            optional_params={},
+            litellm_params={},
+            model_response=TranscriptionResponse(),
+            timeout=60,
+            logging_obj=logging_obj,
+            api_key=None,
+            api_base=None,  # no endpoint configured: the condition under test
+        )
+    assert "api_base" in excinfo.value.message  # error must name the missing setting
+
+
+def test_async_path_uses_to_thread(mock_riva, logging_obj):
+    """
+    The async entry point must return the transcript and log
+    ``atranscription=True`` in the pre-call ``additional_args``.
+
+    NOTE(review): despite the name, nothing here asserts that
+    ``asyncio.to_thread`` is used; only the observable result is checked.
+    """
+    final = _fake_result(is_final=True, alternatives=[_fake_alternative("async ok")])
+    mock_riva.responses["value"] = [_fake_response(results=[final])]
+    handler = NvidiaRivaAudioTranscription()
+    response = asyncio.run(
+        handler.async_audio_transcriptions(
+            model="m",
+            audio_file=_make_wav_bytes(),
+            optional_params={"language_code": "en-US"},
+            litellm_params={},
+            model_response=TranscriptionResponse(),
+            timeout=60,
+            logging_obj=logging_obj,
+            api_key=None,
+            api_base="localhost:50051",
+        )
+    )
+
+    assert response.text == "async ok"
+    # The pre-call log entry must flag the request as async.
+    pre_call_kwargs = logging_obj.pre_call.call_args.kwargs
+    assert pre_call_kwargs["additional_args"]["atranscription"] is True
+
+
+def test_timeout_is_forwarded_to_streaming_generator_when_supported(
+    mock_riva, logging_obj
+):
+    """
+    The call-level ``timeout`` must be passed through to
+    ``streaming_response_generator`` whenever that method accepts a
+    ``timeout`` keyword argument.
+
+    Without such a deadline a stalled Riva server could block the gRPC
+    stream, and therefore the caller, forever.
+    """
+    seen = {}
+
+    def streaming_with_timeout(self, audio_chunks, streaming_config, timeout=None):
+        # Record what the handler passed, then consume the audio chunks.
+        seen["timeout"] = timeout
+        list(audio_chunks)
+        final = _fake_result(is_final=True, alternatives=[_fake_alternative("ok")])
+        yield _fake_response(results=[final])
+
+    # Install the timeout-aware stand-in on the fixture's ASRService.
+    mock_riva.client.ASRService.streaming_response_generator = streaming_with_timeout
+
+    handler = NvidiaRivaAudioTranscription()
+    handler.audio_transcriptions(
+        model="m",
+        audio_file=_make_wav_bytes(),
+        optional_params={"language_code": "en-US"},
+        litellm_params={},
+        model_response=TranscriptionResponse(),
+        timeout=12.5,
+        logging_obj=logging_obj,
+        api_key=None,
+        api_base="localhost:50051",
+    )
+
+    assert seen["timeout"] == pytest.approx(12.5)
+
+
+def test_grpc_error_is_wrapped_as_nvidia_riva_exception(mock_riva, logging_obj):
+    """A gRPC UNAUTHENTICATED error must surface as a 401 NvidiaRivaException."""
+    # Duck-typed stand-in mimicking the code()/details() accessors of a gRPC error.
+    class FakeGrpcError(Exception):
+        def code(self):
+            return SimpleNamespace(name="UNAUTHENTICATED")
+
+        def details(self):
+            return "bad token"
+
+    def failing_stream(self, audio_chunks, streaming_config):
+        list(audio_chunks)
+        raise FakeGrpcError("rpc fail")
+
+    mock_riva.client.ASRService.streaming_response_generator = failing_stream
+
+    handler = NvidiaRivaAudioTranscription()
+    with pytest.raises(NvidiaRivaException) as excinfo:
+        handler.audio_transcriptions(
+            model="m",
+            audio_file=_make_wav_bytes(),
+            optional_params={"language_code": "en-US"},
+            litellm_params={},
+            model_response=TranscriptionResponse(),
+            timeout=60,
+            logging_obj=logging_obj,
+            api_key="nvapi-xxx",
+            api_base="grpc.nvcf.nvidia.com:443",
+        )
+
+    assert excinfo.value.status_code == 401
+    assert "UNAUTHENTICATED" in excinfo.value.message
@@ -0,0 +1,275 @@
+"""
+Unit tests for NvidiaRivaAudioTranscriptionConfig.
+
+These tests do not require ``nvidia-riva-client`` or any audio libs to be
+installed; the transformation layer is intentionally pure-Python on dicts.
+"""
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.abspath("../../../../.."))
+
+from litellm.llms.base_llm.audio_transcription.transformation import (
+    AudioTranscriptionRequestData,
+)
+from litellm.llms.nvidia_riva.audio_transcription.transformation import (
+    NvidiaRivaAudioTranscriptionConfig,
+)
+from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException
+
+
+@pytest.fixture
+def cfg():
+    return NvidiaRivaAudioTranscriptionConfig()  # new instance for each test
+
+
+def test_supported_openai_params(cfg):
+    # The provider must advertise these three OpenAI transcription parameters.
+    supported = cfg.get_supported_openai_params(model="nvidia/parakeet-ctc-1_1b-asr")
+    for name in ("language", "response_format", "timestamp_granularities"):
+        assert name in supported
+
+
+def test_map_language_normalizes_bare_codes(cfg):
+    """A bare ``en`` language code is expanded to the regional tag ``en-US``."""
+    request = {"language": "en"}
+    mapped = cfg.map_openai_params(
+        non_default_params=request, optional_params={}, model="m", drop_params=False
+    )
+
+    assert mapped["language_code"] == "en-US"
+
+
+def test_map_language_passes_through_bcp47(cfg):
+    """A full BCP-47 tag such as ``de-DE`` is forwarded unchanged."""
+    request = {"language": "de-DE"}
+    mapped = cfg.map_openai_params(
+        non_default_params=request, optional_params={}, model="m", drop_params=False
+    )
+
+    assert mapped["language_code"] == "de-DE"
+
+
+def test_map_language_es_defaults_to_castilian_spain(cfg):
+    """
+    A bare ``es`` must map to ``es-ES`` (Castilian / Spain), not ``es-US``.
+
+    Sending every Spanish caller to a US-tuned Riva model would quietly
+    reduce transcription accuracy.
+    """
+    request = {"language": "es"}
+    mapped = cfg.map_openai_params(
+        non_default_params=request, optional_params={}, model="m", drop_params=False
+    )
+
+    assert mapped["language_code"] == "es-ES"
+
+
+def test_map_timestamp_granularities_word_enables_word_offsets(cfg):
+    """Requesting ``["word"]`` granularity enables word time offsets and is kept."""
+    request = {"timestamp_granularities": ["word"]}
+    mapped = cfg.map_openai_params(
+        non_default_params=request, optional_params={}, model="m", drop_params=False
+    )
+
+    assert mapped["enable_word_time_offsets"] is True
+    assert mapped["timestamp_granularities"] == ["word"]
+
+
+def test_map_timestamp_granularities_segment_only_does_not_enable_word_offsets(cfg):
+    """Segment-only granularity must not set ``enable_word_time_offsets``."""
+    request = {"timestamp_granularities": ["segment"]}
+    mapped = cfg.map_openai_params(
+        non_default_params=request, optional_params={}, model="m", drop_params=False
+    )
+
+    assert "enable_word_time_offsets" not in mapped
+
+
+def test_transform_request_builds_recognition_config(cfg):
+    """Request data holds a 16 kHz mono LINEAR_PCM config and no audio or auth."""
+    riva_model = "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer"
+    result = cfg.transform_audio_transcription_request(
+        model="nvidia/parakeet-ctc-1_1b-asr",
+        audio_file=b"fake-audio",
+        optional_params={
+            "language_code": "en-US",
+            "enable_word_time_offsets": True,
+            "nvcf_function_id": "abc-123",
+            "use_ssl": True,
+            "riva_model_name": riva_model,
+        },
+        litellm_params={
+            "api_base": "grpc.nvcf.nvidia.com:443",
+            "api_key": "nvapi-xxx",
+        },
+    )
+
+    assert isinstance(result, AudioTranscriptionRequestData)
+    payload = result.data
+    recognition = payload["recognition_config"]
+    assert recognition["language_code"] == "en-US"
+    assert recognition["sample_rate_hertz"] == 16000
+    assert recognition["audio_channel_count"] == 1
+    assert recognition["encoding"] == "LINEAR_PCM"
+    assert recognition["enable_word_time_offsets"] is True
+    assert recognition["model"] == riva_model
+    assert "audio_file" not in payload
+    assert "auth" not in payload
+
+
+def test_transform_request_default_riva_model_is_empty_for_auto_select(cfg):
+    """
+    With no ``riva_model_name`` given, the recognition config's ``model``
+    is the empty string, which lets Riva pick the deployed model itself
+    (internal NVIDIA deployment names vary across versions/regions).
+    """
+    request = cfg.transform_audio_transcription_request(
+        model="nvidia/parakeet-ctc-1_1b-asr",
+        audio_file=b"fake-audio",
+        optional_params={"language_code": "en-US"},
+        litellm_params={"api_base": "grpc.nvcf.nvidia.com:443"},
+    )
+    assert request.data["recognition_config"]["model"] == ""
+
+
+def test_chunking_strategy_server_vad_maps_to_endpointing_config(cfg):
+    """OpenAI ``server_vad`` chunking fields are translated to Riva endpointing."""
+    server_vad = {
+        "type": "server_vad",
+        "threshold": 0.5,
+        "silence_duration_ms": 700,
+        "prefix_padding_ms": 250,
+    }
+    result = cfg.transform_audio_transcription_request(
+        model="m",
+        audio_file=b"x",
+        optional_params={"chunking_strategy": server_vad},
+        litellm_params={"api_base": "localhost:50051"},
+    )
+    endpointing = result.data["recognition_config"].get("endpointing_config")
+    assert endpointing is not None
+    assert endpointing["start_threshold"] == 0.5
+    assert endpointing["stop_threshold"] == 0.5
+    assert endpointing["stop_history"] == 700
+    assert endpointing["stop_history_eou"] == 250
+
+
+def test_chunking_strategy_auto_leaves_endpointing_config_unset(cfg):
+    recognition = cfg.transform_audio_transcription_request(
+        model="m",
+        audio_file=b"x",
+        optional_params={"chunking_strategy": "auto"},
+        litellm_params={"api_base": "localhost:50051"},
+    ).data["recognition_config"]
+    assert "endpointing_config" not in recognition  # "auto" adds no endpointing
+
+
+def test_explicit_endpointing_config_pass_through(cfg):
+    """A caller-supplied ``endpointing_config`` reaches the payload unchanged."""
+    explicit = {"stop_history": 1200, "start_threshold": 0.3}
+    result = cfg.transform_audio_transcription_request(
+        model="m",
+        audio_file=b"x",
+        optional_params={"endpointing_config": explicit},
+        litellm_params={"api_base": "localhost:50051"},
+    )
+    forwarded = result.data["recognition_config"]["endpointing_config"]
+    assert forwarded == {"stop_history": 1200, "start_threshold": 0.3}
+
+
+def test_build_transcription_response_text_format():
+    # NOTE(review): despite the name, this exercises response_format="json".
+    response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response(
+        final_results=[
+            {"transcript": "Hello,", "words": []},
+            {"transcript": " this is parakeet.", "words": []},
+        ],
+        response_format="json",
+        duration_seconds=2.4,
+        timestamp_granularities=None,
+    )
+    assert response.text == "Hello, this is parakeet."
+    assert response["task"] == "transcribe"
+    # Only verbose_json responses carry a duration.
+    assert "duration" not in response
+
+
+def test_build_transcription_response_skips_empty_chunks():
+    """Final results with an empty transcript contribute nothing to the text."""
+    response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response(
+        final_results=[
+            {"transcript": "", "words": []},
+            {"transcript": "actual content", "words": []},
+            {"transcript": "", "words": []},
+        ],
+        response_format="json",
+        duration_seconds=1.0,
+        timestamp_granularities=None,
+    )
+    assert response.text == "actual content"
+
+
+def test_build_transcription_response_verbose_json_with_words():
+    """
+    ``verbose_json`` with ``["word"]`` granularity returns the joined text,
+    the duration, and per-word timings converted from Riva's milliseconds
+    to seconds.
+    """
+    hello = {"word": "Hello,", "start_time_ms": 0, "end_time_ms": 320}
+    world = {"word": "world.", "start_time_ms": 480, "end_time_ms": 870}
+    final_results = [
+        {"transcript": "Hello,", "words": [hello]},
+        {"transcript": " world.", "words": [world]},
+    ]
+    response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response(
+        final_results=final_results,
+        response_format="verbose_json",
+        duration_seconds=2.475,
+        timestamp_granularities=["word"],
+    )
+
+    assert response.text == "Hello, world."
+    assert response["duration"] == 2.475
+    words = response["words"]
+    assert words[0]["word"] == "Hello,"
+    # Riva reports offsets in milliseconds; the OpenAI shape uses seconds.
+    expected_spans = [
+        (0.0, 0.32),
+        (0.48, 0.87),
+    ]
+    for index, (start, end) in enumerate(expected_spans):
+        assert words[index]["start"] == pytest.approx(start)
+        assert words[index]["end"] == pytest.approx(end)
+
+
+def test_build_transcription_response_verbose_json_without_word_granularity_omits_words():
+    """
+    Word timings present in the Riva results are left out of a
+    ``verbose_json`` response when only ``["segment"]`` was requested.
+    """
+    timed_word = {"word": "Hi.", "start_time_ms": 0, "end_time_ms": 200}
+    final_results = [{"transcript": "Hi.", "words": [timed_word]}]
+
+    response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response(
+        final_results=final_results,
+        response_format="verbose_json",
+        duration_seconds=0.2,
+        timestamp_granularities=["segment"],
+    )
+
+    assert "words" not in response
+
+
+def test_transform_response_not_used_raises_clear_error(cfg):
+    with pytest.raises(NotImplementedError):  # this config does not implement the hook
+        cfg.transform_audio_transcription_response(raw_response=None)  # type: ignore[arg-type]
+
+
+def test_get_error_class_returns_nvidia_riva_exception(cfg):
+    # The factory returns (not raises) the provider exception, keeping the status.
+    error = cfg.get_error_class(error_message="bad", status_code=401, headers={})
+    assert isinstance(error, NvidiaRivaException) and error.status_code == 401
@@ -9,7 +9,7 @@ resolution-markers = [
 ]

 [options]
-exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values.
+exclude-newer = "2026-05-02T11:18:44.200141Z"
 exclude-newer-span = "P3D"

 [manifest]
@@ -339,6 +339,59 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" },
 ]

+[[package]]
+name = "audioop-lts"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/38/53/946db57842a50b2da2e0c1e34bd37f36f5aadba1a929a3971c5d7841dbca/audioop_lts-0.2.2.tar.gz", hash = "sha256:64d0c62d88e67b98a1a5e71987b7aa7b5bcffc7dcee65b635823dbdd0a8dbbd0", size = 30686, upload-time = "2025-08-05T16:43:17.409Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/de/d4/94d277ca941de5a507b07f0b592f199c22454eeaec8f008a286b3fbbacd6/audioop_lts-0.2.2-cp313-abi3-macosx_10_13_universal2.whl", hash = "sha256:fd3d4602dc64914d462924a08c1a9816435a2155d74f325853c1f1ac3b2d9800", size = 46523, upload-time = "2025-08-05T16:42:20.836Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/5a/656d1c2da4b555920ce4177167bfeb8623d98765594af59702c8873f60ec/audioop_lts-0.2.2-cp313-abi3-macosx_10_13_x86_64.whl", hash = "sha256:550c114a8df0aafe9a05442a1162dfc8fec37e9af1d625ae6060fed6e756f303", size = 27455, upload-time = "2025-08-05T16:42:22.283Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/83/ea581e364ce7b0d41456fb79d6ee0ad482beda61faf0cab20cbd4c63a541/audioop_lts-0.2.2-cp313-abi3-macosx_11_0_arm64.whl", hash = "sha256:9a13dc409f2564de15dd68be65b462ba0dde01b19663720c68c1140c782d1d75", size = 26997, upload-time = "2025-08-05T16:42:23.849Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/3b/e8964210b5e216e5041593b7d33e97ee65967f17c282e8510d19c666dab4/audioop_lts-0.2.2-cp313-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:51c916108c56aa6e426ce611946f901badac950ee2ddaf302b7ed35d9958970d", size = 85844, upload-time = "2025-08-05T16:42:25.208Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/2e/0a1c52faf10d51def20531a59ce4c706cb7952323b11709e10de324d6493/audioop_lts-0.2.2-cp313-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:47eba38322370347b1c47024defbd36374a211e8dd5b0dcbce7b34fdb6f8847b", size = 85056, upload-time = "2025-08-05T16:42:26.559Z" },
+    { url = "https://files.pythonhosted.org/packages/75/e8/cd95eef479656cb75ab05dfece8c1f8c395d17a7c651d88f8e6e291a63ab/audioop_lts-0.2.2-cp313-abi3-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba7c3a7e5f23e215cb271516197030c32aef2e754252c4c70a50aaff7031a2c8", size = 93892, upload-time = "2025-08-05T16:42:27.902Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/1e/a0c42570b74f83efa5cca34905b3eef03f7ab09fe5637015df538a7f3345/audioop_lts-0.2.2-cp313-abi3-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:def246fe9e180626731b26e89816e79aae2276f825420a07b4a647abaa84becc", size = 96660, upload-time = "2025-08-05T16:42:28.9Z" },
+    { url = "https://files.pythonhosted.org/packages/50/d5/8a0ae607ca07dbb34027bac8db805498ee7bfecc05fd2c148cc1ed7646e7/audioop_lts-0.2.2-cp313-abi3-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e160bf9df356d841bb6c180eeeea1834085464626dc1b68fa4e1d59070affdc3", size = 79143, upload-time = "2025-08-05T16:42:29.929Z" },
+    { url = "https://files.pythonhosted.org/packages/12/17/0d28c46179e7910bfb0bb62760ccb33edb5de973052cb2230b662c14ca2e/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4b4cd51a57b698b2d06cb9993b7ac8dfe89a3b2878e96bc7948e9f19ff51dba6", size = 84313, upload-time = "2025-08-05T16:42:30.949Z" },
+    { url = "https://files.pythonhosted.org/packages/84/ba/bd5d3806641564f2024e97ca98ea8f8811d4e01d9b9f9831474bc9e14f9e/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_ppc64le.whl", hash = "sha256:4a53aa7c16a60a6857e6b0b165261436396ef7293f8b5c9c828a3a203147ed4a", size = 93044, upload-time = "2025-08-05T16:42:31.959Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/5e/435ce8d5642f1f7679540d1e73c1c42d933331c0976eb397d1717d7f01a3/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_riscv64.whl", hash = "sha256:3fc38008969796f0f689f1453722a0f463da1b8a6fbee11987830bfbb664f623", size = 78766, upload-time = "2025-08-05T16:42:33.302Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/3b/b909e76b606cbfd53875693ec8c156e93e15a1366a012f0b7e4fb52d3c34/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_s390x.whl", hash = "sha256:15ab25dd3e620790f40e9ead897f91e79c0d3ce65fe193c8ed6c26cffdd24be7", size = 87640, upload-time = "2025-08-05T16:42:34.854Z" },
+    { url = "https://files.pythonhosted.org/packages/30/e7/8f1603b4572d79b775f2140d7952f200f5e6c62904585d08a01f0a70393a/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:03f061a1915538fd96272bac9551841859dbb2e3bf73ebe4a23ef043766f5449", size = 86052, upload-time = "2025-08-05T16:42:35.839Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/96/c37846df657ccdda62ba1ae2b6534fa90e2e1b1742ca8dcf8ebd38c53801/audioop_lts-0.2.2-cp313-abi3-win32.whl", hash = "sha256:3bcddaaf6cc5935a300a8387c99f7a7fbbe212a11568ec6cf6e4bc458c048636", size = 26185, upload-time = "2025-08-05T16:42:37.04Z" },
+    { url = "https://files.pythonhosted.org/packages/34/a5/9d78fdb5b844a83da8a71226c7bdae7cc638861085fff7a1d707cb4823fa/audioop_lts-0.2.2-cp313-abi3-win_amd64.whl", hash = "sha256:a2c2a947fae7d1062ef08c4e369e0ba2086049a5e598fda41122535557012e9e", size = 30503, upload-time = "2025-08-05T16:42:38.427Z" },
+    { url = "https://files.pythonhosted.org/packages/34/25/20d8fde083123e90c61b51afb547bb0ea7e77bab50d98c0ab243d02a0e43/audioop_lts-0.2.2-cp313-abi3-win_arm64.whl", hash = "sha256:5f93a5db13927a37d2d09637ccca4b2b6b48c19cd9eda7b17a2e9f77edee6a6f", size = 24173, upload-time = "2025-08-05T16:42:39.704Z" },
+    { url = "https://files.pythonhosted.org/packages/58/a7/0a764f77b5c4ac58dc13c01a580f5d32ae8c74c92020b961556a43e26d02/audioop_lts-0.2.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:73f80bf4cd5d2ca7814da30a120de1f9408ee0619cc75da87d0641273d202a09", size = 47096, upload-time = "2025-08-05T16:42:40.684Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/ed/ebebedde1a18848b085ad0fa54b66ceb95f1f94a3fc04f1cd1b5ccb0ed42/audioop_lts-0.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:106753a83a25ee4d6f473f2be6b0966fc1c9af7e0017192f5531a3e7463dce58", size = 27748, upload-time = "2025-08-05T16:42:41.992Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/6e/11ca8c21af79f15dbb1c7f8017952ee8c810c438ce4e2b25638dfef2b02c/audioop_lts-0.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fbdd522624141e40948ab3e8cdae6e04c748d78710e9f0f8d4dae2750831de19", size = 27329, upload-time = "2025-08-05T16:42:42.987Z" },
+    { url = "https://files.pythonhosted.org/packages/84/52/0022f93d56d85eec5da6b9da6a958a1ef09e80c39f2cc0a590c6af81dcbb/audioop_lts-0.2.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:143fad0311e8209ece30a8dbddab3b65ab419cbe8c0dde6e8828da25999be911", size = 92407, upload-time = "2025-08-05T16:42:44.336Z" },
+    { url = "https://files.pythonhosted.org/packages/87/1d/48a889855e67be8718adbc7a01f3c01d5743c325453a5e81cf3717664aad/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfbbc74ec68a0fd08cfec1f4b5e8cca3d3cd7de5501b01c4b5d209995033cde9", size = 91811, upload-time = "2025-08-05T16:42:45.325Z" },
+    { url = "https://files.pythonhosted.org/packages/98/a6/94b7213190e8077547ffae75e13ed05edc488653c85aa5c41472c297d295/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cfcac6aa6f42397471e4943e0feb2244549db5c5d01efcd02725b96af417f3fe", size = 100470, upload-time = "2025-08-05T16:42:46.468Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/e9/78450d7cb921ede0cfc33426d3a8023a3bda755883c95c868ee36db8d48d/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:752d76472d9804ac60f0078c79cdae8b956f293177acd2316cd1e15149aee132", size = 103878, upload-time = "2025-08-05T16:42:47.576Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/e2/cd5439aad4f3e34ae1ee852025dc6aa8f67a82b97641e390bf7bd9891d3e/audioop_lts-0.2.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:83c381767e2cc10e93e40281a04852facc4cd9334550e0f392f72d1c0a9c5753", size = 84867, upload-time = "2025-08-05T16:42:49.003Z" },
+    { url = "https://files.pythonhosted.org/packages/68/4b/9d853e9076c43ebba0d411e8d2aa19061083349ac695a7d082540bad64d0/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c0022283e9556e0f3643b7c3c03f05063ca72b3063291834cca43234f20c60bb", size = 90001, upload-time = "2025-08-05T16:42:50.038Z" },
+    { url = "https://files.pythonhosted.org/packages/58/26/4bae7f9d2f116ed5593989d0e521d679b0d583973d203384679323d8fa85/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:a2d4f1513d63c795e82948e1305f31a6d530626e5f9f2605408b300ae6095093", size = 99046, upload-time = "2025-08-05T16:42:51.111Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/67/a9f4fb3e250dda9e9046f8866e9fa7d52664f8985e445c6b4ad6dfb55641/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:c9c8e68d8b4a56fda8c025e538e639f8c5953f5073886b596c93ec9b620055e7", size = 84788, upload-time = "2025-08-05T16:42:52.198Z" },
+    { url = "https://files.pythonhosted.org/packages/70/f7/3de86562db0121956148bcb0fe5b506615e3bcf6e63c4357a612b910765a/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:96f19de485a2925314f5020e85911fb447ff5fbef56e8c7c6927851b95533a1c", size = 94472, upload-time = "2025-08-05T16:42:53.59Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/32/fd772bf9078ae1001207d2df1eef3da05bea611a87dd0e8217989b2848fa/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e541c3ef484852ef36545f66209444c48b28661e864ccadb29daddb6a4b8e5f5", size = 92279, upload-time = "2025-08-05T16:42:54.632Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/41/affea7181592ab0ab560044632571a38edaf9130b84928177823fbf3176a/audioop_lts-0.2.2-cp313-cp313t-win32.whl", hash = "sha256:d5e73fa573e273e4f2e5ff96f9043858a5e9311e94ffefd88a3186a910c70917", size = 26568, upload-time = "2025-08-05T16:42:55.627Z" },
+    { url = "https://files.pythonhosted.org/packages/28/2b/0372842877016641db8fc54d5c88596b542eec2f8f6c20a36fb6612bf9ee/audioop_lts-0.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9191d68659eda01e448188f60364c7763a7ca6653ed3f87ebb165822153a8547", size = 30942, upload-time = "2025-08-05T16:42:56.674Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/ca/baf2b9cc7e96c179bb4a54f30fcd83e6ecb340031bde68f486403f943768/audioop_lts-0.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:c174e322bb5783c099aaf87faeb240c8d210686b04bd61dfd05a8e5a83d88969", size = 24603, upload-time = "2025-08-05T16:42:57.571Z" },
+]
+
+[[package]]
+name = "audioread"
+version = "3.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "standard-aifc", marker = "python_full_version >= '3.13'" },
+    { name = "standard-sunau", marker = "python_full_version >= '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a1/4a/874ecf9b472f998130c2b5e145dcdb9f6131e84786111489103b66772143/audioread-3.1.0.tar.gz", hash = "sha256:1c4ab2f2972764c896a8ac61ac53e261c8d29f0c6ccd652f84e18f08a4cab190", size = 20082, upload-time = "2025-10-26T19:44:13.484Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/16/fbe8e1e185a45042f7cd3a282def5bb8d95bb69ab9e9ef6a5368aa17e426/audioread-3.1.0-py3-none-any.whl", hash = "sha256:b30d1df6c5d3de5dcef0fb0e256f6ea17bdcf5f979408df0297d8a408e2971b4", size = 23143, upload-time = "2025-10-26T19:44:12.016Z" },
+]
+
 [[package]]
 name = "aurelio-sdk"
 version = "0.0.19"
@@ -2176,6 +2229,59 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/67/58/317b0134129b556a93a3b0afe00ee675b5657f0155509e22fcb853bafe2d/grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3", size = 14424, upload-time = "2025-06-28T04:23:42.136Z" },
 ]

+[[package]]
+name = "grpcio-tools"
+version = "1.71.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "grpcio" },
+    { name = "protobuf" },
+    { name = "setuptools" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ad/9a/edfefb47f11ef6b0f39eea4d8f022c5bb05ac1d14fcc7058e84a51305b73/grpcio_tools-1.71.2.tar.gz", hash = "sha256:b5304d65c7569b21270b568e404a5a843cf027c66552a6a0978b23f137679c09", size = 5330655, upload-time = "2025-06-28T04:22:00.308Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dd/ad/e74a4d1cffff628c2ef1ec5b9944fb098207cc4af6eb8db4bc52e6d99236/grpcio_tools-1.71.2-cp310-cp310-linux_armv7l.whl", hash = "sha256:ab8a28c2e795520d6dc6ffd7efaef4565026dbf9b4f5270de2f3dd1ce61d2318", size = 2385557, upload-time = "2025-06-28T04:20:38.833Z" },
+    { url = "https://files.pythonhosted.org/packages/63/bf/30b63418279d6fdc4fd4a3781a7976c40c7e8ee052333b9ce6bd4ce63f30/grpcio_tools-1.71.2-cp310-cp310-macosx_10_14_universal2.whl", hash = "sha256:654ecb284a592d39a85556098b8c5125163435472a20ead79b805cf91814b99e", size = 5446915, upload-time = "2025-06-28T04:20:40.947Z" },
+    { url = "https://files.pythonhosted.org/packages/83/cd/2994e0a0a67714fdb00c207c4bec60b9b356fbd6b0b7a162ecaabe925155/grpcio_tools-1.71.2-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:b49aded2b6c890ff690d960e4399a336c652315c6342232c27bd601b3705739e", size = 2348301, upload-time = "2025-06-28T04:20:42.766Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/8b/4f2315927af306af1b35793b332b9ca9dc5b5a2cde2d55811c9577b5f03f/grpcio_tools-1.71.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7811a6fc1c4b4e5438e5eb98dbd52c2dc4a69d1009001c13356e6636322d41a", size = 2742159, upload-time = "2025-06-28T04:20:44.206Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/98/d513f6c09df405c82583e7083c20718ea615ed0da69ec42c80ceae7ebdc5/grpcio_tools-1.71.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393a9c80596aa2b3f05af854e23336ea8c295593bbb35d9adae3d8d7943672bd", size = 2473444, upload-time = "2025-06-28T04:20:45.5Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/fe/00af17cc841916d5e4227f11036bf443ce006629212c876937c7904b0ba3/grpcio_tools-1.71.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:823e1f23c12da00f318404c4a834bb77cd150d14387dee9789ec21b335249e46", size = 2850339, upload-time = "2025-06-28T04:20:46.758Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/59/745fc50dfdbed875fcfd6433883270d39d23fb1aa4ecc9587786f772dce3/grpcio_tools-1.71.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9bfbea79d6aec60f2587133ba766ede3dc3e229641d1a1e61d790d742a3d19eb", size = 3300795, upload-time = "2025-06-28T04:20:48.327Z" },
+    { url = "https://files.pythonhosted.org/packages/62/3e/d9d0fb2df78e601c28d02ef0cd5d007f113c1b04fc21e72bf56e8c3df66b/grpcio_tools-1.71.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:32f3a67b10728835b5ffb63fbdbe696d00e19a27561b9cf5153e72dbb93021ba", size = 2913729, upload-time = "2025-06-28T04:20:49.641Z" },
+    { url = "https://files.pythonhosted.org/packages/09/ae/ddb264b4a10c6c10336a7c177f8738b230c2c473d0c91dd5d8ce8ea1b857/grpcio_tools-1.71.2-cp310-cp310-win32.whl", hash = "sha256:7fcf9d92c710bfc93a1c0115f25e7d49a65032ff662b38b2f704668ce0a938df", size = 945997, upload-time = "2025-06-28T04:20:50.9Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/8d/5efd93698fe359f63719d934ebb2d9337e82d396e13d6bf00f4b06793e37/grpcio_tools-1.71.2-cp310-cp310-win_amd64.whl", hash = "sha256:914b4275be810290266e62349f2d020bb7cc6ecf9edb81da3c5cddb61a95721b", size = 1117474, upload-time = "2025-06-28T04:20:52.54Z" },
+    { url = "https://files.pythonhosted.org/packages/17/e4/0568d38b8da6237ea8ea15abb960fb7ab83eb7bb51e0ea5926dab3d865b1/grpcio_tools-1.71.2-cp311-cp311-linux_armv7l.whl", hash = "sha256:0acb8151ea866be5b35233877fbee6445c36644c0aa77e230c9d1b46bf34b18b", size = 2385557, upload-time = "2025-06-28T04:20:54.323Z" },
+    { url = "https://files.pythonhosted.org/packages/76/fb/700d46f72b0f636cf0e625f3c18a4f74543ff127471377e49a071f64f1e7/grpcio_tools-1.71.2-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:b28f8606f4123edb4e6da281547465d6e449e89f0c943c376d1732dc65e6d8b3", size = 5447590, upload-time = "2025-06-28T04:20:55.836Z" },
+    { url = "https://files.pythonhosted.org/packages/12/69/d9bb2aec3de305162b23c5c884b9f79b1a195d42b1e6dabcc084cc9d0804/grpcio_tools-1.71.2-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:cbae6f849ad2d1f5e26cd55448b9828e678cb947fa32c8729d01998238266a6a", size = 2348495, upload-time = "2025-06-28T04:20:57.33Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/83/f840aba1690461b65330efbca96170893ee02fae66651bcc75f28b33a46c/grpcio_tools-1.71.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4d1027615cfb1e9b1f31f2f384251c847d68c2f3e025697e5f5c72e26ed1316", size = 2742333, upload-time = "2025-06-28T04:20:59.051Z" },
+    { url = "https://files.pythonhosted.org/packages/30/34/c02cd9b37de26045190ba665ee6ab8597d47f033d098968f812d253bbf8c/grpcio_tools-1.71.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bac95662dc69338edb9eb727cc3dd92342131b84b12b3e8ec6abe973d4cbf1b", size = 2473490, upload-time = "2025-06-28T04:21:00.614Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/c7/375718ae091c8f5776828ce97bdcb014ca26244296f8b7f70af1a803ed2f/grpcio_tools-1.71.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c50250c7248055040f89eb29ecad39d3a260a4b6d3696af1575945f7a8d5dcdc", size = 2850333, upload-time = "2025-06-28T04:21:01.95Z" },
+    { url = "https://files.pythonhosted.org/packages/19/37/efc69345bd92a73b2bc80f4f9e53d42dfdc234b2491ae58c87da20ca0ea5/grpcio_tools-1.71.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6ab1ad955e69027ef12ace4d700c5fc36341bdc2f420e87881e9d6d02af3d7b8", size = 3300748, upload-time = "2025-06-28T04:21:03.451Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/1f/15f787eb25ae42086f55ed3e4260e85f385921c788debf0f7583b34446e3/grpcio_tools-1.71.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dd75dde575781262b6b96cc6d0b2ac6002b2f50882bf5e06713f1bf364ee6e09", size = 2913178, upload-time = "2025-06-28T04:21:04.879Z" },
+    { url = "https://files.pythonhosted.org/packages/12/aa/69cb3a9dff7d143a05e4021c3c9b5cde07aacb8eb1c892b7c5b9fb4973e3/grpcio_tools-1.71.2-cp311-cp311-win32.whl", hash = "sha256:9a3cb244d2bfe0d187f858c5408d17cb0e76ca60ec9a274c8fd94cc81457c7fc", size = 946256, upload-time = "2025-06-28T04:21:06.518Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/df/fb951c5c87eadb507a832243942e56e67d50d7667b0e5324616ffd51b845/grpcio_tools-1.71.2-cp311-cp311-win_amd64.whl", hash = "sha256:00eb909997fd359a39b789342b476cbe291f4dd9c01ae9887a474f35972a257e", size = 1117661, upload-time = "2025-06-28T04:21:08.18Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/d3/3ed30a9c5b2424627b4b8411e2cd6a1a3f997d3812dbc6a8630a78bcfe26/grpcio_tools-1.71.2-cp312-cp312-linux_armv7l.whl", hash = "sha256:bfc0b5d289e383bc7d317f0e64c9dfb59dc4bef078ecd23afa1a816358fb1473", size = 2385479, upload-time = "2025-06-28T04:21:10.413Z" },
+    { url = "https://files.pythonhosted.org/packages/54/61/e0b7295456c7e21ef777eae60403c06835160c8d0e1e58ebfc7d024c51d3/grpcio_tools-1.71.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:b4669827716355fa913b1376b1b985855d5cfdb63443f8d18faf210180199006", size = 5431521, upload-time = "2025-06-28T04:21:12.261Z" },
+    { url = "https://files.pythonhosted.org/packages/75/d7/7bcad6bcc5f5b7fab53e6bce5db87041f38ef3e740b1ec2d8c49534fa286/grpcio_tools-1.71.2-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:d4071f9b44564e3f75cdf0f05b10b3e8c7ea0ca5220acbf4dc50b148552eef2f", size = 2350289, upload-time = "2025-06-28T04:21:13.625Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/8a/e4c1c4cb8c9ff7f50b7b2bba94abe8d1e98ea05f52a5db476e7f1c1a3c70/grpcio_tools-1.71.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a28eda8137d587eb30081384c256f5e5de7feda34776f89848b846da64e4be35", size = 2743321, upload-time = "2025-06-28T04:21:15.007Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/aa/95bc77fda5c2d56fb4a318c1b22bdba8914d5d84602525c99047114de531/grpcio_tools-1.71.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b19c083198f5eb15cc69c0a2f2c415540cbc636bfe76cea268e5894f34023b40", size = 2474005, upload-time = "2025-06-28T04:21:16.443Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/ff/ca11f930fe1daa799ee0ce1ac9630d58a3a3deed3dd2f465edb9a32f299d/grpcio_tools-1.71.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:784c284acda0d925052be19053d35afbf78300f4d025836d424cf632404f676a", size = 2851559, upload-time = "2025-06-28T04:21:18.139Z" },
+    { url = "https://files.pythonhosted.org/packages/64/10/c6fc97914c7e19c9bb061722e55052fa3f575165da9f6510e2038d6e8643/grpcio_tools-1.71.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:381e684d29a5d052194e095546eef067201f5af30fd99b07b5d94766f44bf1ae", size = 3300622, upload-time = "2025-06-28T04:21:20.291Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/d6/965f36cfc367c276799b730d5dd1311b90a54a33726e561393b808339b04/grpcio_tools-1.71.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3e4b4801fabd0427fc61d50d09588a01b1cfab0ec5e8a5f5d515fbdd0891fd11", size = 2913863, upload-time = "2025-06-28T04:21:22.196Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/f0/c05d5c3d0c1d79ac87df964e9d36f1e3a77b60d948af65bec35d3e5c75a3/grpcio_tools-1.71.2-cp312-cp312-win32.whl", hash = "sha256:84ad86332c44572305138eafa4cc30040c9a5e81826993eae8227863b700b490", size = 945744, upload-time = "2025-06-28T04:21:23.463Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/e9/c84c1078f0b7af7d8a40f5214a9bdd8d2a567ad6c09975e6e2613a08d29d/grpcio_tools-1.71.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e1108d37eecc73b1c4a27350a6ed921b5dda25091700c1da17cfe30761cd462", size = 1117695, upload-time = "2025-06-28T04:21:25.22Z" },
+    { url = "https://files.pythonhosted.org/packages/60/9c/bdf9c5055a1ad0a09123402d73ecad3629f75b9cf97828d547173b328891/grpcio_tools-1.71.2-cp313-cp313-linux_armv7l.whl", hash = "sha256:b0f0a8611614949c906e25c225e3360551b488d10a366c96d89856bcef09f729", size = 2384758, upload-time = "2025-06-28T04:21:26.712Z" },
+    { url = "https://files.pythonhosted.org/packages/49/d0/6aaee4940a8fb8269c13719f56d69c8d39569bee272924086aef81616d4a/grpcio_tools-1.71.2-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:7931783ea7ac42ac57f94c5047d00a504f72fbd96118bf7df911bb0e0435fc0f", size = 5443127, upload-time = "2025-06-28T04:21:28.383Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/11/50a471dcf301b89c0ed5ab92c533baced5bd8f796abfd133bbfadf6b60e5/grpcio_tools-1.71.2-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:d188dc28e069aa96bb48cb11b1338e47ebdf2e2306afa58a8162cc210172d7a8", size = 2349627, upload-time = "2025-06-28T04:21:30.254Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/66/e3dc58362a9c4c2fbe98a7ceb7e252385777ebb2bbc7f42d5ab138d07ace/grpcio_tools-1.71.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f36c4b3cc42ad6ef67430639174aaf4a862d236c03c4552c4521501422bfaa26", size = 2742932, upload-time = "2025-06-28T04:21:32.325Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/1e/1e07a07ed8651a2aa9f56095411198385a04a628beba796f36d98a5a03ec/grpcio_tools-1.71.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bd9ed12ce93b310f0cef304176049d0bc3b9f825e9c8c6a23e35867fed6affd", size = 2473627, upload-time = "2025-06-28T04:21:33.752Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/f9/3b7b32e4acb419f3a0b4d381bc114fe6cd48e3b778e81273fc9e4748caad/grpcio_tools-1.71.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7ce27e76dd61011182d39abca38bae55d8a277e9b7fe30f6d5466255baccb579", size = 2850879, upload-time = "2025-06-28T04:21:35.241Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/99/cd9e1acd84315ce05ad1fcdfabf73b7df43807cf00c3b781db372d92b899/grpcio_tools-1.71.2-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:dcc17bf59b85c3676818f2219deacac0156492f32ca165e048427d2d3e6e1157", size = 3300216, upload-time = "2025-06-28T04:21:36.826Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/c0/66eab57b14550c5b22404dbf60635c9e33efa003bd747211981a9859b94b/grpcio_tools-1.71.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:706360c71bdd722682927a1fb517c276ccb816f1e30cb71f33553e5817dc4031", size = 2913521, upload-time = "2025-06-28T04:21:38.347Z" },
+    { url = "https://files.pythonhosted.org/packages/05/9b/7c90af8f937d77005625d705ab1160bc42a7e7b021ee5c788192763bccd6/grpcio_tools-1.71.2-cp313-cp313-win32.whl", hash = "sha256:bcf751d5a81c918c26adb2d6abcef71035c77d6eb9dd16afaf176ee096e22c1d", size = 945322, upload-time = "2025-06-28T04:21:39.864Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/80/6db6247f767c94fe551761772f89ceea355ff295fd4574cb8efc8b2d1199/grpcio_tools-1.71.2-cp313-cp313-win_amd64.whl", hash = "sha256:b1581a1133552aba96a730178bc44f6f1a071f0eb81c5b6bc4c0f89f5314e2b8", size = 1117234, upload-time = "2025-06-28T04:21:41.893Z" },
+]
+
 [[package]]
 name = "gunicorn"
 version = "23.0.0"
@@ -3174,6 +3280,13 @@ semantic-router = [
    { name = "aurelio-sdk" },
    { name = "semantic-router" },
 ]
+stt-nvidia-riva = [
+    { name = "audioread" },
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "nvidia-riva-client" },
+    { name = "soundfile" },
+]
 utils = [
    { name = "numpydoc" },
 ]
@@ -3264,6 +3377,7 @@ requires-dist = [
    { name = "aiohttp", specifier = "==3.13.4" },
    { name = "anthropic", extras = ["vertex"], marker = "extra == 'proxy-runtime'", specifier = "==0.84.0" },
    { name = "apscheduler", marker = "extra == 'proxy'", specifier = "==3.11.2" },
+    { name = "audioread", marker = "extra == 'stt-nvidia-riva'", specifier = ">=3.0.1" },
    { name = "aurelio-sdk", marker = "python_full_version < '3.14' and extra == 'semantic-router'", specifier = "==0.0.19" },
    { name = "azure-ai-contentsafety", marker = "extra == 'proxy-runtime'", specifier = "==1.0.0" },
    { name = "azure-identity", marker = "extra == 'extra-proxy'", specifier = "==1.25.2" },
@@ -3300,7 +3414,9 @@ requires-dist = [
    { name = "mangum", marker = "extra == 'proxy-runtime'", specifier = "==0.17.0" },
    { name = "mcp", marker = "extra == 'proxy'", specifier = "==1.26.0" },
    { name = "mlflow", marker = "extra == 'mlflow'", specifier = "==3.11.1" },
+    { name = "numpy", marker = "extra == 'stt-nvidia-riva'", specifier = ">=1.26.0" },
    { name = "numpydoc", marker = "extra == 'utils'", specifier = "==1.8.0" },
+    { name = "nvidia-riva-client", marker = "extra == 'stt-nvidia-riva'", specifier = ">=2.15.0" },
    { name = "openai", specifier = "==2.33.0" },
    { name = "opentelemetry-api", marker = "extra == 'proxy-runtime'", specifier = "==1.28.0" },
    { name = "opentelemetry-exporter-otlp", marker = "extra == 'proxy-runtime'", specifier = "==1.28.0" },
@@ -3325,13 +3441,14 @@ requires-dist = [
    { name = "semantic-router", marker = "python_full_version < '3.14' and extra == 'semantic-router'", specifier = "==0.1.12" },
    { name = "sentry-sdk", marker = "extra == 'proxy-runtime'", specifier = "==2.21.0" },
    { name = "soundfile", marker = "extra == 'proxy'", specifier = "==0.12.1" },
+    { name = "soundfile", marker = "extra == 'stt-nvidia-riva'", specifier = ">=0.12.1" },
    { name = "tiktoken", specifier = "==0.12.0" },
    { name = "tokenizers", specifier = "==0.23.1" },
    { name = "uvicorn", marker = "extra == 'proxy'", specifier = "==0.33.0" },
    { name = "uvloop", marker = "sys_platform != 'win32' and extra == 'proxy'", specifier = "==0.21.0" },
    { name = "websockets", marker = "extra == 'proxy'", specifier = "==15.0.1" },
 ]
-provides-extras = ["proxy", "extra-proxy", "utils", "caching", "semantic-router", "mlflow", "grpc", "google", "proxy-runtime"]
+provides-extras = ["proxy", "extra-proxy", "utils", "caching", "semantic-router", "mlflow", "grpc", "stt-nvidia-riva", "google", "proxy-runtime"]

 [package.metadata.requires-dev]
 ci = [
@@ -4156,6 +4273,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/6c/45/56d99ba9366476cd8548527667f01869279cedb9e66b28eb4dfb27701679/numpydoc-1.8.0-py3-none-any.whl", hash = "sha256:72024c7fd5e17375dec3608a27c03303e8ad00c81292667955c6fea7a3ccf541", size = 64003, upload-time = "2024-08-09T15:52:37.276Z" },
 ]

+[[package]]
+name = "nvidia-riva-client"
+version = "2.16.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "grpcio-tools" },
+    { name = "setuptools" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/82/0484c225bebe7ed37334474fba5c6ac7228638e692b84da0a0e7f2395672/nvidia_riva_client-2.16.0-py3-none-any.whl", hash = "sha256:99ef37b8f487d75a70c053736848221e09b728e5c910fb476333d375bd4347a3", size = 45491, upload-time = "2024-07-02T14:54:22.63Z" },
+]
+
 [[package]]
 name = "oauthlib"
 version = "3.3.1"
@@ -7068,6 +7197,40 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/ff/07/45c21ed03d708c477367305726b89919b020a3a2a01f72aaf5ad941caf35/sse_starlette-3.4.1-py3-none-any.whl", hash = "sha256:6b43cf21f1d574d582a6e1b0cfbde1c94dc86a32a701a7168c99c4475c6bd1d0", size = 16487, upload-time = "2026-04-26T13:32:30.819Z" },
 ]

+[[package]]
+name = "standard-aifc"
+version = "3.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "audioop-lts", marker = "python_full_version >= '3.13'" },
+    { name = "standard-chunk", marker = "python_full_version >= '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c4/53/6050dc3dde1671eb3db592c13b55a8005e5040131f7509cef0215212cb84/standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43", size = 15240, upload-time = "2024-10-30T16:01:31.772Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/52/5fbb203394cc852334d1575cc020f6bcec768d2265355984dfd361968f36/standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66", size = 10492, upload-time = "2024-10-30T16:01:07.071Z" },
+]
+
+[[package]]
+name = "standard-chunk"
+version = "3.13.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/06/ce1bb165c1f111c7d23a1ad17204d67224baa69725bb6857a264db61beaf/standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654", size = 4672, upload-time = "2024-10-30T16:18:28.326Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/90/a5c1084d87767d787a6caba615aa50dc587229646308d9420c960cb5e4c0/standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c", size = 4944, upload-time = "2024-10-30T16:18:26.694Z" },
+]
+
+[[package]]
+name = "standard-sunau"
+version = "3.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "audioop-lts", marker = "python_full_version >= '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/e3/ce8d38cb2d70e05ffeddc28bb09bad77cfef979eb0a299c9117f7ed4e6a9/standard_sunau-3.13.0.tar.gz", hash = "sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908", size = 9368, upload-time = "2024-10-30T16:01:41.626Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/34/ae/e3707f6c1bc6f7aa0df600ba8075bfb8a19252140cd595335be60e25f9ee/standard_sunau-3.13.0-py3-none-any.whl", hash = "sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622", size = 7364, upload-time = "2024-10-30T16:01:28.003Z" },
+]
+
 [[package]]
 name = "starlette"
 version = "0.50.0"