diff --git a/litellm/__init__.py b/litellm/__init__.py index 5305edc9be..e61ef25057 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -586,6 +586,7 @@ anyscale_models: Set = set() cerebras_models: Set = set() galadriel_models: Set = set() nvidia_nim_models: Set = set() +nvidia_riva_models: Set = set() sambanova_models: Set = set() sambanova_embedding_models: Set = set() novita_models: Set = set() @@ -812,6 +813,8 @@ def add_known_models(model_cost_map: Optional[Dict] = None): galadriel_models.add(key) elif value.get("litellm_provider") == "nvidia_nim": nvidia_nim_models.add(key) + elif value.get("litellm_provider") == "nvidia_riva": + nvidia_riva_models.add(key) elif value.get("litellm_provider") == "sambanova": sambanova_models.add(key) elif value.get("litellm_provider") == "sambanova-embedding-models": @@ -971,6 +974,7 @@ model_list = list( | cerebras_models | galadriel_models | nvidia_nim_models + | nvidia_riva_models | sambanova_models | azure_text_models | novita_models @@ -1067,6 +1071,7 @@ models_by_provider: dict = { "cerebras": cerebras_models, "galadriel": galadriel_models, "nvidia_nim": nvidia_nim_models, + "nvidia_riva": nvidia_riva_models, "sambanova": sambanova_models | sambanova_embedding_models, "novita": novita_models, "nebius": nebius_models | nebius_embedding_models, @@ -1618,6 +1623,9 @@ if TYPE_CHECKING: from .llms.deepgram.audio_transcription.transformation import ( DeepgramAudioTranscriptionConfig as DeepgramAudioTranscriptionConfig, ) + from .llms.nvidia_riva.audio_transcription.transformation import ( + NvidiaRivaAudioTranscriptionConfig as NvidiaRivaAudioTranscriptionConfig, + ) from .llms.topaz.image_variations.transformation import ( TopazImageVariationConfig as TopazImageVariationConfig, ) diff --git a/litellm/litellm_core_utils/get_llm_provider_logic.py b/litellm/litellm_core_utils/get_llm_provider_logic.py index c0ca6835ee..ba6d438f16 100644 --- a/litellm/litellm_core_utils/get_llm_provider_logic.py +++ 
b/litellm/litellm_core_utils/get_llm_provider_logic.py @@ -621,6 +621,18 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 or "https://integrate.api.nvidia.com/v1" ) # type: ignore dynamic_api_key = api_key or get_secret_str("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "nvidia_riva": + # NVIDIA Riva is gRPC-based; api_base must be a host:port like + # `grpc.nvcf.nvidia.com:443` or `localhost:50051`. There is no + # public-default endpoint, so we do not fill one in here. + api_base = api_base or get_secret_str("NVIDIA_RIVA_API_BASE") # type: ignore + # Fall back to NVIDIA_NIM_API_KEY because users running both NVCF + # services typically reuse the same nvapi-* key. + dynamic_api_key = ( + api_key + or get_secret_str("NVIDIA_RIVA_API_KEY") + or get_secret_str("NVIDIA_NIM_API_KEY") + ) elif custom_llm_provider == "cerebras": api_base = ( api_base or get_secret("CEREBRAS_API_BASE") or "https://api.cerebras.ai/v1" diff --git a/litellm/llms/nvidia_riva/__init__.py b/litellm/llms/nvidia_riva/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/litellm/llms/nvidia_riva/audio_transcription/__init__.py b/litellm/llms/nvidia_riva/audio_transcription/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/litellm/llms/nvidia_riva/audio_transcription/audio_utils.py b/litellm/llms/nvidia_riva/audio_transcription/audio_utils.py new file mode 100644 index 0000000000..253d6d2f73 --- /dev/null +++ b/litellm/llms/nvidia_riva/audio_transcription/audio_utils.py @@ -0,0 +1,232 @@ +""" +Audio resampling utilities for the NVIDIA Riva STT provider. + +We intentionally avoid a hard dependency on ``ffmpeg`` so this works in +slim Python environments. Format coverage: + +- ``soundfile`` handles wav / flac / ogg out of the box (libsndfile). +- ``audioread`` is tried for everything ``soundfile`` cannot decode (mp3, + m4a, mp4, webm, ...). This is a soft optional dependency. 
+ +If neither library can decode the input we raise a clear error instructing +the caller to convert the audio upstream. +""" + +import io +import os +import tempfile +from dataclasses import dataclass +from typing import Any, Tuple, cast + +from litellm.llms.nvidia_riva.audio_transcription.transformation import ( + RIVA_TARGET_NUM_CHANNELS, + RIVA_TARGET_SAMPLE_RATE_HZ, +) +from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException + +# Keep this as Any: the module intentionally avoids importing numpy at module +# import time (optional dependency), and project-wide mypy config evaluates this +# file in contexts where conditional type aliases can degrade to "FloatArray?". +FloatArray = Any + + +_INSTALL_HINT = ( + "Install Riva STT extras to enable automatic audio resampling: " + "`pip install 'litellm[stt-nvidia-riva]'`" +) + + +@dataclass +class ResampledAudio: + pcm_bytes: bytes + duration_seconds: float + sample_rate_hz: int + num_channels: int + + +def resample_to_riva_pcm(file_bytes: bytes) -> ResampledAudio: + """ + Decode ``file_bytes`` and produce 16 kHz mono LINEAR_PCM (int16 little + endian) suitable for streaming to Riva, plus the audio duration in + seconds (used for cost calculation when Riva does not return usage). + """ + try: + import numpy as np # type: ignore + except ImportError as e: + raise NvidiaRivaException( + status_code=500, + message=f"numpy is required for Riva audio resampling. {_INSTALL_HINT}", + ) from e + + samples_float, source_rate = _decode_to_float32(file_bytes) + + # Downmix to mono by averaging channels. 
+ if samples_float.ndim == 2 and samples_float.shape[1] > 1: + samples_float = samples_float.mean(axis=1) + elif samples_float.ndim == 2: + samples_float = samples_float[:, 0] + + samples_float = np.asarray(samples_float, dtype=np.float32).ravel() + + if source_rate != RIVA_TARGET_SAMPLE_RATE_HZ: + samples_float = _resample( + samples_float, source_rate, RIVA_TARGET_SAMPLE_RATE_HZ + ) + + # Clip + convert float [-1, 1] to int16 little-endian PCM. + np.clip(samples_float, -1.0, 1.0, out=samples_float) + pcm_int16 = (samples_float * 32767.0).astype(" Tuple["FloatArray", int]: + """ + Decode arbitrary audio bytes into a float32 array shaped either + ``(n_samples,)`` (mono) or ``(n_samples, n_channels)`` plus the source + sample rate. + + Tries ``soundfile`` first (wav/flac/ogg), then falls back to + ``audioread`` for compressed formats. Raises a clear error if neither + works. + """ + import numpy as np # type: ignore + + sf_error: Exception | None = None + try: + import soundfile as sf # type: ignore + + with io.BytesIO(file_bytes) as buf: + data, source_rate = sf.read(buf, dtype="float32", always_2d=False) + return cast("FloatArray", data), int(source_rate) + except ImportError as e: + sf_error = e + except Exception as e: + # soundfile raises RuntimeError / LibsndfileError for formats it + # cannot decode (mp3 on older libsndfile, m4a, webm, ...). + sf_error = e + + try: + import audioread # type: ignore + except ImportError as e: + raise NvidiaRivaException( + status_code=400, + message=( + "Could not decode audio for Riva STT. Install audio extras " + f"(`pip install 'litellm[stt-nvidia-riva]'`) or convert your " + f"audio to wav/flac/ogg before calling the API. " + f"Underlying error: {sf_error}" + ), + ) from e + + # audioread backends (FFmpeg subprocess, GStreamer, Core Audio) require a + # filesystem path, so spill the bytes to a temp file. mkstemp is portable + # to Windows where re-opening a NamedTemporaryFile is not allowed. 
def _resample(
    samples: "FloatArray", source_rate: int, target_rate: int
) -> "FloatArray":
    """
    Resample mono float32 ``samples`` from ``source_rate`` to ``target_rate``.

    Backends are tried in order of quality; each is an optional dependency
    and is skipped when its import fails:

    1. ``soxr.resample``
    2. ``scipy.signal.resample_poly`` (polyphase, with the up/down factors
       reduced by their GCD)
    3. :func:`_linear_resample` (plain linear interpolation, no
       anti-aliasing filter, so high-frequency content can alias when
       downsampling)

    Args:
        samples: 1-D array of audio samples.
        source_rate: Sample rate of ``samples`` in Hz.
        target_rate: Desired output sample rate in Hz.

    Returns:
        ``samples`` itself when no work is needed (equal rates or empty
        input); otherwise a new float32 array at ``target_rate``.
    """
    import numpy as np  # type: ignore

    # Nothing to do: hand the caller's array back untouched.
    if source_rate == target_rate or samples.size == 0:
        return samples

    try:
        import soxr  # type: ignore

        return cast(
            "FloatArray",
            np.asarray(
                soxr.resample(samples, source_rate, target_rate), dtype=np.float32
            ),
        )
    except ImportError:
        pass

    try:
        from math import gcd

        from scipy.signal import resample_poly  # type: ignore

        # Reduce the rate ratio so resample_poly gets the smallest integer
        # up/down factors (e.g. 48000 -> 16000 becomes up=1, down=3).
        g = gcd(int(source_rate), int(target_rate))
        up = int(target_rate) // g
        down = int(source_rate) // g
        return cast(
            "FloatArray", np.asarray(resample_poly(samples, up, down), dtype=np.float32)
        )
    except ImportError:
        pass

    return _linear_resample(samples, source_rate, target_rate)


def _linear_resample(
    samples: "FloatArray", source_rate: int, target_rate: int
) -> "FloatArray":
    """
    Linear-interpolation fallback used when neither soxr nor scipy is installed.

    Output sample ``k`` is evaluated at time ``k / target_rate``, i.e. at the
    fractional source index ``k * source_rate / target_rate``. Positions past
    the last source sample are clamped to that sample's value, so the time
    axis is never stretched to make the endpoints line up.

    Args:
        samples: 1-D array of audio samples.
        source_rate: Sample rate of ``samples`` in Hz.
        target_rate: Desired output sample rate in Hz.

    Returns:
        A float32 array with ``round(len(samples) * target_rate / source_rate)``
        samples (at least one sample for non-empty input; empty input yields
        an empty array).
    """
    import numpy as np  # type: ignore

    source = np.asarray(samples, dtype=np.float32)
    if source.size == 0:
        return source

    duration = source.size / float(source_rate)
    # Never return an empty buffer for non-empty audio, however short it is.
    target_length = max(1, int(round(duration * target_rate)))

    step = float(source_rate) / float(target_rate)
    positions = np.arange(target_length, dtype=np.float64) * step
    source_index = np.arange(source.size, dtype=np.float64)

    return cast(
        "FloatArray",
        np.interp(positions, source_index, source).astype(np.float32),
    )
+ +The handler is intentionally a thin orchestration layer: + +1. Resample the inbound audio to 16 kHz mono LINEAR_PCM (Riva's required + wire format). +2. Build ``RecognitionConfig`` / ``StreamingRecognitionConfig`` protobufs + from the structured dict produced by + :class:`NvidiaRivaAudioTranscriptionConfig`. +3. Construct ``riva.client.Auth`` honoring NVCF (function-id metadata + TLS) + vs self-hosted (any host:port, optional TLS) modes. +4. Stream the audio through Riva's ``streaming_response_generator`` and + aggregate ``is_final`` results into a single transcript. +5. Return a normalized ``TranscriptionResponse`` with ``duration`` exposed + on ``_hidden_params`` so cost calculation works. + +``riva-client`` is imported lazily so ``litellm`` core remains usable +without the optional STT extras installed. +""" + +import asyncio +import inspect +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from litellm.litellm_core_utils.audio_utils.utils import ( + get_audio_file_name, + process_audio_file, +) +from litellm.llms.nvidia_riva.audio_transcription.audio_utils import ( + resample_to_riva_pcm, +) +from litellm.llms.nvidia_riva.audio_transcription.transformation import ( + NvidiaRivaAudioTranscriptionConfig, + RIVA_TARGET_NUM_CHANNELS, + RIVA_TARGET_SAMPLE_RATE_HZ, +) +from litellm.llms.nvidia_riva.common_utils import ( + NvidiaRivaException, + grpc_error_to_litellm_exception, +) +from litellm.types.utils import FileTypes, TranscriptionResponse +from litellm.utils import convert_to_model_response_object + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import ( + Logging as LiteLLMLoggingObj, + ) + +# Stream audio to Riva in ~50 ms slices (1600 samples at 16 kHz). Matches +# NVIDIA's recommended chunk size for streaming ASR — small enough for +# responsive endpointing, large enough to keep per-RPC overhead low. 
_DEFAULT_CHUNK_SAMPLES = 1600
_DEFAULT_CHUNK_BYTES = _DEFAULT_CHUNK_SAMPLES * 2  # int16 = 2 bytes/sample


_RIVA_INSTALL_HINT = (
    "NVIDIA Riva client is not installed. "
    "Install with `pip install 'litellm[stt-nvidia-riva]'`."
)


class NvidiaRivaAudioTranscription:
    """Sync + async entry point for Riva ASR over gRPC."""

    def audio_transcriptions(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
        model_response: TranscriptionResponse,
        timeout: float,
        logging_obj: "LiteLLMLoggingObj",
        api_key: Optional[str],
        api_base: Optional[str],
        atranscription: bool = False,
        provider_config: Optional[NvidiaRivaAudioTranscriptionConfig] = None,
    ):
        """
        Dispatch a transcription request.

        Returns a coroutine (from :meth:`async_audio_transcriptions`) when
        ``atranscription`` is true, otherwise runs the request synchronously
        and returns the ``TranscriptionResponse``. A default
        ``NvidiaRivaAudioTranscriptionConfig`` is created when
        ``provider_config`` is not supplied.
        """
        if provider_config is None:
            provider_config = NvidiaRivaAudioTranscriptionConfig()

        if atranscription:
            return self.async_audio_transcriptions(
                model=model,
                audio_file=audio_file,
                optional_params=optional_params,
                litellm_params=litellm_params,
                model_response=model_response,
                timeout=timeout,
                logging_obj=logging_obj,
                api_key=api_key,
                api_base=api_base,
                provider_config=provider_config,
            )

        return self._run_sync(
            model=model,
            audio_file=audio_file,
            optional_params=optional_params,
            litellm_params=litellm_params,
            model_response=model_response,
            timeout=timeout,
            logging_obj=logging_obj,
            api_key=api_key,
            api_base=api_base,
            provider_config=provider_config,
            atranscription=atranscription,
        )

    async def async_audio_transcriptions(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
        model_response: TranscriptionResponse,
        timeout: float,
        logging_obj: "LiteLLMLoggingObj",
        api_key: Optional[str],
        api_base: Optional[str],
        provider_config: Optional[NvidiaRivaAudioTranscriptionConfig] = None,
    ) -> TranscriptionResponse:
        """
        Async variant: runs :meth:`_run_sync` in a worker thread.

        ``riva-client`` exposes a sync streaming generator, so the blocking
        call is offloaded with ``asyncio.to_thread`` to keep the event loop
        free.
        """
        return await asyncio.to_thread(
            self._run_sync,
            model=model,
            audio_file=audio_file,
            optional_params=optional_params,
            litellm_params=litellm_params,
            model_response=model_response,
            timeout=timeout,
            logging_obj=logging_obj,
            api_key=api_key,
            api_base=api_base,
            provider_config=provider_config or NvidiaRivaAudioTranscriptionConfig(),
            atranscription=True,
        )

    def _run_sync(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
        model_response: TranscriptionResponse,
        timeout: float,
        logging_obj: "LiteLLMLoggingObj",
        api_key: Optional[str],
        api_base: Optional[str],
        provider_config: NvidiaRivaAudioTranscriptionConfig,
        atranscription: bool = False,
    ) -> TranscriptionResponse:
        """
        Execute one transcription end to end (blocking).

        Steps: validate ``api_base``, resample the audio to Riva's wire
        format, build the recognition config, stream the PCM chunks to Riva,
        collect final results, and convert them into a
        ``TranscriptionResponse``.

        Raises:
            NvidiaRivaException: when ``api_base`` is missing, when the
                request payload has an unexpected type, or for any error
                raised while streaming (converted through
                ``grpc_error_to_litellm_exception``).
        """
        if not api_base:
            raise NvidiaRivaException(
                status_code=400,
                message=(
                    "NVIDIA Riva requires `api_base` (host:port for the gRPC "
                    "endpoint, e.g. `grpc.nvcf.nvidia.com:443` or "
                    "`localhost:50051`). Set it in litellm_params or via "
                    "NVIDIA_RIVA_API_BASE."
                ),
            )

        processed = process_audio_file(audio_file)
        resampled = resample_to_riva_pcm(processed.file_content)

        request_payload = provider_config.transform_audio_transcription_request(
            model=model,
            audio_file=audio_file,
            optional_params=optional_params,
            litellm_params={
                **litellm_params,
                "api_base": api_base,
                "api_key": api_key,
            },
        ).data
        if not isinstance(request_payload, dict):
            raise NvidiaRivaException(
                status_code=500,
                message="NvidiaRivaAudioTranscriptionConfig produced an unexpected request payload type.",
            )

        recognition_config_dict: Dict[str, Any] = request_payload["recognition_config"]
        # The wire format is fixed by our resampler; override anything stale
        # the caller passed in so the gRPC config matches the bytes we send.
        recognition_config_dict["sample_rate_hertz"] = RIVA_TARGET_SAMPLE_RATE_HZ
        recognition_config_dict["audio_channel_count"] = RIVA_TARGET_NUM_CHANNELS
        recognition_config_dict["encoding"] = "LINEAR_PCM"

        response_format = request_payload.get("response_format") or "json"
        timestamp_granularities = request_payload.get("timestamp_granularities")

        riva_module, riva_asr_module = _import_riva()
        auth_obj = self._construct_auth(
            riva_module=riva_module,
            api_base=api_base,
            api_key=api_key,
            optional_params=optional_params,
        )

        recognition_config = self._build_recognition_config_proto(
            riva_asr_module=riva_asr_module,
            recognition_config_dict=recognition_config_dict,
        )
        streaming_config = riva_asr_module.StreamingRecognitionConfig(
            config=recognition_config, interim_results=False
        )

        logging_obj.pre_call(
            input=None,
            api_key=api_key,
            additional_args={
                "api_base": api_base,
                "atranscription": atranscription,
                "complete_input_dict": {
                    "recognition_config": recognition_config_dict,
                    # Log only whether a function id was supplied, not its value.
                    "nvcf_function_id_set": bool(
                        optional_params.get("nvcf_function_id")
                    ),
                    "use_ssl": optional_params.get("use_ssl"),
                },
            },
        )

        try:
            asr_service = riva_module.ASRService(auth_obj)
            audio_chunks = self._iter_audio_chunks(resampled.pcm_bytes)
            stream_kwargs: Dict[str, Any] = {
                "audio_chunks": audio_chunks,
                "streaming_config": streaming_config,
            }
            # Forward the deadline so the stream cannot block forever if the
            # server stalls. Older riva-client versions do not accept a
            # ``timeout`` kwarg, so pass it only when supported.
            if timeout is not None and self._supports_timeout_kwarg(
                asr_service.streaming_response_generator
            ):
                stream_kwargs["timeout"] = float(timeout)
            stream = asr_service.streaming_response_generator(**stream_kwargs)
            final_results = self._collect_final_results(stream)
        except NvidiaRivaException:
            raise
        except Exception as e:
            raise grpc_error_to_litellm_exception(e) from e

        transcription = NvidiaRivaAudioTranscriptionConfig.build_transcription_response(
            final_results=final_results,
            response_format=response_format,
            duration_seconds=resampled.duration_seconds,
            timestamp_granularities=timestamp_granularities,
        )

        stringified_response = dict(transcription)

        logging_obj.post_call(
            input=get_audio_file_name(audio_file),
            api_key=api_key,
            additional_args={"complete_input_dict": recognition_config_dict},
            original_response=stringified_response,
        )

        hidden_params = {
            "model": model,
            "custom_llm_provider": "nvidia_riva",
            "audio_transcription_duration": resampled.duration_seconds,
        }

        final_response: TranscriptionResponse = convert_to_model_response_object(  # type: ignore
            response_object=stringified_response,
            model_response_object=model_response,
            hidden_params=hidden_params,
            response_type="audio_transcription",
        )

        return final_response

    @staticmethod
    def _coerce_flag(value: Any) -> bool:
        """
        Interpret ``value`` as a boolean flag.

        Strings are compared case-insensitively against the usual "false"
        spellings, because ``bool("false")`` is ``True``; flags that arrive
        as text would otherwise be silently inverted. Every other value
        keeps Python truthiness.
        """
        if isinstance(value, str):
            return value.strip().lower() not in ("", "0", "false", "no", "off")
        return bool(value)

    def _construct_auth(
        self,
        riva_module: Any,
        api_base: str,
        api_key: Optional[str],
        optional_params: dict,
    ) -> Any:
        """
        Build a ``riva.client.Auth`` object.

        - When ``nvcf_function_id`` is provided we attach the NVCF
          ``function-id`` metadata and default ``use_ssl`` to True.
        - Otherwise (self-hosted) ``use_ssl`` defaults to False.
        - An explicit ``use_ssl`` in ``optional_params`` always wins; string
          values such as ``"false"`` are honored as False.
        - ``api_key``, when set, is sent as bearer ``authorization`` metadata
          in both modes.
        """
        nvcf_function_id = optional_params.get("nvcf_function_id")
        use_ssl_override = optional_params.get("use_ssl")
        if use_ssl_override is not None:
            use_ssl = self._coerce_flag(use_ssl_override)
        else:
            use_ssl = bool(nvcf_function_id)

        metadata: List[Tuple[str, str]] = []
        if nvcf_function_id:
            metadata.append(("function-id", str(nvcf_function_id)))
        if api_key:
            metadata.append(("authorization", f"Bearer {api_key}"))

        try:
            return riva_module.Auth(
                uri=api_base, use_ssl=use_ssl, metadata_args=metadata
            )
        except TypeError:
            # Fallback for riva-client signatures that reject these keyword
            # names: pass the same values positionally.
            return riva_module.Auth(None, use_ssl, api_base, metadata)

    def _build_recognition_config_proto(
        self, riva_asr_module: Any, recognition_config_dict: Dict[str, Any]
    ):
        """
        Turn the plain recognition-config dict into a Riva
        ``RecognitionConfig`` protobuf.

        Unknown encoding names fall back to ``LINEAR_PCM``. Boolean options
        go through :meth:`_coerce_flag` so textual ``"false"`` is respected.
        An ``endpointing_config`` that cannot be applied is logged and
        dropped so Riva's defaults are used instead of failing the request.
        """
        encoding_name = (
            recognition_config_dict.get("encoding") or "LINEAR_PCM"
        ).upper()
        encoding_enum = getattr(
            riva_asr_module.AudioEncoding,
            encoding_name,
            riva_asr_module.AudioEncoding.LINEAR_PCM,
        )
        flag = self._coerce_flag

        config = riva_asr_module.RecognitionConfig(
            encoding=encoding_enum,
            sample_rate_hertz=int(recognition_config_dict["sample_rate_hertz"]),
            language_code=recognition_config_dict["language_code"],
            audio_channel_count=int(recognition_config_dict["audio_channel_count"]),
            enable_automatic_punctuation=flag(
                recognition_config_dict.get("enable_automatic_punctuation", True)
            ),
            enable_word_time_offsets=flag(
                recognition_config_dict.get("enable_word_time_offsets", False)
            ),
            max_alternatives=int(recognition_config_dict.get("max_alternatives", 1)),
            model=recognition_config_dict.get("model", "") or "",
            verbatim_transcripts=flag(
                recognition_config_dict.get("verbatim_transcripts", False)
            ),
            profanity_filter=flag(
                recognition_config_dict.get("profanity_filter", False)
            ),
        )

        endpointing = recognition_config_dict.get("endpointing_config")
        if isinstance(endpointing, dict) and endpointing:
            try:
                ep = riva_asr_module.EndpointingConfig(**endpointing)
                config.endpointing_config.CopyFrom(ep)
            except Exception as e:
                # Deliberately best-effort: an unsupported EndpointingConfig
                # field must not fail the whole request, but the caller
                # should be able to see that their setting was ignored.
                from litellm._logging import verbose_logger

                verbose_logger.warning(
                    "NVIDIA Riva: ignoring endpointing_config %s (%s); "
                    "falling back to server defaults.",
                    endpointing,
                    e,
                )

        return config

    @staticmethod
    def _supports_timeout_kwarg(callable_obj: Any) -> bool:
        """
        Return True when ``callable_obj`` can be called with ``timeout=...``:
        either it declares a ``timeout`` parameter or it accepts ``**kwargs``.
        Callables whose signature cannot be inspected yield False.
        """
        try:
            sig = inspect.signature(callable_obj)
        except (TypeError, ValueError):
            return False
        params = sig.parameters
        if "timeout" in params:
            return True
        return any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())

    @staticmethod
    def _iter_audio_chunks(pcm_bytes: bytes):
        """
        Yield ``pcm_bytes`` in consecutive slices of ``_DEFAULT_CHUNK_BYTES``
        bytes; the last slice may be shorter. Empty input yields nothing.
        """
        for offset in range(0, len(pcm_bytes), _DEFAULT_CHUNK_BYTES):
            yield pcm_bytes[offset : offset + _DEFAULT_CHUNK_BYTES]

    @staticmethod
    def _collect_final_results(stream) -> List[Dict[str, Any]]:
        """
        Walk the response stream and return normalized final results.

        Responses without ``results``, results that are not ``is_final``, and
        results without alternatives are skipped. For each remaining result
        only the first alternative is kept, as
        ``{"transcript": str, "words": [{"word", "start_time_ms", "end_time_ms"}]}``.
        """
        final_results: List[Dict[str, Any]] = []
        for response in stream:
            for result in getattr(response, "results", None) or []:
                if not getattr(result, "is_final", False):
                    continue
                alternatives = getattr(result, "alternatives", None) or []
                if not alternatives:
                    continue
                top = alternatives[0]
                words = [
                    {
                        "word": getattr(word, "word", ""),
                        "start_time_ms": int(getattr(word, "start_time", 0) or 0),
                        "end_time_ms": int(getattr(word, "end_time", 0) or 0),
                    }
                    for word in (getattr(top, "words", None) or [])
                ]
                final_results.append(
                    {
                        "transcript": getattr(top, "transcript", "") or "",
                        "words": words,
                    }
                )
        return final_results
+ + We try the SDK first (preferred) and fall back to importing the proto + module separately when the SDK packaging changes between versions. + """ + try: + import riva.client as riva_client # type: ignore + except ImportError as e: + raise NvidiaRivaException(status_code=500, message=_RIVA_INSTALL_HINT) from e + + riva_asr_module = riva_client + if not hasattr(riva_asr_module, "RecognitionConfig"): + try: + import riva.client.proto.riva_asr_pb2 as riva_asr_pb2 # type: ignore + + riva_asr_module = riva_asr_pb2 + except ImportError as e: + raise NvidiaRivaException( + status_code=500, message=_RIVA_INSTALL_HINT + ) from e + + return riva_client, riva_asr_module diff --git a/litellm/llms/nvidia_riva/audio_transcription/transformation.py b/litellm/llms/nvidia_riva/audio_transcription/transformation.py new file mode 100644 index 0000000000..c2dfc25d94 --- /dev/null +++ b/litellm/llms/nvidia_riva/audio_transcription/transformation.py @@ -0,0 +1,284 @@ +""" +Translates from OpenAI's `/v1/audio/transcriptions` to NVIDIA Riva's gRPC +streaming recognize API. + +Riva is gRPC-only, so unlike most providers in this directory the request +"transformation" produced here is a structured dict consumed directly by the +gRPC handler (rather than HTTP form-data). The handler builds Riva +``RecognitionConfig`` / ``StreamingRecognitionConfig`` protobufs from this +dict at call time. 
# Riva expects a fixed wire format for the audio chunks we stream in.
RIVA_TARGET_SAMPLE_RATE_HZ = 16000
RIVA_TARGET_NUM_CHANNELS = 1
RIVA_TARGET_ENCODING = "LINEAR_PCM"


class NvidiaRivaAudioTranscriptionConfig(BaseAudioTranscriptionConfig):
    """
    Config for NVIDIA Riva ASR (gRPC).

    Supports both NVCF-hosted (``api_base=grpc.nvcf.nvidia.com:443`` +
    ``nvcf_function_id``) and self-hosted deployments (any ``host:port``,
    optional TLS via ``use_ssl``).
    """

    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAIAudioTranscriptionOptionalParams]:
        """Return the OpenAI transcription params this provider maps."""
        # `language` and `timestamp_granularities` map onto Riva settings;
        # `response_format` is applied when the response is shaped in
        # `build_transcription_response`.
        return ["language", "response_format", "timestamp_granularities"]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Copy ``non_default_params`` into ``optional_params`` using Riva's
        names. ``None`` values are skipped; keys without a special mapping
        are passed through unchanged. Mutates and returns ``optional_params``.
        """
        for key, value in non_default_params.items():
            if value is None:
                continue

            if key == "language":
                optional_params["language_code"] = self._normalize_language_code(value)
            elif key == "timestamp_granularities":
                # Word timing is switched on whenever "word" is requested.
                # The requested list is stored as-is; note that
                # `build_transcription_response` currently emits word-level
                # timestamps only (no segment entries).
                if isinstance(value, list) and "word" in value:
                    optional_params["enable_word_time_offsets"] = True
                optional_params["timestamp_granularities"] = value
            elif key == "response_format":
                # Stored verbatim; consumed by build_transcription_response.
                optional_params["response_format"] = value
            else:
                optional_params[key] = value

        return optional_params

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, Headers]
    ) -> BaseLLMException:
        """Wrap an error in the provider-specific ``NvidiaRivaException``."""
        return NvidiaRivaException(
            message=error_message, status_code=status_code, headers=headers
        )

    def transform_audio_transcription_request(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
    ) -> AudioTranscriptionRequestData:
        """
        Build a structured dict that the gRPC handler consumes. We do *not*
        construct protobufs here, so this module remains importable without
        ``nvidia-riva-client`` being installed.

        The returned ``data`` dict has the keys ``recognition_config``,
        ``response_format`` (defaults to ``"json"``) and
        ``timestamp_granularities``; ``files`` is always ``None``.
        """
        recognition_config = self._build_recognition_config_dict(
            model=model,
            optional_params=optional_params,
        )

        endpointing_config = self._build_endpointing_config_dict(optional_params)
        if endpointing_config is not None:
            recognition_config["endpointing_config"] = endpointing_config

        request_payload: Dict[str, Any] = {
            "recognition_config": recognition_config,
            "response_format": optional_params.get("response_format") or "json",
            "timestamp_granularities": optional_params.get("timestamp_granularities"),
        }

        return AudioTranscriptionRequestData(data=request_payload, files=None)

    def transform_audio_transcription_response(
        self,
        raw_response: Response,
    ) -> TranscriptionResponse:
        """Always raises: Riva results never arrive as an httpx response."""
        # Not used: Riva responses come from a gRPC stream, not an httpx
        # response. The handler calls build_transcription_response directly.
        raise NotImplementedError(
            "NvidiaRivaAudioTranscriptionConfig.transform_audio_transcription_response "
            "is not used. The handler builds the TranscriptionResponse directly "
            "from Riva's gRPC streaming results."
        )

    def validate_environment(
        self,
        headers: dict,
        model: str,
        messages: List[AllMessageValues],
        optional_params: dict,
        litellm_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """Return ``headers`` unchanged."""
        # gRPC auth is constructed in the handler, not via HTTP headers.
        return headers

    def _build_recognition_config_dict(
        self, model: str, optional_params: dict
    ) -> Dict[str, Any]:
        """
        Build the Riva ``RecognitionConfig`` shape as a plain dict.

        Every field is read from ``optional_params`` with a default. The
        ``model`` field comes from ``riva_model_name`` (not from the
        ``model`` argument) and is left empty when the user has not supplied
        it, so no deployment name is hardcoded here.
        """
        return {
            "language_code": optional_params.get("language_code", "en-US"),
            "sample_rate_hertz": optional_params.get(
                "sample_rate_hertz", RIVA_TARGET_SAMPLE_RATE_HZ
            ),
            "encoding": optional_params.get("encoding", RIVA_TARGET_ENCODING),
            "audio_channel_count": optional_params.get(
                "audio_channel_count", RIVA_TARGET_NUM_CHANNELS
            ),
            "enable_automatic_punctuation": optional_params.get(
                "enable_automatic_punctuation", True
            ),
            "enable_word_time_offsets": bool(
                optional_params.get("enable_word_time_offsets", False)
            ),
            "max_alternatives": optional_params.get("max_alternatives", 1),
            "model": optional_params.get("riva_model_name", ""),
            "verbatim_transcripts": optional_params.get("verbatim_transcripts", False),
            "profanity_filter": optional_params.get("profanity_filter", False),
        }

    def _build_endpointing_config_dict(
        self, optional_params: dict
    ) -> Optional[Dict[str, Any]]:
        """
        Translate an OpenAI-style ``chunking_strategy`` into Riva's
        ``EndpointingConfig`` shape, or pass through an explicit
        ``endpointing_config`` dict (which takes precedence).

        Returns ``None`` when neither is provided, when ``chunking_strategy``
        is ``"auto"`` or not a ``server_vad`` dict, or when the ``server_vad``
        dict carries none of the recognized keys.
        """
        explicit = optional_params.get("endpointing_config")
        if isinstance(explicit, dict):
            # Copy so later mutation cannot leak into the caller's dict.
            return dict(explicit)

        chunking = optional_params.get("chunking_strategy")
        if chunking in (None, "auto"):
            return None

        if isinstance(chunking, dict) and chunking.get("type") == "server_vad":
            config: Dict[str, Any] = {}
            if "threshold" in chunking:
                threshold = float(chunking["threshold"])
                config["start_threshold"] = threshold
                config["stop_threshold"] = threshold
            if "silence_duration_ms" in chunking:
                config["stop_history"] = int(chunking["silence_duration_ms"])
            if "prefix_padding_ms" in chunking:
                # NOTE(review): prefix padding is mapped onto
                # `stop_history_eou`; confirm this is the intended Riva field.
                config["stop_history_eou"] = int(chunking["prefix_padding_ms"])
            return config or None

        return None

    @staticmethod
    def _normalize_language_code(language: str) -> str:
        """
        OpenAI accepts bare ISO-639 codes like ``en``; Riva requires BCP-47
        like ``en-US``.

        - Non-string or blank input falls back to ``en-US``.
        - Surrounding whitespace is dropped and ``_`` is treated as ``-``.
        - Simple ``language-REGION`` tags get canonical casing
          (``en-us`` / ``EN_us`` -> ``en-US``); longer or numeric tags are
          passed through untouched.
        - Common bare codes are expanded; unknown bare codes pass through.
        """
        if not isinstance(language, str):
            return "en-US"
        tag = language.strip().replace("_", "-")
        if not tag:
            return "en-US"

        if "-" in tag:
            primary, _, region = tag.partition("-")
            if primary.isalpha() and region.isalpha() and len(region) == 2:
                return f"{primary.lower()}-{region.upper()}"
            return tag

        bare_to_bcp47 = {
            "en": "en-US",
            "es": "es-ES",
            "de": "de-DE",
            "fr": "fr-FR",
            "it": "it-IT",
            "pt": "pt-BR",
            "ja": "ja-JP",
            "ko": "ko-KR",
            "zh": "zh-CN",
            "ru": "ru-RU",
            "hi": "hi-IN",
            "ar": "ar-SA",
        }
        return bare_to_bcp47.get(tag.lower(), tag)

    @staticmethod
    def build_transcription_response(
        final_results: List[Dict[str, Any]],
        response_format: str,
        duration_seconds: Optional[float],
        timestamp_granularities: Optional[List[str]],
    ) -> TranscriptionResponse:
        """
        Aggregate a list of normalized "final result" dicts into a
        ``TranscriptionResponse`` shaped for the requested ``response_format``.

        Each entry in ``final_results`` is expected to look like::

            {
                "transcript": str,
                "words": [{"word": str, "start_time_ms": int, "end_time_ms": int}, ...],
            }

        Transcripts are concatenated in order without a separator and the
        result is stripped. For ``verbose_json`` the response additionally
        carries ``duration`` (when known) and, if ``"word"`` granularity was
        requested, a ``words`` list with start/end converted from
        milliseconds to seconds.
        """
        full_transcript = "".join(
            (item.get("transcript") or "") for item in final_results
        ).strip()

        response = TranscriptionResponse(text=full_transcript)
        response["task"] = "transcribe"

        if response_format == "verbose_json":
            words: List[Dict[str, Any]] = []
            if timestamp_granularities and "word" in timestamp_granularities:
                for item in final_results:
                    for word in item.get("words", []) or []:
                        words.append(
                            {
                                "word": word.get("word", ""),
                                "start": float(word.get("start_time_ms", 0)) / 1000.0,
                                "end": float(word.get("end_time_ms", 0)) / 1000.0,
                            }
                        )
            if words:
                response["words"] = words
            if duration_seconds is not None:
                response["duration"] = duration_seconds

        return response
_GRPC_STATUS_CODE_TO_HTTP: dict = {
    "OK": 200,
    "CANCELLED": 499,
    "UNKNOWN": 500,
    "INVALID_ARGUMENT": 400,
    "DEADLINE_EXCEEDED": 504,
    "NOT_FOUND": 404,
    "ALREADY_EXISTS": 409,
    "PERMISSION_DENIED": 403,
    "RESOURCE_EXHAUSTED": 429,
    "FAILED_PRECONDITION": 400,
    "ABORTED": 409,
    "OUT_OF_RANGE": 400,
    "UNIMPLEMENTED": 501,
    "INTERNAL": 500,
    "UNAVAILABLE": 503,
    "DATA_LOSS": 500,
    "UNAUTHENTICATED": 401,
}


def _extract_grpc_status_name(error: Any) -> Optional[str]:
    """
    Best-effort extraction of a gRPC StatusCode name from an arbitrary error.

    Only errors exposing a callable ``.code()`` whose result has a string
    ``.name`` (as ``grpc.RpcError`` instances do) yield a value. The error's
    string representation is *not* inspected, so plain exceptions return
    ``None``, as do errors whose ``.code()`` itself raises.
    """
    code_fn = getattr(error, "code", None)
    if not callable(code_fn):
        # e.g. exceptions that carry a plain integer ``code`` attribute.
        return None
    try:
        code = code_fn()
    except Exception:
        # Deliberately best-effort: a broken accessor must not mask the
        # original error that is being converted.
        return None
    name = getattr(code, "name", None)
    return name if isinstance(name, str) else None


def _extract_grpc_details(error: Any) -> Optional[str]:
    """
    Best-effort extraction of a human-readable detail string from a gRPC error.

    Returns the result of the error's callable ``.details()`` when it is a
    non-empty string; otherwise (no such method, it raises, or it returns
    something else) returns ``None``.
    """
    details_fn = getattr(error, "details", None)
    if not callable(details_fn):
        return None
    try:
        details = details_fn()
    except Exception:
        # Best-effort for the same reason as in status-name extraction.
        return None
    if isinstance(details, str) and details:
        return details
    return None


def grpc_error_to_litellm_exception(error: Exception) -> NvidiaRivaException:
    """
    Convert a gRPC error (or any exception raised from the Riva client) into
    a ``NvidiaRivaException`` with an appropriate HTTP-equivalent status code.

    The status code comes from ``_GRPC_STATUS_CODE_TO_HTTP`` and defaults to
    500 when the gRPC status is missing or unknown. A mapped code below 400
    (i.e. ``OK``) is also coerced to 500: this function only runs on a
    failure path, and an exception carrying a success code would be
    misclassified by status-code-based error handling.

    The message uses the gRPC details when available, else ``str(error)``,
    else the exception class name so the message is never empty.
    """
    status_name = _extract_grpc_status_name(error)
    http_status = _GRPC_STATUS_CODE_TO_HTTP.get(status_name or "", 500)
    if http_status < 400:
        http_status = 500

    detail = _extract_grpc_details(error) or str(error) or type(error).__name__
    if status_name:
        message = f"NVIDIA Riva gRPC error ({status_name}): {detail}"
    else:
        message = f"NVIDIA Riva error: {detail}"
    return NvidiaRivaException(status_code=http_status, message=message)
+ response = nvidia_riva_audio_transcriptions.audio_transcriptions( + model=model, + audio_file=file, + optional_params=optional_params, + litellm_params=litellm_params_dict, + model_response=model_response, + atranscription=atranscription, + timeout=timeout, + logging_obj=litellm_logging_obj, + api_base=api_base, + api_key=api_key, + provider_config=( + provider_config + if isinstance(provider_config, NvidiaRivaAudioTranscriptionConfig) + else None + ), + ) elif provider_config is not None: response = base_llm_http_handler.audio_transcriptions( model=model, diff --git a/litellm/types/utils.py b/litellm/types/utils.py index c05c46e0d4..00a7748309 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -3247,6 +3247,7 @@ class LlmProviders(str, Enum): A2A = "a2a" GIGACHAT = "gigachat" NVIDIA_NIM = "nvidia_nim" + NVIDIA_RIVA = "nvidia_riva" CEREBRAS = "cerebras" AI21_CHAT = "ai21_chat" VOLCENGINE = "volcengine" diff --git a/litellm/utils.py b/litellm/utils.py index 0c5b694bd7..019fbc2add 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8545,6 +8545,12 @@ class ProviderConfigManager: ) return MistralAudioTranscriptionConfig() + elif litellm.LlmProviders.NVIDIA_RIVA == provider: + from litellm.llms.nvidia_riva.audio_transcription.transformation import ( + NvidiaRivaAudioTranscriptionConfig, + ) + + return NvidiaRivaAudioTranscriptionConfig() return None @staticmethod diff --git a/provider_endpoints_support.json b/provider_endpoints_support.json index 3fc7cd4318..1d577213a1 100644 --- a/provider_endpoints_support.json +++ b/provider_endpoints_support.json @@ -1610,6 +1610,22 @@ "interactions": true } }, + "nvidia_riva": { + "display_name": "Nvidia Riva (`nvidia_riva`)", + "url": "https://docs.litellm.ai/docs/providers/nvidia_riva", + "endpoints": { + "chat_completions": false, + "messages": false, + "responses": false, + "embeddings": false, + "image_generations": false, + "audio_transcriptions": true, + "audio_speech": false, + "moderations": false, 
+ "batches": false, + "rerank": false + } + }, "oci": { "display_name": "OCI (`oci`)", "url": "https://docs.litellm.ai/docs/providers/oci", diff --git a/pyproject.toml b/pyproject.toml index 8445a5a60f..7ff388f184 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,14 @@ grpc = [ # Newest non-yanked release older than the 30-day cutoff. "grpcio==1.78.0", ] +stt-nvidia-riva = [ + # NVIDIA Riva STT provider (gRPC). These are imported lazily inside the + # provider handler so litellm core remains usable without them. + "nvidia-riva-client>=2.15.0", + "soundfile>=0.12.1", + "audioread>=3.0.1", + "numpy>=1.26.0", +] google = ["google-cloud-aiplatform==1.133.0"] proxy-runtime = [ # Historically bundled in the proxy Docker images via requirements.txt. diff --git a/tests/code_coverage_tests/liccheck.ini b/tests/code_coverage_tests/liccheck.ini index 2100aa1377..5a09403c57 100644 --- a/tests/code_coverage_tests/liccheck.ini +++ b/tests/code_coverage_tests/liccheck.ini @@ -126,6 +126,7 @@ sentry_sdk: >=2.21.0 # Unknown license cryptography: >=43.0.1 # Unknown license tzdata: >=2025.1 # Unknown license urllib3: >=2.0.0 # MIT license - https://github.com/urllib3/urllib3 +audioread: >=3.0.1 # MIT license manually verified - https://github.com/beetbox/audioread python-dotenv: >=1.0.0 # Unknown license tiktoken: >=0.8.0 # Unknown license click: >=8.1.7 # Unknown license diff --git a/tests/test_litellm/llms/nvidia_riva/__init__.py b/tests/test_litellm/llms/nvidia_riva/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_litellm/llms/nvidia_riva/audio_transcription/__init__.py b/tests/test_litellm/llms/nvidia_riva/audio_transcription/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_audio_utils.py b/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_audio_utils.py new file mode 100644 index 0000000000..0e355b91ca --- /dev/null +++ 
b/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_audio_utils.py @@ -0,0 +1,130 @@ +""" +Tests for the NVIDIA Riva audio resampling utility. + +The resampler turns arbitrary inbound audio (mp3/wav/m4a/...) into the wire +format Riva's gRPC ASR expects: 16 kHz mono LINEAR_PCM (int16 LE). +""" + +import io +import os +import sys +from types import SimpleNamespace +from unittest.mock import MagicMock + +import numpy as np +import pytest +import soundfile as sf + +sys.path.insert(0, os.path.abspath("../../../../..")) + +from litellm.llms.nvidia_riva.audio_transcription.audio_utils import ( + resample_to_riva_pcm, +) +from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException + + +def _wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes: + buf = io.BytesIO() + sf.write(buf, samples, sample_rate, format="WAV", subtype="PCM_16") + return buf.getvalue() + + +def test_resample_24khz_stereo_to_16khz_mono_int16(): + sample_rate_in = 24000 + duration_seconds = 1.0 + n = int(sample_rate_in * duration_seconds) + t = np.linspace(0, duration_seconds, n, endpoint=False) + left = 0.5 * np.sin(2 * np.pi * 440.0 * t) + right = 0.5 * np.sin(2 * np.pi * 660.0 * t) + stereo = np.stack([left, right], axis=1).astype(np.float32) + + wav_in = _wav_bytes(stereo, sample_rate_in) + + resampled = resample_to_riva_pcm(wav_in) + + assert resampled.sample_rate_hz == 16000 + assert resampled.num_channels == 1 + # int16 = 2 bytes per sample + expected_samples = int(round(duration_seconds * 16000)) + assert len(resampled.pcm_bytes) == expected_samples * 2 + assert resampled.duration_seconds == pytest.approx(duration_seconds, abs=0.005) + + +def test_resample_16khz_mono_passes_through_int16_bytes_match_length(): + sample_rate = 16000 + n = sample_rate + samples = (0.1 * np.sin(np.linspace(0, 2 * np.pi * 200, n))).astype(np.float32) + wav_in = _wav_bytes(samples, sample_rate) + + resampled = resample_to_riva_pcm(wav_in) + + assert resampled.sample_rate_hz == 16000 + assert 
def test_resample_preserves_int16_clip_range():
    """Float samples outside [-1.0, 1.0] must still decode as in-range int16 PCM."""
    sample_rate = 16000
    # 2.0 / -2.0 are deliberately beyond full scale to exercise clipping.
    samples = np.array([2.0, -2.0, 0.0, 1.0], dtype=np.float32)
    wav_in = _wav_bytes(samples, sample_rate)

    resampled = resample_to_riva_pcm(wav_in)

    # "<i2" = little-endian int16, the PCM wire format this module targets.
    decoded = np.frombuffer(resampled.pcm_bytes, dtype="<i2")
    assert decoded.max() <= 32767
    assert decoded.min() >= -32767
+ """ + payload = b"\xff\xfbfake-mp3-bytes-not-actually-decodable" + seen_paths = [] + + class FakeAudioSource: + samplerate = 22050 + channels = 1 + + def __iter__(self): + yield np.array([0, 0, 0, 0], dtype=np.int16).tobytes() + + def __enter__(self): + return self + + def __exit__(self, *args): + return False + + def fake_audio_open(path): + assert isinstance(path, str), "audioread requires a filesystem path" + seen_paths.append(path) + with open(path, "rb") as fh: + assert fh.read() == payload + return FakeAudioSource() + + fake_audioread = SimpleNamespace(audio_open=fake_audio_open) + monkeypatch.setitem(sys.modules, "audioread", fake_audioread) + + fake_sf = MagicMock() + fake_sf.read.side_effect = RuntimeError("libsndfile cannot decode mp3") + monkeypatch.setitem(sys.modules, "soundfile", fake_sf) + + resampled = resample_to_riva_pcm(payload) + assert resampled.sample_rate_hz == 16000 + assert seen_paths and seen_paths[0].endswith(".audio") + # Tempfile must be cleaned up after decode. + assert not os.path.exists(seen_paths[0]) diff --git a/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_handler.py b/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_handler.py new file mode 100644 index 0000000000..341a0e77ce --- /dev/null +++ b/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_handler.py @@ -0,0 +1,419 @@ +""" +End-to-end-ish tests for NvidiaRivaAudioTranscription. + +We mock ``riva.client`` so the test does not need the real gRPC SDK or a +running Riva server. The mock also lets us assert how Auth metadata is +constructed (NVCF vs self-hosted) and how the streaming generator output +is aggregated. 
+""" + +import asyncio +import io +import os +import sys +from types import SimpleNamespace +from unittest.mock import MagicMock + +import numpy as np +import pytest +import soundfile as sf + +sys.path.insert(0, os.path.abspath("../../../../..")) + +from litellm.llms.nvidia_riva.audio_transcription import handler as handler_mod +from litellm.llms.nvidia_riva.audio_transcription.handler import ( + NvidiaRivaAudioTranscription, +) +from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException +from litellm.types.utils import TranscriptionResponse + + +def _make_wav_bytes(seconds: float = 1.0, sample_rate: int = 16000) -> bytes: + n = int(sample_rate * seconds) + samples = (0.05 * np.sin(np.linspace(0, 2 * np.pi * 220 * seconds, n))).astype( + np.float32 + ) + buf = io.BytesIO() + sf.write(buf, samples, sample_rate, format="WAV", subtype="PCM_16") + return buf.getvalue() + + +def _fake_word(word: str, start_ms: int, end_ms: int): + return SimpleNamespace(word=word, start_time=start_ms, end_time=end_ms) + + +def _fake_alternative(transcript: str, words=None): + return SimpleNamespace(transcript=transcript, words=words or []) + + +def _fake_result(is_final: bool, alternatives): + return SimpleNamespace(is_final=is_final, alternatives=alternatives) + + +def _fake_response(results): + return SimpleNamespace(results=results) + + +@pytest.fixture +def mock_riva(monkeypatch): + """ + Stand-ins for the bits of ``riva.client`` the handler touches: + - ``Auth`` (constructor) + - ``ASRService`` with ``streaming_response_generator`` + - ``RecognitionConfig``, ``StreamingRecognitionConfig``, ``EndpointingConfig`` + - ``AudioEncoding`` namespace with ``LINEAR_PCM`` + """ + auth_calls = {} + + class FakeAuth: + def __init__(self, *args, **kwargs): + # Support both keyword and positional Auth constructors. 
+ if kwargs: + auth_calls["uri"] = kwargs.get("uri") + auth_calls["use_ssl"] = kwargs.get("use_ssl") + auth_calls["metadata_args"] = kwargs.get("metadata_args") + else: + # positional: (None, use_ssl, uri, metadata) + auth_calls["use_ssl"] = args[1] if len(args) > 1 else None + auth_calls["uri"] = args[2] if len(args) > 2 else None + auth_calls["metadata_args"] = args[3] if len(args) > 3 else None + + class FakeRecognitionConfig: + def __init__(self, **kwargs): + self._kwargs = kwargs + self.endpointing_config = SimpleNamespace(CopyFrom=lambda _: None) + + class FakeStreamingRecognitionConfig: + def __init__(self, config, interim_results): + self.config = config + self.interim_results = interim_results + + class FakeEndpointingConfig: + def __init__(self, **kwargs): + self._kwargs = kwargs + + class FakeAudioEncoding: + LINEAR_PCM = "LINEAR_PCM" + + streaming_responses_holder = {"value": []} + + class FakeASRService: + def __init__(self, auth): + self.auth = auth + + def streaming_response_generator(self, audio_chunks, streaming_config): + # Drain audio_chunks generator so we exercise the chunking path. + list(audio_chunks) + yield from streaming_responses_holder["value"] + + fake_riva_client = SimpleNamespace( + Auth=FakeAuth, + ASRService=FakeASRService, + RecognitionConfig=FakeRecognitionConfig, + StreamingRecognitionConfig=FakeStreamingRecognitionConfig, + EndpointingConfig=FakeEndpointingConfig, + AudioEncoding=FakeAudioEncoding, + ) + + def fake_import_riva(): + return fake_riva_client, fake_riva_client + + monkeypatch.setattr(handler_mod, "_import_riva", fake_import_riva) + + return SimpleNamespace( + auth_calls=auth_calls, + responses=streaming_responses_holder, + client=fake_riva_client, + ) + + +@pytest.fixture +def logging_obj(): + return MagicMock() + + +def test_sync_path_aggregates_only_final_results(mock_riva, logging_obj): + mock_riva.responses["value"] = [ + # Empty heartbeat chunk: ignore. 
+ _fake_response(results=[]), + # Interim chunk (not final): ignore. + _fake_response( + results=[ + _fake_result( + is_final=False, alternatives=[_fake_alternative("partial...")] + ) + ] + ), + # Two final chunks aggregated. + _fake_response( + results=[ + _fake_result( + is_final=True, + alternatives=[ + _fake_alternative( + "Hello,", + words=[_fake_word("Hello,", 0, 320)], + ) + ], + ) + ] + ), + _fake_response( + results=[ + _fake_result( + is_final=True, + alternatives=[ + _fake_alternative( + " world.", + words=[_fake_word("world.", 480, 870)], + ) + ], + ) + ] + ), + ] + + impl = NvidiaRivaAudioTranscription() + response: TranscriptionResponse = impl.audio_transcriptions( + model="nvidia/parakeet-ctc-1_1b-asr", + audio_file=_make_wav_bytes(), + optional_params={ + "language_code": "en-US", + "enable_word_time_offsets": True, + "response_format": "verbose_json", + "timestamp_granularities": ["word"], + }, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + api_key="nvapi-xxx", + api_base="grpc.nvcf.nvidia.com:443", + ) + + assert response.text == "Hello, world." + # duration is propagated from the resampler. + assert response._hidden_params["audio_transcription_duration"] == pytest.approx( + 1.0, abs=0.05 + ) + # word timestamps converted from ms to seconds. 
+ words = response["words"] + assert words[0]["start"] == pytest.approx(0.0) + assert words[1]["end"] == pytest.approx(0.87) + assert ( + logging_obj.pre_call.call_args.kwargs["additional_args"]["atranscription"] + is False + ) + + +def test_auth_nvcf_defaults_use_ssl_and_attaches_function_id(mock_riva, logging_obj): + mock_riva.responses["value"] = [ + _fake_response( + results=[ + _fake_result( + is_final=True, + alternatives=[_fake_alternative("ok")], + ) + ] + ) + ] + impl = NvidiaRivaAudioTranscription() + impl.audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={ + "nvcf_function_id": "abc-123", + "language_code": "en-US", + }, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + api_key="nvapi-xxx", + api_base="grpc.nvcf.nvidia.com:443", + ) + + assert mock_riva.auth_calls["uri"] == "grpc.nvcf.nvidia.com:443" + assert mock_riva.auth_calls["use_ssl"] is True + metadata = dict(mock_riva.auth_calls["metadata_args"]) + assert metadata["function-id"] == "abc-123" + assert metadata["authorization"] == "Bearer nvapi-xxx" + + +def test_auth_self_hosted_defaults_no_ssl_and_no_function_id(mock_riva, logging_obj): + mock_riva.responses["value"] = [ + _fake_response( + results=[ + _fake_result(is_final=True, alternatives=[_fake_alternative("ok")]) + ] + ) + ] + impl = NvidiaRivaAudioTranscription() + impl.audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={"language_code": "en-US"}, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + api_key=None, + api_base="localhost:50051", + ) + + assert mock_riva.auth_calls["uri"] == "localhost:50051" + assert mock_riva.auth_calls["use_ssl"] is False + metadata = dict(mock_riva.auth_calls["metadata_args"]) + # No function-id, no authorization metadata. 
+ assert "function-id" not in metadata + assert "authorization" not in metadata + + +def test_explicit_use_ssl_override_wins(mock_riva, logging_obj): + """ + Self-hosted Riva behind an ingress with TLS termination is a real + deployment topology. ``use_ssl=True`` must be honored even without an + NVCF function id. + """ + mock_riva.responses["value"] = [ + _fake_response( + results=[ + _fake_result(is_final=True, alternatives=[_fake_alternative("ok")]) + ] + ) + ] + impl = NvidiaRivaAudioTranscription() + impl.audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={"use_ssl": True, "language_code": "en-US"}, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + api_key=None, + api_base="riva.internal.company.com:443", + ) + + assert mock_riva.auth_calls["use_ssl"] is True + + +def test_missing_api_base_raises_clear_error(mock_riva, logging_obj): + impl = NvidiaRivaAudioTranscription() + with pytest.raises(NvidiaRivaException) as excinfo: + impl.audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={}, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + api_key=None, + api_base=None, + ) + assert "api_base" in excinfo.value.message + + +def test_async_path_uses_to_thread(mock_riva, logging_obj): + mock_riva.responses["value"] = [ + _fake_response( + results=[ + _fake_result( + is_final=True, alternatives=[_fake_alternative("async ok")] + ) + ] + ) + ] + impl = NvidiaRivaAudioTranscription() + response = asyncio.run( + impl.async_audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={"language_code": "en-US"}, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + api_key=None, + api_base="localhost:50051", + ) + ) + assert response.text == "async ok" + assert ( + 
logging_obj.pre_call.call_args.kwargs["additional_args"]["atranscription"] + is True + ) + + +def test_timeout_is_forwarded_to_streaming_generator_when_supported( + mock_riva, logging_obj +): + """ + Without a deadline the gRPC stream can block forever on a stalled Riva + server. The handler must forward the call-level ``timeout`` to + ``streaming_response_generator`` whenever the installed riva-client + accepts a ``timeout`` kwarg. + """ + captured_kwargs = {} + + def streaming_with_timeout(self, audio_chunks, streaming_config, timeout=None): + captured_kwargs["timeout"] = timeout + list(audio_chunks) + yield from [ + _fake_response( + results=[ + _fake_result(is_final=True, alternatives=[_fake_alternative("ok")]) + ] + ) + ] + + mock_riva.client.ASRService.streaming_response_generator = streaming_with_timeout + + impl = NvidiaRivaAudioTranscription() + impl.audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={"language_code": "en-US"}, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=12.5, + logging_obj=logging_obj, + api_key=None, + api_base="localhost:50051", + ) + assert captured_kwargs["timeout"] == pytest.approx(12.5) + + +def test_grpc_error_is_wrapped_as_nvidia_riva_exception(mock_riva, logging_obj): + class FakeGrpcError(Exception): + def code(self): + return SimpleNamespace(name="UNAUTHENTICATED") + + def details(self): + return "bad token" + + def raising_streaming_response_generator(self, audio_chunks, streaming_config): + list(audio_chunks) + raise FakeGrpcError("rpc fail") + + mock_riva.client.ASRService.streaming_response_generator = ( + raising_streaming_response_generator + ) + + impl = NvidiaRivaAudioTranscription() + with pytest.raises(NvidiaRivaException) as excinfo: + impl.audio_transcriptions( + model="m", + audio_file=_make_wav_bytes(), + optional_params={"language_code": "en-US"}, + litellm_params={}, + model_response=TranscriptionResponse(), + timeout=60, + logging_obj=logging_obj, + 
api_key="nvapi-xxx", + api_base="grpc.nvcf.nvidia.com:443", + ) + + assert excinfo.value.status_code == 401 + assert "UNAUTHENTICATED" in excinfo.value.message diff --git a/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_transformation.py b/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_transformation.py new file mode 100644 index 0000000000..c4cca8490b --- /dev/null +++ b/tests/test_litellm/llms/nvidia_riva/audio_transcription/test_transformation.py @@ -0,0 +1,275 @@ +""" +Unit tests for NvidiaRivaAudioTranscriptionConfig. + +These tests do not require ``nvidia-riva-client`` or any audio libs to be +installed; the transformation layer is intentionally pure-Python on dicts. +""" + +import os +import sys + +import pytest + +sys.path.insert(0, os.path.abspath("../../../../..")) + +from litellm.llms.base_llm.audio_transcription.transformation import ( + AudioTranscriptionRequestData, +) +from litellm.llms.nvidia_riva.audio_transcription.transformation import ( + NvidiaRivaAudioTranscriptionConfig, +) +from litellm.llms.nvidia_riva.common_utils import NvidiaRivaException + + +@pytest.fixture +def cfg(): + return NvidiaRivaAudioTranscriptionConfig() + + +def test_supported_openai_params(cfg): + params = cfg.get_supported_openai_params(model="nvidia/parakeet-ctc-1_1b-asr") + assert "language" in params + assert "response_format" in params + assert "timestamp_granularities" in params + + +def test_map_language_normalizes_bare_codes(cfg): + out = cfg.map_openai_params( + non_default_params={"language": "en"}, + optional_params={}, + model="m", + drop_params=False, + ) + assert out["language_code"] == "en-US" + + +def test_map_language_passes_through_bcp47(cfg): + out = cfg.map_openai_params( + non_default_params={"language": "de-DE"}, + optional_params={}, + model="m", + drop_params=False, + ) + assert out["language_code"] == "de-DE" + + +def test_map_language_es_defaults_to_castilian_spain(cfg): + """ + Bare ``es`` is ISO-639 Spanish; in BCP-47 
it conventionally resolves to + es-ES (Castilian / Spain), not es-US. Routing every Spanish caller to a + US-tuned Riva model would silently degrade accuracy. + """ + out = cfg.map_openai_params( + non_default_params={"language": "es"}, + optional_params={}, + model="m", + drop_params=False, + ) + assert out["language_code"] == "es-ES" + + +def test_map_timestamp_granularities_word_enables_word_offsets(cfg): + out = cfg.map_openai_params( + non_default_params={"timestamp_granularities": ["word"]}, + optional_params={}, + model="m", + drop_params=False, + ) + assert out["enable_word_time_offsets"] is True + assert out["timestamp_granularities"] == ["word"] + + +def test_map_timestamp_granularities_segment_only_does_not_enable_word_offsets(cfg): + out = cfg.map_openai_params( + non_default_params={"timestamp_granularities": ["segment"]}, + optional_params={}, + model="m", + drop_params=False, + ) + assert "enable_word_time_offsets" not in out + + +def test_transform_request_builds_recognition_config(cfg): + result = cfg.transform_audio_transcription_request( + model="nvidia/parakeet-ctc-1_1b-asr", + audio_file=b"fake-audio", + optional_params={ + "language_code": "en-US", + "enable_word_time_offsets": True, + "nvcf_function_id": "abc-123", + "use_ssl": True, + "riva_model_name": "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer", + }, + litellm_params={ + "api_base": "grpc.nvcf.nvidia.com:443", + "api_key": "nvapi-xxx", + }, + ) + + assert isinstance(result, AudioTranscriptionRequestData) + payload = result.data + assert payload["recognition_config"]["language_code"] == "en-US" + assert payload["recognition_config"]["sample_rate_hertz"] == 16000 + assert payload["recognition_config"]["audio_channel_count"] == 1 + assert payload["recognition_config"]["encoding"] == "LINEAR_PCM" + assert payload["recognition_config"]["enable_word_time_offsets"] is True + assert ( + payload["recognition_config"]["model"] + == 
"parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer" + ) + assert "audio_file" not in payload + assert "auth" not in payload + + +def test_transform_request_default_riva_model_is_empty_for_auto_select(cfg): + """ + Riva auto-selects the deployed model when ``model`` is empty. This is + the right default because internal NVIDIA deployment names change + across versions/regions. + """ + result = cfg.transform_audio_transcription_request( + model="nvidia/parakeet-ctc-1_1b-asr", + audio_file=b"fake-audio", + optional_params={"language_code": "en-US"}, + litellm_params={"api_base": "grpc.nvcf.nvidia.com:443"}, + ) + assert result.data["recognition_config"]["model"] == "" + + +def test_chunking_strategy_server_vad_maps_to_endpointing_config(cfg): + result = cfg.transform_audio_transcription_request( + model="m", + audio_file=b"x", + optional_params={ + "chunking_strategy": { + "type": "server_vad", + "threshold": 0.5, + "silence_duration_ms": 700, + "prefix_padding_ms": 250, + } + }, + litellm_params={"api_base": "localhost:50051"}, + ) + ep = result.data["recognition_config"].get("endpointing_config") + assert ep is not None + assert ep["start_threshold"] == 0.5 + assert ep["stop_threshold"] == 0.5 + assert ep["stop_history"] == 700 + assert ep["stop_history_eou"] == 250 + + +def test_chunking_strategy_auto_leaves_endpointing_config_unset(cfg): + result = cfg.transform_audio_transcription_request( + model="m", + audio_file=b"x", + optional_params={"chunking_strategy": "auto"}, + litellm_params={"api_base": "localhost:50051"}, + ) + assert "endpointing_config" not in result.data["recognition_config"] + + +def test_explicit_endpointing_config_pass_through(cfg): + result = cfg.transform_audio_transcription_request( + model="m", + audio_file=b"x", + optional_params={ + "endpointing_config": {"stop_history": 1200, "start_threshold": 0.3} + }, + litellm_params={"api_base": "localhost:50051"}, + ) + ep = result.data["recognition_config"]["endpointing_config"] + assert ep 
== {"stop_history": 1200, "start_threshold": 0.3} + + +def test_build_transcription_response_text_format(): + final_results = [ + {"transcript": "Hello,", "words": []}, + {"transcript": " this is parakeet.", "words": []}, + ] + response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response( + final_results=final_results, + response_format="json", + duration_seconds=2.4, + timestamp_granularities=None, + ) + assert response.text == "Hello, this is parakeet." + assert response["task"] == "transcribe" + # duration is only attached for verbose_json + assert "duration" not in response + + +def test_build_transcription_response_skips_empty_chunks(): + final_results = [ + {"transcript": "", "words": []}, + {"transcript": "actual content", "words": []}, + {"transcript": "", "words": []}, + ] + response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response( + final_results=final_results, + response_format="json", + duration_seconds=1.0, + timestamp_granularities=None, + ) + assert response.text == "actual content" + + +def test_build_transcription_response_verbose_json_with_words(): + final_results = [ + { + "transcript": "Hello,", + "words": [ + {"word": "Hello,", "start_time_ms": 0, "end_time_ms": 320}, + ], + }, + { + "transcript": " world.", + "words": [ + {"word": "world.", "start_time_ms": 480, "end_time_ms": 870}, + ], + }, + ] + response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response( + final_results=final_results, + response_format="verbose_json", + duration_seconds=2.475, + timestamp_granularities=["word"], + ) + + assert response.text == "Hello, world." + assert response["duration"] == 2.475 + words = response["words"] + assert words[0]["word"] == "Hello," + # Riva returns ms; OpenAI exposes seconds. 
+ assert words[0]["start"] == pytest.approx(0.0) + assert words[0]["end"] == pytest.approx(0.32) + assert words[1]["start"] == pytest.approx(0.48) + assert words[1]["end"] == pytest.approx(0.87) + + +def test_build_transcription_response_verbose_json_without_word_granularity_omits_words(): + final_results = [ + { + "transcript": "Hi.", + "words": [ + {"word": "Hi.", "start_time_ms": 0, "end_time_ms": 200}, + ], + } + ] + response = NvidiaRivaAudioTranscriptionConfig.build_transcription_response( + final_results=final_results, + response_format="verbose_json", + duration_seconds=0.2, + timestamp_granularities=["segment"], + ) + assert "words" not in response + + +def test_transform_response_not_used_raises_clear_error(cfg): + with pytest.raises(NotImplementedError): + cfg.transform_audio_transcription_response(raw_response=None) # type: ignore[arg-type] + + +def test_get_error_class_returns_nvidia_riva_exception(cfg): + err = cfg.get_error_class(error_message="bad", status_code=401, headers={}) + assert isinstance(err, NvidiaRivaException) + assert err.status_code == 401 diff --git a/uv.lock b/uv.lock index 8168b516f2..eed58c76bc 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. 
+exclude-newer = "2026-05-02T11:18:44.200141Z" exclude-newer-span = "P3D" [manifest] @@ -339,6 +339,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" }, ] +[[package]] +name = "audioop-lts" +version = "0.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/53/946db57842a50b2da2e0c1e34bd37f36f5aadba1a929a3971c5d7841dbca/audioop_lts-0.2.2.tar.gz", hash = "sha256:64d0c62d88e67b98a1a5e71987b7aa7b5bcffc7dcee65b635823dbdd0a8dbbd0", size = 30686, upload-time = "2025-08-05T16:43:17.409Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/d4/94d277ca941de5a507b07f0b592f199c22454eeaec8f008a286b3fbbacd6/audioop_lts-0.2.2-cp313-abi3-macosx_10_13_universal2.whl", hash = "sha256:fd3d4602dc64914d462924a08c1a9816435a2155d74f325853c1f1ac3b2d9800", size = 46523, upload-time = "2025-08-05T16:42:20.836Z" }, + { url = "https://files.pythonhosted.org/packages/f8/5a/656d1c2da4b555920ce4177167bfeb8623d98765594af59702c8873f60ec/audioop_lts-0.2.2-cp313-abi3-macosx_10_13_x86_64.whl", hash = "sha256:550c114a8df0aafe9a05442a1162dfc8fec37e9af1d625ae6060fed6e756f303", size = 27455, upload-time = "2025-08-05T16:42:22.283Z" }, + { url = "https://files.pythonhosted.org/packages/1b/83/ea581e364ce7b0d41456fb79d6ee0ad482beda61faf0cab20cbd4c63a541/audioop_lts-0.2.2-cp313-abi3-macosx_11_0_arm64.whl", hash = "sha256:9a13dc409f2564de15dd68be65b462ba0dde01b19663720c68c1140c782d1d75", size = 26997, upload-time = "2025-08-05T16:42:23.849Z" }, + { url = "https://files.pythonhosted.org/packages/b8/3b/e8964210b5e216e5041593b7d33e97ee65967f17c282e8510d19c666dab4/audioop_lts-0.2.2-cp313-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = 
"sha256:51c916108c56aa6e426ce611946f901badac950ee2ddaf302b7ed35d9958970d", size = 85844, upload-time = "2025-08-05T16:42:25.208Z" }, + { url = "https://files.pythonhosted.org/packages/c7/2e/0a1c52faf10d51def20531a59ce4c706cb7952323b11709e10de324d6493/audioop_lts-0.2.2-cp313-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:47eba38322370347b1c47024defbd36374a211e8dd5b0dcbce7b34fdb6f8847b", size = 85056, upload-time = "2025-08-05T16:42:26.559Z" }, + { url = "https://files.pythonhosted.org/packages/75/e8/cd95eef479656cb75ab05dfece8c1f8c395d17a7c651d88f8e6e291a63ab/audioop_lts-0.2.2-cp313-abi3-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba7c3a7e5f23e215cb271516197030c32aef2e754252c4c70a50aaff7031a2c8", size = 93892, upload-time = "2025-08-05T16:42:27.902Z" }, + { url = "https://files.pythonhosted.org/packages/5c/1e/a0c42570b74f83efa5cca34905b3eef03f7ab09fe5637015df538a7f3345/audioop_lts-0.2.2-cp313-abi3-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:def246fe9e180626731b26e89816e79aae2276f825420a07b4a647abaa84becc", size = 96660, upload-time = "2025-08-05T16:42:28.9Z" }, + { url = "https://files.pythonhosted.org/packages/50/d5/8a0ae607ca07dbb34027bac8db805498ee7bfecc05fd2c148cc1ed7646e7/audioop_lts-0.2.2-cp313-abi3-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e160bf9df356d841bb6c180eeeea1834085464626dc1b68fa4e1d59070affdc3", size = 79143, upload-time = "2025-08-05T16:42:29.929Z" }, + { url = "https://files.pythonhosted.org/packages/12/17/0d28c46179e7910bfb0bb62760ccb33edb5de973052cb2230b662c14ca2e/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4b4cd51a57b698b2d06cb9993b7ac8dfe89a3b2878e96bc7948e9f19ff51dba6", size = 84313, upload-time = "2025-08-05T16:42:30.949Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/ba/bd5d3806641564f2024e97ca98ea8f8811d4e01d9b9f9831474bc9e14f9e/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_ppc64le.whl", hash = "sha256:4a53aa7c16a60a6857e6b0b165261436396ef7293f8b5c9c828a3a203147ed4a", size = 93044, upload-time = "2025-08-05T16:42:31.959Z" }, + { url = "https://files.pythonhosted.org/packages/f9/5e/435ce8d5642f1f7679540d1e73c1c42d933331c0976eb397d1717d7f01a3/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_riscv64.whl", hash = "sha256:3fc38008969796f0f689f1453722a0f463da1b8a6fbee11987830bfbb664f623", size = 78766, upload-time = "2025-08-05T16:42:33.302Z" }, + { url = "https://files.pythonhosted.org/packages/ae/3b/b909e76b606cbfd53875693ec8c156e93e15a1366a012f0b7e4fb52d3c34/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_s390x.whl", hash = "sha256:15ab25dd3e620790f40e9ead897f91e79c0d3ce65fe193c8ed6c26cffdd24be7", size = 87640, upload-time = "2025-08-05T16:42:34.854Z" }, + { url = "https://files.pythonhosted.org/packages/30/e7/8f1603b4572d79b775f2140d7952f200f5e6c62904585d08a01f0a70393a/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:03f061a1915538fd96272bac9551841859dbb2e3bf73ebe4a23ef043766f5449", size = 86052, upload-time = "2025-08-05T16:42:35.839Z" }, + { url = "https://files.pythonhosted.org/packages/b5/96/c37846df657ccdda62ba1ae2b6534fa90e2e1b1742ca8dcf8ebd38c53801/audioop_lts-0.2.2-cp313-abi3-win32.whl", hash = "sha256:3bcddaaf6cc5935a300a8387c99f7a7fbbe212a11568ec6cf6e4bc458c048636", size = 26185, upload-time = "2025-08-05T16:42:37.04Z" }, + { url = "https://files.pythonhosted.org/packages/34/a5/9d78fdb5b844a83da8a71226c7bdae7cc638861085fff7a1d707cb4823fa/audioop_lts-0.2.2-cp313-abi3-win_amd64.whl", hash = "sha256:a2c2a947fae7d1062ef08c4e369e0ba2086049a5e598fda41122535557012e9e", size = 30503, upload-time = "2025-08-05T16:42:38.427Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/25/20d8fde083123e90c61b51afb547bb0ea7e77bab50d98c0ab243d02a0e43/audioop_lts-0.2.2-cp313-abi3-win_arm64.whl", hash = "sha256:5f93a5db13927a37d2d09637ccca4b2b6b48c19cd9eda7b17a2e9f77edee6a6f", size = 24173, upload-time = "2025-08-05T16:42:39.704Z" }, + { url = "https://files.pythonhosted.org/packages/58/a7/0a764f77b5c4ac58dc13c01a580f5d32ae8c74c92020b961556a43e26d02/audioop_lts-0.2.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:73f80bf4cd5d2ca7814da30a120de1f9408ee0619cc75da87d0641273d202a09", size = 47096, upload-time = "2025-08-05T16:42:40.684Z" }, + { url = "https://files.pythonhosted.org/packages/aa/ed/ebebedde1a18848b085ad0fa54b66ceb95f1f94a3fc04f1cd1b5ccb0ed42/audioop_lts-0.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:106753a83a25ee4d6f473f2be6b0966fc1c9af7e0017192f5531a3e7463dce58", size = 27748, upload-time = "2025-08-05T16:42:41.992Z" }, + { url = "https://files.pythonhosted.org/packages/cb/6e/11ca8c21af79f15dbb1c7f8017952ee8c810c438ce4e2b25638dfef2b02c/audioop_lts-0.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fbdd522624141e40948ab3e8cdae6e04c748d78710e9f0f8d4dae2750831de19", size = 27329, upload-time = "2025-08-05T16:42:42.987Z" }, + { url = "https://files.pythonhosted.org/packages/84/52/0022f93d56d85eec5da6b9da6a958a1ef09e80c39f2cc0a590c6af81dcbb/audioop_lts-0.2.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:143fad0311e8209ece30a8dbddab3b65ab419cbe8c0dde6e8828da25999be911", size = 92407, upload-time = "2025-08-05T16:42:44.336Z" }, + { url = "https://files.pythonhosted.org/packages/87/1d/48a889855e67be8718adbc7a01f3c01d5743c325453a5e81cf3717664aad/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfbbc74ec68a0fd08cfec1f4b5e8cca3d3cd7de5501b01c4b5d209995033cde9", size = 91811, upload-time = "2025-08-05T16:42:45.325Z" }, + { url = 
"https://files.pythonhosted.org/packages/98/a6/94b7213190e8077547ffae75e13ed05edc488653c85aa5c41472c297d295/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cfcac6aa6f42397471e4943e0feb2244549db5c5d01efcd02725b96af417f3fe", size = 100470, upload-time = "2025-08-05T16:42:46.468Z" }, + { url = "https://files.pythonhosted.org/packages/e9/e9/78450d7cb921ede0cfc33426d3a8023a3bda755883c95c868ee36db8d48d/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:752d76472d9804ac60f0078c79cdae8b956f293177acd2316cd1e15149aee132", size = 103878, upload-time = "2025-08-05T16:42:47.576Z" }, + { url = "https://files.pythonhosted.org/packages/4f/e2/cd5439aad4f3e34ae1ee852025dc6aa8f67a82b97641e390bf7bd9891d3e/audioop_lts-0.2.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:83c381767e2cc10e93e40281a04852facc4cd9334550e0f392f72d1c0a9c5753", size = 84867, upload-time = "2025-08-05T16:42:49.003Z" }, + { url = "https://files.pythonhosted.org/packages/68/4b/9d853e9076c43ebba0d411e8d2aa19061083349ac695a7d082540bad64d0/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c0022283e9556e0f3643b7c3c03f05063ca72b3063291834cca43234f20c60bb", size = 90001, upload-time = "2025-08-05T16:42:50.038Z" }, + { url = "https://files.pythonhosted.org/packages/58/26/4bae7f9d2f116ed5593989d0e521d679b0d583973d203384679323d8fa85/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:a2d4f1513d63c795e82948e1305f31a6d530626e5f9f2605408b300ae6095093", size = 99046, upload-time = "2025-08-05T16:42:51.111Z" }, + { url = "https://files.pythonhosted.org/packages/b2/67/a9f4fb3e250dda9e9046f8866e9fa7d52664f8985e445c6b4ad6dfb55641/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:c9c8e68d8b4a56fda8c025e538e639f8c5953f5073886b596c93ec9b620055e7", size = 84788, upload-time = 
"2025-08-05T16:42:52.198Z" }, + { url = "https://files.pythonhosted.org/packages/70/f7/3de86562db0121956148bcb0fe5b506615e3bcf6e63c4357a612b910765a/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:96f19de485a2925314f5020e85911fb447ff5fbef56e8c7c6927851b95533a1c", size = 94472, upload-time = "2025-08-05T16:42:53.59Z" }, + { url = "https://files.pythonhosted.org/packages/f1/32/fd772bf9078ae1001207d2df1eef3da05bea611a87dd0e8217989b2848fa/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e541c3ef484852ef36545f66209444c48b28661e864ccadb29daddb6a4b8e5f5", size = 92279, upload-time = "2025-08-05T16:42:54.632Z" }, + { url = "https://files.pythonhosted.org/packages/4f/41/affea7181592ab0ab560044632571a38edaf9130b84928177823fbf3176a/audioop_lts-0.2.2-cp313-cp313t-win32.whl", hash = "sha256:d5e73fa573e273e4f2e5ff96f9043858a5e9311e94ffefd88a3186a910c70917", size = 26568, upload-time = "2025-08-05T16:42:55.627Z" }, + { url = "https://files.pythonhosted.org/packages/28/2b/0372842877016641db8fc54d5c88596b542eec2f8f6c20a36fb6612bf9ee/audioop_lts-0.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9191d68659eda01e448188f60364c7763a7ca6653ed3f87ebb165822153a8547", size = 30942, upload-time = "2025-08-05T16:42:56.674Z" }, + { url = "https://files.pythonhosted.org/packages/ee/ca/baf2b9cc7e96c179bb4a54f30fcd83e6ecb340031bde68f486403f943768/audioop_lts-0.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:c174e322bb5783c099aaf87faeb240c8d210686b04bd61dfd05a8e5a83d88969", size = 24603, upload-time = "2025-08-05T16:42:57.571Z" }, +] + +[[package]] +name = "audioread" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/4a/874ecf9b472f998130c2b5e145dcdb9f6131e84786111489103b66772143/audioread-3.1.0.tar.gz", hash 
= "sha256:1c4ab2f2972764c896a8ac61ac53e261c8d29f0c6ccd652f84e18f08a4cab190", size = 20082, upload-time = "2025-10-26T19:44:13.484Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/16/fbe8e1e185a45042f7cd3a282def5bb8d95bb69ab9e9ef6a5368aa17e426/audioread-3.1.0-py3-none-any.whl", hash = "sha256:b30d1df6c5d3de5dcef0fb0e256f6ea17bdcf5f979408df0297d8a408e2971b4", size = 23143, upload-time = "2025-10-26T19:44:12.016Z" }, +] + [[package]] name = "aurelio-sdk" version = "0.0.19" @@ -2176,6 +2229,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/58/317b0134129b556a93a3b0afe00ee675b5657f0155509e22fcb853bafe2d/grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3", size = 14424, upload-time = "2025-06-28T04:23:42.136Z" }, ] +[[package]] +name = "grpcio-tools" +version = "1.71.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "protobuf" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/9a/edfefb47f11ef6b0f39eea4d8f022c5bb05ac1d14fcc7058e84a51305b73/grpcio_tools-1.71.2.tar.gz", hash = "sha256:b5304d65c7569b21270b568e404a5a843cf027c66552a6a0978b23f137679c09", size = 5330655, upload-time = "2025-06-28T04:22:00.308Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/ad/e74a4d1cffff628c2ef1ec5b9944fb098207cc4af6eb8db4bc52e6d99236/grpcio_tools-1.71.2-cp310-cp310-linux_armv7l.whl", hash = "sha256:ab8a28c2e795520d6dc6ffd7efaef4565026dbf9b4f5270de2f3dd1ce61d2318", size = 2385557, upload-time = "2025-06-28T04:20:38.833Z" }, + { url = "https://files.pythonhosted.org/packages/63/bf/30b63418279d6fdc4fd4a3781a7976c40c7e8ee052333b9ce6bd4ce63f30/grpcio_tools-1.71.2-cp310-cp310-macosx_10_14_universal2.whl", hash = "sha256:654ecb284a592d39a85556098b8c5125163435472a20ead79b805cf91814b99e", size = 5446915, upload-time = "2025-06-28T04:20:40.947Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/cd/2994e0a0a67714fdb00c207c4bec60b9b356fbd6b0b7a162ecaabe925155/grpcio_tools-1.71.2-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:b49aded2b6c890ff690d960e4399a336c652315c6342232c27bd601b3705739e", size = 2348301, upload-time = "2025-06-28T04:20:42.766Z" }, + { url = "https://files.pythonhosted.org/packages/5b/8b/4f2315927af306af1b35793b332b9ca9dc5b5a2cde2d55811c9577b5f03f/grpcio_tools-1.71.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7811a6fc1c4b4e5438e5eb98dbd52c2dc4a69d1009001c13356e6636322d41a", size = 2742159, upload-time = "2025-06-28T04:20:44.206Z" }, + { url = "https://files.pythonhosted.org/packages/8d/98/d513f6c09df405c82583e7083c20718ea615ed0da69ec42c80ceae7ebdc5/grpcio_tools-1.71.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393a9c80596aa2b3f05af854e23336ea8c295593bbb35d9adae3d8d7943672bd", size = 2473444, upload-time = "2025-06-28T04:20:45.5Z" }, + { url = "https://files.pythonhosted.org/packages/fa/fe/00af17cc841916d5e4227f11036bf443ce006629212c876937c7904b0ba3/grpcio_tools-1.71.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:823e1f23c12da00f318404c4a834bb77cd150d14387dee9789ec21b335249e46", size = 2850339, upload-time = "2025-06-28T04:20:46.758Z" }, + { url = "https://files.pythonhosted.org/packages/7d/59/745fc50dfdbed875fcfd6433883270d39d23fb1aa4ecc9587786f772dce3/grpcio_tools-1.71.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9bfbea79d6aec60f2587133ba766ede3dc3e229641d1a1e61d790d742a3d19eb", size = 3300795, upload-time = "2025-06-28T04:20:48.327Z" }, + { url = "https://files.pythonhosted.org/packages/62/3e/d9d0fb2df78e601c28d02ef0cd5d007f113c1b04fc21e72bf56e8c3df66b/grpcio_tools-1.71.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:32f3a67b10728835b5ffb63fbdbe696d00e19a27561b9cf5153e72dbb93021ba", size = 2913729, upload-time = "2025-06-28T04:20:49.641Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/ae/ddb264b4a10c6c10336a7c177f8738b230c2c473d0c91dd5d8ce8ea1b857/grpcio_tools-1.71.2-cp310-cp310-win32.whl", hash = "sha256:7fcf9d92c710bfc93a1c0115f25e7d49a65032ff662b38b2f704668ce0a938df", size = 945997, upload-time = "2025-06-28T04:20:50.9Z" }, + { url = "https://files.pythonhosted.org/packages/ad/8d/5efd93698fe359f63719d934ebb2d9337e82d396e13d6bf00f4b06793e37/grpcio_tools-1.71.2-cp310-cp310-win_amd64.whl", hash = "sha256:914b4275be810290266e62349f2d020bb7cc6ecf9edb81da3c5cddb61a95721b", size = 1117474, upload-time = "2025-06-28T04:20:52.54Z" }, + { url = "https://files.pythonhosted.org/packages/17/e4/0568d38b8da6237ea8ea15abb960fb7ab83eb7bb51e0ea5926dab3d865b1/grpcio_tools-1.71.2-cp311-cp311-linux_armv7l.whl", hash = "sha256:0acb8151ea866be5b35233877fbee6445c36644c0aa77e230c9d1b46bf34b18b", size = 2385557, upload-time = "2025-06-28T04:20:54.323Z" }, + { url = "https://files.pythonhosted.org/packages/76/fb/700d46f72b0f636cf0e625f3c18a4f74543ff127471377e49a071f64f1e7/grpcio_tools-1.71.2-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:b28f8606f4123edb4e6da281547465d6e449e89f0c943c376d1732dc65e6d8b3", size = 5447590, upload-time = "2025-06-28T04:20:55.836Z" }, + { url = "https://files.pythonhosted.org/packages/12/69/d9bb2aec3de305162b23c5c884b9f79b1a195d42b1e6dabcc084cc9d0804/grpcio_tools-1.71.2-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:cbae6f849ad2d1f5e26cd55448b9828e678cb947fa32c8729d01998238266a6a", size = 2348495, upload-time = "2025-06-28T04:20:57.33Z" }, + { url = "https://files.pythonhosted.org/packages/d5/83/f840aba1690461b65330efbca96170893ee02fae66651bcc75f28b33a46c/grpcio_tools-1.71.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4d1027615cfb1e9b1f31f2f384251c847d68c2f3e025697e5f5c72e26ed1316", size = 2742333, upload-time = "2025-06-28T04:20:59.051Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/34/c02cd9b37de26045190ba665ee6ab8597d47f033d098968f812d253bbf8c/grpcio_tools-1.71.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bac95662dc69338edb9eb727cc3dd92342131b84b12b3e8ec6abe973d4cbf1b", size = 2473490, upload-time = "2025-06-28T04:21:00.614Z" }, + { url = "https://files.pythonhosted.org/packages/4d/c7/375718ae091c8f5776828ce97bdcb014ca26244296f8b7f70af1a803ed2f/grpcio_tools-1.71.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c50250c7248055040f89eb29ecad39d3a260a4b6d3696af1575945f7a8d5dcdc", size = 2850333, upload-time = "2025-06-28T04:21:01.95Z" }, + { url = "https://files.pythonhosted.org/packages/19/37/efc69345bd92a73b2bc80f4f9e53d42dfdc234b2491ae58c87da20ca0ea5/grpcio_tools-1.71.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6ab1ad955e69027ef12ace4d700c5fc36341bdc2f420e87881e9d6d02af3d7b8", size = 3300748, upload-time = "2025-06-28T04:21:03.451Z" }, + { url = "https://files.pythonhosted.org/packages/d2/1f/15f787eb25ae42086f55ed3e4260e85f385921c788debf0f7583b34446e3/grpcio_tools-1.71.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dd75dde575781262b6b96cc6d0b2ac6002b2f50882bf5e06713f1bf364ee6e09", size = 2913178, upload-time = "2025-06-28T04:21:04.879Z" }, + { url = "https://files.pythonhosted.org/packages/12/aa/69cb3a9dff7d143a05e4021c3c9b5cde07aacb8eb1c892b7c5b9fb4973e3/grpcio_tools-1.71.2-cp311-cp311-win32.whl", hash = "sha256:9a3cb244d2bfe0d187f858c5408d17cb0e76ca60ec9a274c8fd94cc81457c7fc", size = 946256, upload-time = "2025-06-28T04:21:06.518Z" }, + { url = "https://files.pythonhosted.org/packages/1e/df/fb951c5c87eadb507a832243942e56e67d50d7667b0e5324616ffd51b845/grpcio_tools-1.71.2-cp311-cp311-win_amd64.whl", hash = "sha256:00eb909997fd359a39b789342b476cbe291f4dd9c01ae9887a474f35972a257e", size = 1117661, upload-time = "2025-06-28T04:21:08.18Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/d3/3ed30a9c5b2424627b4b8411e2cd6a1a3f997d3812dbc6a8630a78bcfe26/grpcio_tools-1.71.2-cp312-cp312-linux_armv7l.whl", hash = "sha256:bfc0b5d289e383bc7d317f0e64c9dfb59dc4bef078ecd23afa1a816358fb1473", size = 2385479, upload-time = "2025-06-28T04:21:10.413Z" }, + { url = "https://files.pythonhosted.org/packages/54/61/e0b7295456c7e21ef777eae60403c06835160c8d0e1e58ebfc7d024c51d3/grpcio_tools-1.71.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:b4669827716355fa913b1376b1b985855d5cfdb63443f8d18faf210180199006", size = 5431521, upload-time = "2025-06-28T04:21:12.261Z" }, + { url = "https://files.pythonhosted.org/packages/75/d7/7bcad6bcc5f5b7fab53e6bce5db87041f38ef3e740b1ec2d8c49534fa286/grpcio_tools-1.71.2-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:d4071f9b44564e3f75cdf0f05b10b3e8c7ea0ca5220acbf4dc50b148552eef2f", size = 2350289, upload-time = "2025-06-28T04:21:13.625Z" }, + { url = "https://files.pythonhosted.org/packages/b2/8a/e4c1c4cb8c9ff7f50b7b2bba94abe8d1e98ea05f52a5db476e7f1c1a3c70/grpcio_tools-1.71.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a28eda8137d587eb30081384c256f5e5de7feda34776f89848b846da64e4be35", size = 2743321, upload-time = "2025-06-28T04:21:15.007Z" }, + { url = "https://files.pythonhosted.org/packages/fd/aa/95bc77fda5c2d56fb4a318c1b22bdba8914d5d84602525c99047114de531/grpcio_tools-1.71.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b19c083198f5eb15cc69c0a2f2c415540cbc636bfe76cea268e5894f34023b40", size = 2474005, upload-time = "2025-06-28T04:21:16.443Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ff/ca11f930fe1daa799ee0ce1ac9630d58a3a3deed3dd2f465edb9a32f299d/grpcio_tools-1.71.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:784c284acda0d925052be19053d35afbf78300f4d025836d424cf632404f676a", size = 2851559, upload-time = "2025-06-28T04:21:18.139Z" }, + { url = 
"https://files.pythonhosted.org/packages/64/10/c6fc97914c7e19c9bb061722e55052fa3f575165da9f6510e2038d6e8643/grpcio_tools-1.71.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:381e684d29a5d052194e095546eef067201f5af30fd99b07b5d94766f44bf1ae", size = 3300622, upload-time = "2025-06-28T04:21:20.291Z" }, + { url = "https://files.pythonhosted.org/packages/e5/d6/965f36cfc367c276799b730d5dd1311b90a54a33726e561393b808339b04/grpcio_tools-1.71.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3e4b4801fabd0427fc61d50d09588a01b1cfab0ec5e8a5f5d515fbdd0891fd11", size = 2913863, upload-time = "2025-06-28T04:21:22.196Z" }, + { url = "https://files.pythonhosted.org/packages/8d/f0/c05d5c3d0c1d79ac87df964e9d36f1e3a77b60d948af65bec35d3e5c75a3/grpcio_tools-1.71.2-cp312-cp312-win32.whl", hash = "sha256:84ad86332c44572305138eafa4cc30040c9a5e81826993eae8227863b700b490", size = 945744, upload-time = "2025-06-28T04:21:23.463Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e9/c84c1078f0b7af7d8a40f5214a9bdd8d2a567ad6c09975e6e2613a08d29d/grpcio_tools-1.71.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e1108d37eecc73b1c4a27350a6ed921b5dda25091700c1da17cfe30761cd462", size = 1117695, upload-time = "2025-06-28T04:21:25.22Z" }, + { url = "https://files.pythonhosted.org/packages/60/9c/bdf9c5055a1ad0a09123402d73ecad3629f75b9cf97828d547173b328891/grpcio_tools-1.71.2-cp313-cp313-linux_armv7l.whl", hash = "sha256:b0f0a8611614949c906e25c225e3360551b488d10a366c96d89856bcef09f729", size = 2384758, upload-time = "2025-06-28T04:21:26.712Z" }, + { url = "https://files.pythonhosted.org/packages/49/d0/6aaee4940a8fb8269c13719f56d69c8d39569bee272924086aef81616d4a/grpcio_tools-1.71.2-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:7931783ea7ac42ac57f94c5047d00a504f72fbd96118bf7df911bb0e0435fc0f", size = 5443127, upload-time = "2025-06-28T04:21:28.383Z" }, + { url = 
"https://files.pythonhosted.org/packages/d9/11/50a471dcf301b89c0ed5ab92c533baced5bd8f796abfd133bbfadf6b60e5/grpcio_tools-1.71.2-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:d188dc28e069aa96bb48cb11b1338e47ebdf2e2306afa58a8162cc210172d7a8", size = 2349627, upload-time = "2025-06-28T04:21:30.254Z" }, + { url = "https://files.pythonhosted.org/packages/bb/66/e3dc58362a9c4c2fbe98a7ceb7e252385777ebb2bbc7f42d5ab138d07ace/grpcio_tools-1.71.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f36c4b3cc42ad6ef67430639174aaf4a862d236c03c4552c4521501422bfaa26", size = 2742932, upload-time = "2025-06-28T04:21:32.325Z" }, + { url = "https://files.pythonhosted.org/packages/b7/1e/1e07a07ed8651a2aa9f56095411198385a04a628beba796f36d98a5a03ec/grpcio_tools-1.71.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bd9ed12ce93b310f0cef304176049d0bc3b9f825e9c8c6a23e35867fed6affd", size = 2473627, upload-time = "2025-06-28T04:21:33.752Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f9/3b7b32e4acb419f3a0b4d381bc114fe6cd48e3b778e81273fc9e4748caad/grpcio_tools-1.71.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7ce27e76dd61011182d39abca38bae55d8a277e9b7fe30f6d5466255baccb579", size = 2850879, upload-time = "2025-06-28T04:21:35.241Z" }, + { url = "https://files.pythonhosted.org/packages/1e/99/cd9e1acd84315ce05ad1fcdfabf73b7df43807cf00c3b781db372d92b899/grpcio_tools-1.71.2-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:dcc17bf59b85c3676818f2219deacac0156492f32ca165e048427d2d3e6e1157", size = 3300216, upload-time = "2025-06-28T04:21:36.826Z" }, + { url = "https://files.pythonhosted.org/packages/9f/c0/66eab57b14550c5b22404dbf60635c9e33efa003bd747211981a9859b94b/grpcio_tools-1.71.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:706360c71bdd722682927a1fb517c276ccb816f1e30cb71f33553e5817dc4031", size = 2913521, upload-time = "2025-06-28T04:21:38.347Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/9b/7c90af8f937d77005625d705ab1160bc42a7e7b021ee5c788192763bccd6/grpcio_tools-1.71.2-cp313-cp313-win32.whl", hash = "sha256:bcf751d5a81c918c26adb2d6abcef71035c77d6eb9dd16afaf176ee096e22c1d", size = 945322, upload-time = "2025-06-28T04:21:39.864Z" }, + { url = "https://files.pythonhosted.org/packages/5f/80/6db6247f767c94fe551761772f89ceea355ff295fd4574cb8efc8b2d1199/grpcio_tools-1.71.2-cp313-cp313-win_amd64.whl", hash = "sha256:b1581a1133552aba96a730178bc44f6f1a071f0eb81c5b6bc4c0f89f5314e2b8", size = 1117234, upload-time = "2025-06-28T04:21:41.893Z" }, +] + [[package]] name = "gunicorn" version = "23.0.0" @@ -3174,6 +3280,13 @@ semantic-router = [ { name = "aurelio-sdk" }, { name = "semantic-router" }, ] +stt-nvidia-riva = [ + { name = "audioread" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, + { name = "nvidia-riva-client" }, + { name = "soundfile" }, +] utils = [ { name = "numpydoc" }, ] @@ -3264,6 +3377,7 @@ requires-dist = [ { name = "aiohttp", specifier = "==3.13.4" }, { name = "anthropic", extras = ["vertex"], marker = "extra == 'proxy-runtime'", specifier = "==0.84.0" }, { name = "apscheduler", marker = "extra == 'proxy'", specifier = "==3.11.2" }, + { name = "audioread", marker = "extra == 'stt-nvidia-riva'", specifier = ">=3.0.1" }, { name = "aurelio-sdk", marker = "python_full_version < '3.14' and extra == 'semantic-router'", specifier = "==0.0.19" }, { name = "azure-ai-contentsafety", marker = "extra == 'proxy-runtime'", specifier = "==1.0.0" }, { name = "azure-identity", marker = "extra == 'extra-proxy'", specifier = "==1.25.2" }, @@ -3300,7 +3414,9 @@ requires-dist = [ { name = "mangum", marker = "extra == 'proxy-runtime'", specifier = "==0.17.0" }, { name = "mcp", marker = "extra == 
'proxy'", specifier = "==1.26.0" }, { name = "mlflow", marker = "extra == 'mlflow'", specifier = "==3.11.1" }, + { name = "numpy", marker = "extra == 'stt-nvidia-riva'", specifier = ">=1.26.0" }, { name = "numpydoc", marker = "extra == 'utils'", specifier = "==1.8.0" }, + { name = "nvidia-riva-client", marker = "extra == 'stt-nvidia-riva'", specifier = ">=2.15.0" }, { name = "openai", specifier = "==2.33.0" }, { name = "opentelemetry-api", marker = "extra == 'proxy-runtime'", specifier = "==1.28.0" }, { name = "opentelemetry-exporter-otlp", marker = "extra == 'proxy-runtime'", specifier = "==1.28.0" }, @@ -3325,13 +3441,14 @@ requires-dist = [ { name = "semantic-router", marker = "python_full_version < '3.14' and extra == 'semantic-router'", specifier = "==0.1.12" }, { name = "sentry-sdk", marker = "extra == 'proxy-runtime'", specifier = "==2.21.0" }, { name = "soundfile", marker = "extra == 'proxy'", specifier = "==0.12.1" }, + { name = "soundfile", marker = "extra == 'stt-nvidia-riva'", specifier = ">=0.12.1" }, { name = "tiktoken", specifier = "==0.12.0" }, { name = "tokenizers", specifier = "==0.23.1" }, { name = "uvicorn", marker = "extra == 'proxy'", specifier = "==0.33.0" }, { name = "uvloop", marker = "sys_platform != 'win32' and extra == 'proxy'", specifier = "==0.21.0" }, { name = "websockets", marker = "extra == 'proxy'", specifier = "==15.0.1" }, ] -provides-extras = ["proxy", "extra-proxy", "utils", "caching", "semantic-router", "mlflow", "grpc", "google", "proxy-runtime"] +provides-extras = ["proxy", "extra-proxy", "utils", "caching", "semantic-router", "mlflow", "grpc", "stt-nvidia-riva", "google", "proxy-runtime"] [package.metadata.requires-dev] ci = [ @@ -4156,6 +4273,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/45/56d99ba9366476cd8548527667f01869279cedb9e66b28eb4dfb27701679/numpydoc-1.8.0-py3-none-any.whl", hash = "sha256:72024c7fd5e17375dec3608a27c03303e8ad00c81292667955c6fea7a3ccf541", size = 64003, upload-time = 
"2024-08-09T15:52:37.276Z" }, ] +[[package]] +name = "nvidia-riva-client" +version = "2.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio-tools" }, + { name = "setuptools" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/82/0484c225bebe7ed37334474fba5c6ac7228638e692b84da0a0e7f2395672/nvidia_riva_client-2.16.0-py3-none-any.whl", hash = "sha256:99ef37b8f487d75a70c053736848221e09b728e5c910fb476333d375bd4347a3", size = 45491, upload-time = "2024-07-02T14:54:22.63Z" }, +] + [[package]] name = "oauthlib" version = "3.3.1" @@ -7068,6 +7197,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ff/07/45c21ed03d708c477367305726b89919b020a3a2a01f72aaf5ad941caf35/sse_starlette-3.4.1-py3-none-any.whl", hash = "sha256:6b43cf21f1d574d582a6e1b0cfbde1c94dc86a32a701a7168c99c4475c6bd1d0", size = 16487, upload-time = "2026-04-26T13:32:30.819Z" }, ] +[[package]] +name = "standard-aifc" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "standard-chunk", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/53/6050dc3dde1671eb3db592c13b55a8005e5040131f7509cef0215212cb84/standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43", size = 15240, upload-time = "2024-10-30T16:01:31.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/52/5fbb203394cc852334d1575cc020f6bcec768d2265355984dfd361968f36/standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66", size = 10492, upload-time = "2024-10-30T16:01:07.071Z" }, +] + +[[package]] +name = "standard-chunk" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/43/06/ce1bb165c1f111c7d23a1ad17204d67224baa69725bb6857a264db61beaf/standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654", size = 4672, upload-time = "2024-10-30T16:18:28.326Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/90/a5c1084d87767d787a6caba615aa50dc587229646308d9420c960cb5e4c0/standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c", size = 4944, upload-time = "2024-10-30T16:18:26.694Z" }, +] + +[[package]] +name = "standard-sunau" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/e3/ce8d38cb2d70e05ffeddc28bb09bad77cfef979eb0a299c9117f7ed4e6a9/standard_sunau-3.13.0.tar.gz", hash = "sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908", size = 9368, upload-time = "2024-10-30T16:01:41.626Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/ae/e3707f6c1bc6f7aa0df600ba8075bfb8a19252140cd595335be60e25f9ee/standard_sunau-3.13.0-py3-none-any.whl", hash = "sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622", size = 7364, upload-time = "2024-10-30T16:01:28.003Z" }, +] + [[package]] name = "starlette" version = "0.50.0"