diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 9a4b158b62..88029615ba 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -2425,12 +2425,11 @@ class BaseTokenUsageProcessor: if not attr.startswith("_") and not callable( getattr(usage.completion_tokens_details, attr) ): - current_val = getattr( - combined.completion_tokens_details, attr, 0 + current_val = ( + getattr(combined.completion_tokens_details, attr, 0) or 0 ) - new_val = getattr(usage.completion_tokens_details, attr, 0) - - if new_val is not None and current_val is not None: + new_val = getattr(usage.completion_tokens_details, attr, 0) or 0 + if isinstance(new_val, (int, float)): setattr( combined.completion_tokens_details, attr, diff --git a/litellm/responses/utils.py b/litellm/responses/utils.py index 46a2894bd1..60badb57d2 100644 --- a/litellm/responses/utils.py +++ b/litellm/responses/utils.py @@ -1006,6 +1006,20 @@ class ResponseAPILoggingUtils: ) response_api_usage: ResponseAPIUsage if isinstance(usage_input, dict): + usage_input = dict(usage_input) # shallow copy; avoid mutating caller + # Realtime *_token_details → *_tokens_details when unset. + if ( + usage_input.get("input_tokens_details") is None + and "input_token_details" in usage_input + ): + usage_input["input_tokens_details"] = usage_input["input_token_details"] + if ( + usage_input.get("output_tokens_details") is None + and "output_token_details" in usage_input + ): + usage_input["output_tokens_details"] = usage_input[ + "output_token_details" + ] total_tokens = usage_input.get("total_tokens") if total_tokens is None: input_tokens = usage_input.get("input_tokens") @@ -1050,6 +1064,7 @@ class ResponseAPILoggingUtils: ), image_tokens=getattr(output_tokens_details, "image_tokens", None), text_tokens=getattr(output_tokens_details, "text_tokens", None), + audio_tokens=getattr(output_tokens_details, "audio_tokens", None), ) chat_usage = Usage( diff --git a/tests/test_litellm/responses/test_responses_utils.py b/tests/test_litellm/responses/test_responses_utils.py index 60b84f0e0a..bd44132150 100644 --- a/tests/test_litellm/responses/test_responses_utils.py +++ b/tests/test_litellm/responses/test_responses_utils.py @@ -327,7 +327,8 @@ class TestResponseAPILoggingUtils: "output_tokens_details": { "reasoning_tokens": 30, "image_tokens": 100, - "text_tokens": 70, + "text_tokens": 50, + "audio_tokens": 20, }, } @@ -346,7 +347,61 @@ class TestResponseAPILoggingUtils: assert result.completion_tokens_details is not None assert result.completion_tokens_details.reasoning_tokens == 30 assert result.completion_tokens_details.image_tokens == 100 - assert result.completion_tokens_details.text_tokens == 70 + assert result.completion_tokens_details.text_tokens == 50 + assert result.completion_tokens_details.audio_tokens == 20 + + def test_transform_response_api_usage_with_realtime_keys(self): + """Realtime input_token_details / output_token_details normalize for Usage.""" + usage = { + "input_tokens": 10, + "output_tokens": 20, + "total_tokens": 30, + "input_token_details": { + "text_tokens": 8, + "audio_tokens": 2, + "cached_tokens": 0, + }, + "output_token_details": { + "text_tokens": 12, + "audio_tokens": 8, + }, + } + + result = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage( + usage + ) + + assert result.prompt_tokens_details is not None + assert result.prompt_tokens_details.text_tokens == 8 + assert result.prompt_tokens_details.audio_tokens == 2 + + assert result.completion_tokens_details is not None + assert result.completion_tokens_details.text_tokens == 12 + assert result.completion_tokens_details.audio_tokens == 8 + + def test_transform_response_api_usage_tokens_details_keep_values(self): + """Keeps input_tokens_details / output_tokens_details when singular keys are also present.""" + usage = { + "input_tokens": 10, + "output_tokens": 20, + "total_tokens": 30, + "input_tokens_details": {"text_tokens": 10}, + "output_tokens_details": {"text_tokens": 20}, + "input_token_details": {"text_tokens": 1, "audio_tokens": 99}, + "output_token_details": {"text_tokens": 2, "audio_tokens": 98}, + } + + result = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage( + usage + ) + + assert result.prompt_tokens_details is not None + assert result.prompt_tokens_details.text_tokens == 10 + assert result.prompt_tokens_details.audio_tokens is None + + assert result.completion_tokens_details is not None + assert result.completion_tokens_details.text_tokens == 20 + assert result.completion_tokens_details.audio_tokens is None class TestResponsesAPIProviderSpecificParams: diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py index 3d45a3409d..82a4a60bf8 100644 --- a/tests/test_litellm/test_cost_calculator.py +++ b/tests/test_litellm/test_cost_calculator.py @@ -385,6 +385,51 @@ def test_handle_realtime_stream_cost_calculation(): ) assert cost == 0.0 # No usage, no cost + +def test_realtime_stream_combines_text_and_audio_token_details(): + """Realtime response.done usage with input_token_details / output_token_details.""" + from litellm.cost_calculator import RealtimeAPITokenUsageProcessor + + results: OpenAIRealtimeStreamList = [ + {"type": "session.created", "session": {"model": "gpt-4o-realtime-preview"}}, + { + "type": "response.done", + "response": { + "usage": { + "input_tokens": 10, + "output_tokens": 20, + "total_tokens": 30, + "input_token_details": {"text_tokens": 8, "audio_tokens": 2}, + "output_token_details": {"text_tokens": 12, "audio_tokens": 8}, + } + }, + }, + { + "type": "response.done", + "response": { + "usage": { + "input_tokens": 5, + "output_tokens": 15, + "total_tokens": 20, + "input_token_details": {"text_tokens": 3, "audio_tokens": 2}, + "output_token_details": {"text_tokens": 5, "audio_tokens": 10}, + } + }, + }, + ] + + combined = RealtimeAPITokenUsageProcessor.collect_and_combine_usage_from_realtime_stream_results( + results=results, + ) + + assert combined.prompt_tokens_details is not None + assert combined.prompt_tokens_details.text_tokens == 11 + assert combined.prompt_tokens_details.audio_tokens == 4 + + assert combined.completion_tokens_details is not None + assert combined.completion_tokens_details.text_tokens == 17 + assert combined.completion_tokens_details.audio_tokens == 18 + def test_realtime_logging_object_allows_null_transcript_in_conversation_item_added(): results: OpenAIRealtimeStreamList = [