feat(openai): round-trip Responses API reasoning_items in chat completions

Made-with: Cursor
This commit is contained in:
Sameer Kankute
2026-03-25 14:30:15 +05:30
parent b20cff8673
commit 00a810e92d
6 changed files with 474 additions and 13 deletions
+84
View File
@@ -581,6 +581,90 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
See [OpenAI Reasoning documentation](https://platform.openai.com/docs/guides/reasoning) for more details on organization verification requirements.
### Multi-turn Conversations with `reasoning_items`
For multi-turn conversations you need `reasoning_items`: structured blocks that include the `encrypted_content` token OpenAI uses to restore reasoning state on the next request. Pass `include=["reasoning.encrypted_content"]` on every call where you want that token returned.
<Tabs>
<TabItem value="non-streaming" label="Non-Streaming">
```python showLineNumbers title="Non-streaming: round-trip reasoning_items"
import litellm
messages = [{"role": "user", "content": "Solve this step by step: 2 + 2"}]
# Turn 1 — request reasoning_items (each one carries encrypted_content)
response = litellm.completion(
model="openai/responses/gpt-5-mini",
messages=messages,
reasoning_effort="low",
include=["reasoning.encrypted_content"],
)
assistant_msg = response.choices[0].message
# Turn 2 — pass reasoning_items back; LiteLLM converts to the correct Responses API format
messages.append({
"role": "assistant",
"content": assistant_msg.content,
"reasoning_items": assistant_msg.reasoning_items,
})
messages.append({"role": "user", "content": "Now summarize your reasoning."})
response2 = litellm.completion(
model="openai/responses/gpt-5-mini",
messages=messages,
reasoning_effort="low",
include=["reasoning.encrypted_content"],
)
```
</TabItem>
<TabItem value="streaming" label="Streaming">
`reasoning_items` (with `encrypted_content`) arrive on the final chunk when the full response completes:
```python showLineNumbers title="Streaming: collect and round-trip reasoning_items"
import litellm
messages = [{"role": "user", "content": "Solve this step by step: 2 + 2"}]
collected_content = []
collected_reasoning_items = []
stream = litellm.completion(
model="openai/responses/gpt-5-mini",
messages=messages,
stream=True,
reasoning_effort="low",
include=["reasoning.encrypted_content"],
)
for chunk in stream:
delta = chunk.choices[0].delta
if delta.content:
collected_content.append(delta.content)
if getattr(delta, "reasoning_items", None):
collected_reasoning_items.extend(delta.reasoning_items)
messages.append({
"role": "assistant",
"content": "".join(collected_content),
"reasoning_items": collected_reasoning_items or None,
})
messages.append({"role": "user", "content": "Continue the conversation."})
response2 = litellm.completion(
model="openai/responses/gpt-5-mini",
messages=messages,
reasoning_effort="low",
include=["reasoning.encrypted_content"],
)
```
</TabItem>
</Tabs>
### Verbosity Control for GPT-5 Models
The `verbosity` parameter controls the length and detail of responses from GPT-5 family models. It accepts three values: `"low"`, `"medium"`, or `"high"`.
@@ -32,6 +32,7 @@ from litellm.llms.base_llm.bridges.completion_transformation import (
)
from litellm.types.llms.openai import (
ChatCompletionAnnotation,
ChatCompletionReasoningItem,
ChatCompletionToolParamFunctionChunk,
Reasoning,
ResponsesAPIOptionalRequestParams,
@@ -55,6 +56,49 @@ if TYPE_CHECKING:
)
def _build_reasoning_item(
item_id: str,
encrypted_content: Optional[str],
summary_raw: Any,
) -> Dict[str, Any]:
"""Build a ChatCompletionReasoningItem-shaped dict from raw response data.
Handles both pydantic objects (attribute access) and plain dicts.
"""
summary: List[Dict[str, Any]] = []
for s in summary_raw or []:
if isinstance(s, dict):
summary.append(
{"type": s.get("type", "summary_text"), "text": s.get("text", "")}
)
else:
summary.append(
{
"type": getattr(s, "type", "summary_text"),
"text": getattr(s, "text", ""),
}
)
return {
"id": item_id,
"type": "reasoning",
"encrypted_content": encrypted_content,
"summary": summary,
}
def _reasoning_item_to_response_input(r_item: Dict[str, Any]) -> Dict[str, Any]:
"""Convert a stored ChatCompletionReasoningItem back to a Responses API input item."""
r_input: Dict[str, Any] = {
"type": "reasoning",
"id": r_item.get("id") or f"rs_{id(r_item)}",
# summary is always required by the Responses API, even when empty
"summary": r_item.get("summary") or [],
}
if r_item.get("encrypted_content"):
r_input["encrypted_content"] = r_item["encrypted_content"]
return r_input
class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
"""
Handler for transforming /chat/completions api requests to litellm.responses requests
@@ -202,10 +246,12 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
}
)
elif role == "assistant" and tool_calls and isinstance(tool_calls, list):
for r_item in msg.get("reasoning_items") or []:
input_items.append(_reasoning_item_to_response_input(r_item))
for tool_call in tool_calls:
function = tool_call.get("function")
if function:
input_tool_call = {
input_tool_call: Dict[str, Any] = {
"type": "function_call",
"call_id": tool_call["id"],
}
@@ -217,7 +263,9 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
else:
raise ValueError(f"tool call not supported: {tool_call}")
elif content is not None:
# Regular user/assistant message
if role == "assistant":
for r_item in msg.get("reasoning_items") or []:
input_items.append(_reasoning_item_to_response_input(r_item))
input_items.append(
{
"type": "message",
@@ -411,6 +459,7 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
choices: List[Choices] = []
index = 0
reasoning_content: Optional[str] = None
pending_reasoning_item: Optional[Dict[str, Any]] = None
# Collect all tool calls to put them in a single choice
# (Chat Completions API expects all tool calls in one message)
@@ -419,9 +468,16 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
for item in output_items:
if isinstance(item, ResponseReasoningItem):
for summary_item in item.summary:
response_text = getattr(summary_item, "text", "")
reasoning_content = response_text if response_text else ""
pending_reasoning_item = _build_reasoning_item(
item_id=item.id,
encrypted_content=getattr(item, "encrypted_content", None),
summary_raw=item.summary,
)
reasoning_content = " ".join(
s["text"]
for s in pending_reasoning_item["summary"]
if s.get("text")
)
elif isinstance(item, ResponseOutputMessage):
for content in item.content:
@@ -436,6 +492,12 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
content=response_text if response_text else "",
reasoning_content=reasoning_content,
annotations=annotations,
reasoning_items=cast(
Optional[List[ChatCompletionReasoningItem]],
[pending_reasoning_item]
if pending_reasoning_item is not None
else None,
),
)
choices.append(
@@ -446,7 +508,8 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
)
)
reasoning_content = None # flush reasoning content
reasoning_content = None # flush
pending_reasoning_item = None # flush
index += 1
elif isinstance(item, ResponseFunctionToolCall):
@@ -489,11 +552,18 @@ class LiteLLMResponsesTransformationHandler(CompletionTransformationBridge):
content=None,
tool_calls=accumulated_tool_calls,
reasoning_content=reasoning_content,
reasoning_items=cast(
Optional[List[ChatCompletionReasoningItem]],
[pending_reasoning_item]
if pending_reasoning_item is not None
else None,
),
)
choices.append(
Choices(message=msg, finish_reason="tool_calls", index=index)
)
reasoning_content = None
pending_reasoning_item = None
return choices
@@ -1232,6 +1302,25 @@ class OpenAiResponsesToChatCompletionStreamIterator(BaseModelResponseIterator):
finish_reason = "tool_calls" if has_function_calls else "stop"
# Extract reasoning items with encrypted_content for round-tripping
completed_reasoning_items: Optional[List[Dict[str, Any]]] = None
for item in output_items:
if not isinstance(item, dict) or item.get("type") != "reasoning":
continue
if completed_reasoning_items is None:
completed_reasoning_items = []
completed_reasoning_items.append(
_build_reasoning_item(
item_id=item.get("id", ""),
encrypted_content=item.get("encrypted_content"),
summary_raw=item.get("summary"),
)
)
completed_reasoning_items_typed = cast(
Optional[List[ChatCompletionReasoningItem]],
completed_reasoning_items,
)
usage = None
if response_data.get("usage"):
from litellm.responses.utils import ResponseAPILoggingUtils
@@ -1245,7 +1334,10 @@ class OpenAiResponsesToChatCompletionStreamIterator(BaseModelResponseIterator):
choices=[
StreamingChoices(
index=0,
delta=Delta(content=""),
delta=Delta(
content="",
reasoning_items=completed_reasoning_items_typed,
),
finish_reason=finish_reason,
)
],
@@ -831,6 +831,10 @@ class CustomStreamWrapper:
"annotations" in model_response.choices[0].delta
and model_response.choices[0].delta.annotations is not None
)
or (
getattr(model_response.choices[0].delta, "reasoning_items", None)
is not None
)
):
return True
else:
+14
View File
@@ -536,6 +536,20 @@ class ChatCompletionRedactedThinkingBlock(TypedDict, total=False):
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class ChatCompletionReasoningSummaryTextBlock(TypedDict, total=False):
    """One text segment inside the ``summary`` list of a reasoning item."""

    # Discriminator; always the literal "summary_text". The only required key.
    type: Required[Literal["summary_text"]]
    # Text of this summary segment (optional because of total=False).
    text: str
class ChatCompletionReasoningItem(TypedDict, total=False):
    """Represents an OpenAI Responses API reasoning item for round-tripping in conversation history.

    Only ``type`` is required; every other key is optional because the class
    is declared with ``total=False``.
    """

    # Discriminator; always the literal "reasoning".
    type: Required[Literal["reasoning"]]
    # Identifier of the reasoning item.
    id: str
    # Encrypted reasoning payload; may be None.
    encrypted_content: Optional[str]
    # Summary text segments belonging to this reasoning item.
    summary: List["ChatCompletionReasoningSummaryTextBlock"]
class WebSearchOptionsUserLocationApproximate(TypedDict, total=False):
city: str
"""Free text input for the city of the user, e.g. `San Francisco`."""
+20
View File
@@ -58,6 +58,7 @@ from .llms.openai import (
AllMessageValues,
Batch,
ChatCompletionAnnotation,
ChatCompletionReasoningItem,
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
@@ -1132,6 +1133,7 @@ class Message(SafeAttributeModel, OpenAIObject):
thinking_blocks: Optional[
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]
] = None
reasoning_items: Optional[List[ChatCompletionReasoningItem]] = None
provider_specific_fields: Optional[Dict[str, Any]] = Field(default=None)
annotations: Optional[List[ChatCompletionAnnotation]] = None
@@ -1150,6 +1152,7 @@ class Message(SafeAttributeModel, OpenAIObject):
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None,
reasoning_items: Optional[List[ChatCompletionReasoningItem]] = None,
annotations: Optional[List[ChatCompletionAnnotation]] = None,
**params,
):
@@ -1182,6 +1185,9 @@ class Message(SafeAttributeModel, OpenAIObject):
if thinking_blocks is not None:
init_values["thinking_blocks"] = thinking_blocks
if reasoning_items is not None:
init_values["reasoning_items"] = reasoning_items
if annotations is not None:
init_values["annotations"] = annotations
@@ -1219,6 +1225,11 @@ class Message(SafeAttributeModel, OpenAIObject):
if hasattr(self, "thinking_blocks"):
del self.thinking_blocks
if reasoning_items is None:
# ensure default response matches OpenAI spec
if hasattr(self, "reasoning_items"):
del self.reasoning_items
add_provider_specific_fields(self, provider_specific_fields)
def get(self, key, default=None):
@@ -1246,6 +1257,7 @@ class Delta(SafeAttributeModel, OpenAIObject):
thinking_blocks: Optional[
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]
] = None
reasoning_items: Optional[List[ChatCompletionReasoningItem]] = None
provider_specific_fields: Optional[Dict[str, Any]] = Field(default=None)
def __init__(
@@ -1262,6 +1274,7 @@ class Delta(SafeAttributeModel, OpenAIObject):
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None,
reasoning_items: Optional[List[ChatCompletionReasoningItem]] = None,
annotations: Optional[List[ChatCompletionAnnotation]] = None,
**params,
):
@@ -1295,6 +1308,13 @@ class Delta(SafeAttributeModel, OpenAIObject):
# ensure default response matches OpenAI spec
del self.thinking_blocks
if reasoning_items is not None:
self.reasoning_items = reasoning_items
else:
# ensure default response matches OpenAI spec
if hasattr(self, "reasoning_items"):
del self.reasoning_items
# Add annotations to the delta, ensure they are only on Delta if they exist (Match OpenAI spec)
if annotations is not None:
self.annotations = annotations
@@ -2127,9 +2127,10 @@ def test_convert_chat_completion_file_type_to_input_file():
}
]
input_items, instructions = (
handler.convert_chat_completion_messages_to_responses_api(messages)
)
(
input_items,
instructions,
) = handler.convert_chat_completion_messages_to_responses_api(messages)
assert len(input_items) == 1
msg = input_items[0]
@@ -2176,11 +2177,257 @@ def test_convert_chat_completion_file_type_with_file_id():
}
]
input_items, instructions = (
handler.convert_chat_completion_messages_to_responses_api(messages)
)
(
input_items,
instructions,
) = handler.convert_chat_completion_messages_to_responses_api(messages)
content = input_items[0]["content"]
assert content[1]["type"] == "input_file"
assert content[1]["file_id"] == "file-abc123"
assert "file_data" not in content[1]
# =============================================================================
# Tests for reasoning_items round-trip (encrypted_content preservation)
# =============================================================================
def test_reasoning_items_non_streaming_round_trip():
    """
    Non-streaming: verify that reasoning_items (with encrypted_content) are:
    1. Extracted from ResponseReasoningItem and attached to the Message.
    2. Emitted as a 'reasoning' input item when the assistant message is
       passed back to convert_chat_completion_messages_to_responses_api.

    The second part also checks ordering: the 'reasoning' input item must be
    placed before the assistant 'message' input item.
    """
    from unittest.mock import Mock

    from openai.types.responses import ResponseOutputMessage, ResponseOutputText
    from openai.types.responses.response_reasoning_item import (
        ResponseReasoningItem,
        Summary,
    )

    from litellm.completion_extras.litellm_responses_transformation.transformation import (
        LiteLLMResponsesTransformationHandler,
    )
    from litellm.types.llms.openai import (
        InputTokensDetails,
        OutputTokensDetails,
        ResponseAPIUsage,
        ResponsesAPIResponse,
    )
    from litellm.types.utils import ModelResponse, Usage

    handler = LiteLLMResponsesTransformationHandler()

    # Fake values; the test only checks that they come back unchanged.
    encrypted = "gAAAAABpw5abc123FAKE=="
    summary_text = "**Thinking about it**\n\nSome reasoning here."

    # Reasoning output item with a single summary part and an encrypted payload.
    reasoning_item = ResponseReasoningItem(
        id="rs_test001",
        summary=[Summary(text=summary_text, type="summary_text")],
        type="reasoning",
        content=None,
        encrypted_content=encrypted,
        status=None,
    )

    # Assistant text output that follows the reasoning item in the response.
    output_message = ResponseOutputMessage(
        id="msg_test001",
        content=[
            ResponseOutputText(
                annotations=[],
                text="The answer is 42.",
                type="output_text",
                logprobs=[],
            )
        ],
        role="assistant",
        status="completed",
        type="message",
    )

    usage = ResponseAPIUsage(
        input_tokens=10,
        input_tokens_details=InputTokensDetails(
            audio_tokens=None, cached_tokens=0, text_tokens=None
        ),
        output_tokens=20,
        output_tokens_details=OutputTokensDetails(reasoning_tokens=0, text_tokens=None),
        total_tokens=30,
        cost=None,
    )

    # Full Responses API response; output order is reasoning item, then message.
    raw_response = ResponsesAPIResponse(
        id="resp_test001",
        created_at=1234567890,
        error=None,
        incomplete_details=None,
        instructions=None,
        metadata={},
        model="gpt-5-mini",
        object="response",
        output=[reasoning_item, output_message],
        parallel_tool_calls=True,
        temperature=1.0,
        tool_choice="auto",
        tools=[],
        top_p=1.0,
        max_output_tokens=None,
        previous_response_id=None,
        reasoning={"effort": "low", "summary": "detailed"},
        status="completed",
        text={"format": {"type": "text"}, "verbosity": "medium"},
        truncation="disabled",
        usage=usage,
        user=None,
        store=True,
        background=False,
        billing={"payer": "developer"},
        max_tool_calls=None,
        prompt_cache_key=None,
        safety_identifier=None,
        service_tier="default",
        top_logprobs=0,
    )

    # Empty chat-completion shell (no choices) passed in as model_response.
    model_response = ModelResponse(
        id="chatcmpl-test001",
        created=1234567890,
        model=None,
        object="chat.completion",
        system_fingerprint=None,
        choices=[],
        usage=Usage(completion_tokens=0, prompt_tokens=0, total_tokens=0),
    )

    result = handler.transform_response(
        model="gpt-5-mini",
        raw_response=raw_response,
        model_response=model_response,
        logging_obj=Mock(),
        request_data={"model": "gpt-5-mini"},
        messages=[{"role": "user", "content": "What is the answer?"}],
        optional_params={},
        litellm_params={},
        encoding=Mock(),
    )

    # ── Part 1: reasoning_items on the response message ──────────────────────
    assert len(result.choices) == 1
    msg = result.choices[0].message

    # With a single summary part, reasoning_content is expected to equal its text.
    assert (
        msg.reasoning_content == summary_text
    ), "reasoning_content should equal summary text"
    assert msg.reasoning_items is not None, "reasoning_items should be set"
    assert len(msg.reasoning_items) == 1

    ri = msg.reasoning_items[0]
    assert ri["type"] == "reasoning"
    assert ri["id"] == "rs_test001"
    assert ri["encrypted_content"] == encrypted, "encrypted_content must be preserved"
    assert len(ri["summary"]) == 1
    assert ri["summary"][0]["text"] == summary_text

    # ── Part 2: reasoning item round-trips through message history ────────────
    # Feed the assistant message, including its reasoning_items, back as history.
    history = [
        {"role": "user", "content": "What is the answer?"},
        {
            "role": "assistant",
            "content": msg.content,
            "reasoning_items": msg.reasoning_items,
        },
        {"role": "user", "content": "Can you elaborate?"},
    ]

    input_items, _ = handler.convert_chat_completion_messages_to_responses_api(history)

    # A 'reasoning' input item must be present among the converted input items
    types = [item.get("type") for item in input_items]
    assert (
        "reasoning" in types
    ), "reasoning input item must be emitted for the assistant turn"

    # First (and expected only) reasoning input item.
    reasoning_input = next(
        item for item in input_items if item.get("type") == "reasoning"
    )
    assert reasoning_input["id"] == "rs_test001"
    assert reasoning_input["encrypted_content"] == encrypted
    assert reasoning_input["summary"][0]["text"] == summary_text

    # reasoning item must come before the assistant message item
    reasoning_idx = types.index("reasoning")
    assistant_msg_idx = next(
        i
        for i, item in enumerate(input_items)
        if item.get("type") == "message" and item.get("role") == "assistant"
    )
    assert (
        reasoning_idx < assistant_msg_idx
    ), "reasoning input item must precede the assistant message item"
def test_reasoning_items_streaming_emitted_on_response_completed():
    """
    Streaming: a ``response.completed`` event whose output contains a
    reasoning item must be parsed into a chunk whose delta carries
    ``reasoning_items`` (id, encrypted_content and summary intact), so the
    caller can send them back on the next request.
    """
    from litellm.completion_extras.litellm_responses_transformation.transformation import (
        OpenAiResponsesToChatCompletionStreamIterator,
    )

    encrypted = "gAAAAABpw5xyz987FAKE=="
    summary_text = "**Reasoning summary**\n\nModel thought about this carefully."

    # Build the event piece by piece: reasoning item first, then the message.
    reasoning_output = {
        "type": "reasoning",
        "id": "rs_stream001",
        "encrypted_content": encrypted,
        "summary": [{"type": "summary_text", "text": summary_text}],
    }
    message_output = {
        "type": "message",
        "id": "msg_stream001",
        "role": "assistant",
        "content": [{"type": "output_text", "text": "The answer."}],
        "status": "completed",
    }
    usage_payload = {
        "input_tokens": 10,
        "output_tokens": 5,
        "total_tokens": 15,
        "input_tokens_details": {"cached_tokens": 0},
        "output_tokens_details": {"reasoning_tokens": 0},
    }
    completed_event = {
        "type": "response.completed",
        "response": {
            "id": "resp_stream001",
            "status": "completed",
            "output": [reasoning_output, message_output],
            "usage": usage_payload,
        },
    }

    stream_iterator = OpenAiResponsesToChatCompletionStreamIterator(
        streaming_response=None, sync_stream=True
    )
    parsed = stream_iterator.chunk_parser(completed_event)

    assert len(parsed.choices) == 1
    final_choice = parsed.choices[0]
    final_delta = final_choice.delta

    # The response is complete and has no function calls.
    assert final_choice.finish_reason == "stop"

    # The delta of the final chunk is where reasoning_items must appear.
    emitted_items = getattr(final_delta, "reasoning_items", None)
    assert (
        emitted_items is not None
    ), "reasoning_items must be present on the response.completed delta"
    assert len(final_delta.reasoning_items) == 1

    emitted = final_delta.reasoning_items[0]
    assert emitted["type"] == "reasoning"
    assert emitted["id"] == "rs_stream001"
    assert (
        emitted["encrypted_content"] == encrypted
    ), "encrypted_content must be preserved in streaming"
    assert emitted["summary"][0]["text"] == summary_text