feat: Implement vision capabilities and image generation tools, adding media handling, dedicated configurations, and trace optimization for image data.

2026-07-30 06:20:50 +00:00 · 2026-02-26 22:28:27 +07:00
parent 112fddb401
commit d5cc5a745d
19 changed files with 865 additions and 46 deletions
@@ -182,6 +182,10 @@ func runGateway() {
 	toolsReg.Register(webFetchTool)
 	slog.Info("web_fetch tool enabled")

+	// Vision fallback tool (read_image, for non-vision providers like MiniMax) and image generation tool (create_image)
+	toolsReg.Register(tools.NewReadImageTool(providerRegistry))
+	toolsReg.Register(tools.NewCreateImageTool(providerRegistry))
+
 	// TTS (text-to-speech) system
 	ttsMgr := setupTTS(cfg)
 	if ttsMgr != nil {
@@ -150,6 +150,7 @@ func consumeInboundMessages(ctx context.Context, msgBus *bus.MessageBus, agents
 		outCh := sched.ScheduleWithOpts(ctx, "main", agent.RunRequest{
 			SessionKey:        sessionKey,
 			Message:           msg.Content,
+			Media:             msg.Media,
 			Channel:           msg.Channel,
 			ChatID:            msg.ChatID,
 			PeerKind:          peerKind,
@@ -225,12 +226,28 @@ func consumeInboundMessages(ctx context.Context, msgBus *bus.MessageBus, agents
 			}

 			// Publish response back to the channel
-			msgBus.PublishOutbound(bus.OutboundMessage{
+			outMsg := bus.OutboundMessage{
 				Channel:  channel,
 				ChatID:   chatID,
 				Content:  outcome.Result.Content,
 				Metadata: meta,
-			})
+			}
+
+			// Convert media results from agent run to outbound media attachments
+			for _, mr := range outcome.Result.Media {
+				outMsg.Media = append(outMsg.Media, bus.MediaAttachment{
+					URL:         mr.Path,
+					ContentType: mr.ContentType,
+				})
+				if mr.AsVoice {
+					if outMsg.Metadata == nil {
+						outMsg.Metadata = make(map[string]string)
+					}
+					outMsg.Metadata["audio_as_voice"] = "true"
+				}
+			}
+
+			msgBus.PublishOutbound(outMsg)
 		}(msg.Channel, msg.ChatID, sessionKey, runID, outMeta)
 	}

@@ -3,6 +3,7 @@ package cmd
 import (
 	"context"
 	"log/slog"
+	"strings"

 	"github.com/nextlevelbuilder/goclaw/internal/config"
 	"github.com/nextlevelbuilder/goclaw/internal/providers"
@@ -51,7 +52,8 @@ func registerProviders(registry *providers.Registry, cfg *config.Config) {
 	}

 	if cfg.Providers.MiniMax.APIKey != "" {
-		registry.Register(providers.NewOpenAIProvider("minimax", cfg.Providers.MiniMax.APIKey, "https://api.minimax.io/v1", "MiniMax-M2.5"))
+		registry.Register(providers.NewOpenAIProvider("minimax", cfg.Providers.MiniMax.APIKey, "https://api.minimax.io/v1", "MiniMax-M2.5").
+			WithChatPath("/text/chatcompletion_v2"))
 		slog.Info("registered provider", "name", "minimax")
 	}

@@ -82,7 +84,12 @@ func registerProvidersFromDB(registry *providers.Registry, provStore store.Provi
 		if p.ProviderType == "anthropic_native" {
 			registry.Register(providers.NewAnthropicProvider(p.APIKey))
 		} else {
-			registry.Register(providers.NewOpenAIProvider(p.Name, p.APIKey, p.APIBase, ""))
+			prov := providers.NewOpenAIProvider(p.Name, p.APIKey, p.APIBase, "")
+			// MiniMax native API uses a different chat path for vision support.
+			if p.Name == "minimax" && strings.Contains(p.APIBase, "minimax.io") {
+				prov.WithChatPath("/text/chatcompletion_v2")
+			}
+			registry.Register(prov)
 		}
 		slog.Info("registered provider from DB", "name", p.Name)
 	}
@@ -220,6 +220,7 @@ func NewLoop(cfg LoopConfig) *Loop {
 type RunRequest struct {
 	SessionKey       string // composite key: agent:{agentId}:{channel}:{peerKind}:{chatId}
 	Message          string // user message
+	Media            []string // local file paths to images (already sanitized)
 	Channel          string // source channel
 	ChatID           string // source chat ID
 	PeerKind         string // "direct" or "group" (for session key building and tool context)
@@ -235,10 +236,18 @@ type RunRequest struct {

 // RunResult is the output of a completed agent run.
 type RunResult struct {
-	Content    string      `json:"content"`
-	RunID      string      `json:"runId"`
-	Iterations int         `json:"iterations"`
+	Content    string           `json:"content"`
+	RunID      string           `json:"runId"`
+	Iterations int              `json:"iterations"`
 	Usage      *providers.Usage `json:"usage,omitempty"`
+	Media      []MediaResult    `json:"media,omitempty"` // media files from tool results (MEDIA: prefix)
+}
+
+// MediaResult represents a media file produced by a tool during the agent run.
+type MediaResult struct {
+	Path        string `json:"path"`                  // local file path
+	ContentType string `json:"content_type,omitempty"` // MIME type
+	AsVoice     bool   `json:"as_voice,omitempty"`     // send as voice message (Telegram OGG)
 }

 // Run processes a single message through the agent loop.
@@ -351,6 +360,15 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
 	if req.SenderID != "" {
 		ctx = store.WithSenderID(ctx, req.SenderID)
 	}
+	// Inject per-agent vision/imagegen config for read_image/create_image tools
+	if l.agentToolPolicy != nil {
+		if l.agentToolPolicy.Vision != nil {
+			ctx = tools.WithVisionConfig(ctx, l.agentToolPolicy.Vision)
+		}
+		if l.agentToolPolicy.ImageGen != nil {
+			ctx = tools.WithImageGenConfig(ctx, l.agentToolPolicy.ImageGen)
+		}
+	}

 	// Per-user workspace isolation.
 	// Each user gets a subdirectory within the agent's workspace.
@@ -430,19 +448,37 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
 	// (hadBootstrap) — no extra DB roundtrip needed for bootstrap detection.
 	messages, hadBootstrap := l.buildMessages(ctx, history, summary, req.Message, req.ExtraSystemPrompt, req.SessionKey, req.Channel, req.UserID, req.HistoryLimit)

-	// 2. Buffer new messages — write to session only AFTER the run completes.
+	// 2. Attach vision images to the current user message (last in messages slice).
+	// Images are only attached to the live request, NOT persisted in session history.
+	if len(req.Media) > 0 {
+		if images := loadImages(req.Media); len(images) > 0 {
+			messages[len(messages)-1].Images = images
+			ctx = tools.WithMediaImages(ctx, images) // make images available to read_image tool
+			slog.Info("vision: attached images to user message", "count", len(images), "agent", l.id, "session", req.SessionKey)
+		}
+		// Clean up every temp media file, including any that loadImages skipped — loaded images are now base64-encoded in memory.
+		for _, p := range req.Media {
+			if err := os.Remove(p); err != nil {
+				slog.Debug("vision: failed to clean temp media file", "path", p, "error", err)
+			}
+		}
+	}
+
+	// 3. Buffer new messages — write to session only AFTER the run completes.
 	// This prevents concurrent runs from seeing each other's in-progress messages.
+	// NOTE: pendingMsgs stores TEXT ONLY (no images) to avoid bloating session storage.
 	var pendingMsgs []providers.Message
 	pendingMsgs = append(pendingMsgs, providers.Message{
 		Role:    "user",
 		Content: req.Message,
 	})

-	// 3. Run LLM iteration loop
+	// 4. Run LLM iteration loop
 	var totalUsage providers.Usage
 	iteration := 0
 	var finalContent string
-	var asyncToolCalls []string // track async spawn tool names for fallback
+	var asyncToolCalls []string  // track async spawn tool names for fallback
+	var mediaResults []MediaResult // media files from tool MEDIA: results

 	for iteration < l.maxIterations {
 		iteration++
@@ -533,7 +569,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
 			toolSpanStart := time.Now().UTC()
 			result := l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)

-			l.emitToolSpan(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON), result.ForLLM, result.IsError)
+			l.emitToolSpan(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON), result)

 			if result.Async {
 				asyncToolCalls = append(asyncToolCalls, tc.Name)
@@ -558,6 +594,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
 				},
 			})

+			// Collect MEDIA: paths from tool results
+			if mr := parseMediaResult(result.ForLLM); mr != nil {
+				mediaResults = append(mediaResults, *mr)
+			}
+
 			toolMsg := providers.Message{
 				Role:       "tool",
 				Content:    result.ForLLM,
@@ -619,7 +660,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)

 			// 5. Process results sequentially: emit events, append messages, save to session
 			for _, r := range collected {
-				l.emitToolSpan(ctx, r.spanStart, r.tc.Name, r.tc.ID, r.argsJSON, r.result.ForLLM, r.result.IsError)
+				l.emitToolSpan(ctx, r.spanStart, r.tc.Name, r.tc.ID, r.argsJSON, r.result)

 				if r.result.Async {
 					asyncToolCalls = append(asyncToolCalls, r.tc.Name)
@@ -644,6 +685,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
 					},
 				})

+				// Collect MEDIA: paths from tool results
+				if mr := parseMediaResult(r.result.ForLLM); mr != nil {
+					mediaResults = append(mediaResults, *mr)
+				}
+
 				toolMsg := providers.Message{
 					Role:       "tool",
 					Content:    r.result.ForLLM,
@@ -723,9 +769,69 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
 		RunID:      req.RunID,
 		Iterations: iteration,
 		Usage:      &totalUsage,
+		Media:      mediaResults,
 	}, nil
 }

+// parseMediaResult extracts a MediaResult from the first "MEDIA:" marker found anywhere in a tool result string (not only at its start).
+// Handles formats: "MEDIA:/path/to/file" and "[[audio_as_voice]]\nMEDIA:/path/to/file"; the first line of the trimmed text after the marker is the path.
+// Returns nil if no "MEDIA:" marker is found or if nothing but whitespace follows it.
+func parseMediaResult(toolOutput string) *MediaResult {
+	s := toolOutput
+	asVoice := false
+
+	// Check for [[audio_as_voice]] tag (TTS voice messages); every occurrence is removed before looking for the marker
+	if strings.Contains(s, "[[audio_as_voice]]") {
+		asVoice = true
+		s = strings.ReplaceAll(s, "[[audio_as_voice]]", "")
+		s = strings.TrimSpace(s)
+	}
+
+	// Find the first MEDIA: marker (matched anywhere in the output, not anchored to a line start)
+	idx := strings.Index(s, "MEDIA:")
+	if idx < 0 {
+		return nil
+	}
+	path := strings.TrimSpace(s[idx+6:]) // 6 == len("MEDIA:")
+	if path == "" {
+		return nil
+	}
+	// Take only the first line (in case there's trailing text)
+	if nl := strings.IndexByte(path, '\n'); nl >= 0 {
+		path = strings.TrimSpace(path[:nl])
+	}
+
+	return &MediaResult{
+		Path:        path,
+		ContentType: mimeFromExt(filepath.Ext(path)),
+		AsVoice:     asVoice,
+	}
+}
+
+// mimeFromExt maps a file extension (with leading dot, matched case-insensitively) to a MIME type for common image, video, and audio formats; unknown extensions yield "application/octet-stream".
+func mimeFromExt(ext string) string {
+	switch strings.ToLower(ext) {
+	case ".png":
+		return "image/png"
+	case ".jpg", ".jpeg":
+		return "image/jpeg"
+	case ".gif":
+		return "image/gif"
+	case ".webp":
+		return "image/webp"
+	case ".mp4":
+		return "video/mp4"
+	case ".ogg", ".opus":
+		return "audio/ogg"
+	case ".mp3":
+		return "audio/mpeg"
+	case ".wav":
+		return "audio/wav"
+	default:
+		return "application/octet-stream"
+	}
+}
+
 // sanitizePathSegment makes a userID safe for use as a directory name.
 // Replaces colons, spaces, and other unsafe chars with underscores.
 func sanitizePathSegment(s string) string {
@@ -12,6 +12,7 @@ import (

 	"github.com/nextlevelbuilder/goclaw/internal/providers"
 	"github.com/nextlevelbuilder/goclaw/internal/store"
+	"github.com/nextlevelbuilder/goclaw/internal/tools"
 	"github.com/nextlevelbuilder/goclaw/internal/tracing"
 )

@@ -61,10 +62,22 @@ func (l *Loop) emitLLMSpan(ctx context.Context, start time.Time, iteration int,
 		span.AgentID = &l.agentUUID
 	}

-	// Verbose mode: serialize full messages and output
+	// Verbose mode: serialize full messages and output.
+	// Strip base64 image data to avoid bloating traces and PostgreSQL encoding issues.
 	verbose := collector.Verbose()
 	if verbose && len(messages) > 0 {
-		if b, err := json.Marshal(messages); err == nil {
+		stripped := make([]providers.Message, len(messages))
+		copy(stripped, messages)
+		for i := range stripped {
+			if len(stripped[i].Images) > 0 {
+				placeholder := make([]providers.ImageContent, len(stripped[i].Images))
+				for j, img := range stripped[i].Images {
+					placeholder[j] = providers.ImageContent{MimeType: img.MimeType, Data: fmt.Sprintf("[base64 %s, %d bytes]", img.MimeType, len(img.Data))}
+				}
+				stripped[i].Images = placeholder
+			}
+		}
+		if b, err := json.Marshal(stripped); err == nil {
 			span.InputPreview = truncateStr(string(b), 100000)
 		}
 	}
@@ -98,7 +111,8 @@ func (l *Loop) emitLLMSpan(ctx context.Context, start time.Time, iteration int,
 }

 // emitToolSpan records a tool call span if tracing is active.
-func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, toolCallID, input, output string, isError bool) {
+// result is the full tool execution result, which may contain Usage from inner LLM calls. NOTE(review): result is dereferenced without a visible nil check — confirm callers never pass nil.
+func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, toolCallID, input string, result *tools.Result) {
 	traceID := tracing.TraceIDFromContext(ctx)
 	collector := tracing.CollectorFromContext(ctx)
 	if collector == nil || traceID == uuid.Nil {
@@ -121,7 +135,7 @@ func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, tool
 		ToolName:      toolName,
 		ToolCallID:    toolCallID,
 		InputPreview:  truncateStr(input, previewLimit),
-		OutputPreview: truncateStr(output, previewLimit),
+		OutputPreview: truncateStr(result.ForLLM, previewLimit),
 		Status:        store.SpanStatusCompleted,
 		Level:         "DEFAULT",
 		CreatedAt:     now,
@@ -132,9 +146,26 @@ func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, tool
 	if l.agentUUID != uuid.Nil {
 		span.AgentID = &l.agentUUID
 	}
-	if isError {
+	if result.IsError {
 		span.Status = store.SpanStatusError
-		span.Error = truncateStr(output, 200)
+		span.Error = truncateStr(result.ForLLM, 200)
+	}
+
+	// Record token usage from tools that make internal LLM calls (e.g. read_image).
+	if result.Usage != nil {
+		span.InputTokens = result.Usage.PromptTokens
+		span.OutputTokens = result.Usage.CompletionTokens
+		span.Provider = result.Provider
+		span.Model = result.Model
+		if result.Usage.CacheCreationTokens > 0 || result.Usage.CacheReadTokens > 0 {
+			meta := map[string]int{
+				"cache_creation_tokens": result.Usage.CacheCreationTokens,
+				"cache_read_tokens":     result.Usage.CacheReadTokens,
+			}
+			if b, err := json.Marshal(meta); err == nil {
+				span.Metadata = b
+			}
+		}
 	}

 	collector.EmitSpan(span)
@@ -0,0 +1,62 @@
+package agent
+
+import (
+	"encoding/base64"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/nextlevelbuilder/goclaw/internal/providers"
+)
+
+// maxImageBytes is the largest image file size loadImages accepts (10 MiB); larger files are skipped after being read.
+const maxImageBytes = 10 * 1024 * 1024
+
+// loadImages reads local image files and returns them as base64-encoded ImageContent values (nil when paths is empty or nothing could be loaded).
+// Files with unsupported extensions are skipped silently; files that fail to read or exceed maxImageBytes are skipped with a warning log.
+func loadImages(paths []string) []providers.ImageContent {
+	if len(paths) == 0 {
+		return nil
+	}
+
+	var images []providers.ImageContent
+	for _, p := range paths {
+		mime := inferImageMime(p)
+		if mime == "" {
+			continue // unsupported extension: skipped without logging
+		}
+
+		data, err := os.ReadFile(p)
+		if err != nil {
+			slog.Warn("vision: failed to read image file", "path", p, "error", err)
+			continue
+		}
+		if len(data) > maxImageBytes { // size is checked only after the whole file has been read
+			slog.Warn("vision: image file too large, skipping", "path", p, "size", len(data))
+			continue
+		}
+
+		images = append(images, providers.ImageContent{
+			MimeType: mime,
+			Data:     base64.StdEncoding.EncodeToString(data),
+		})
+	}
+	return images
+}
+
+// inferImageMime returns the MIME type for supported image extensions (matched case-insensitively on the path's extension only), or "" if not an image.
+func inferImageMime(path string) string {
+	switch strings.ToLower(filepath.Ext(path)) {
+	case ".jpg", ".jpeg":
+		return "image/jpeg"
+	case ".png":
+		return "image/png"
+	case ".gif":
+		return "image/gif"
+	case ".webp":
+		return "image/webp"
+	default:
+		return ""
+	}
+}
@@ -53,7 +53,10 @@ func SanitizeAssistantContent(content string) string {
 	// 6. Collapse consecutive duplicate blocks
 	content = collapseConsecutiveDuplicateBlocks(content)

-	// 7. Strip leading blank lines (preserve indentation)
+	// 7. Strip MEDIA: paths from LLM output (media delivered separately)
+	content = stripMediaPaths(content)
+
+	// 8. Strip leading blank lines (preserve indentation)
 	content = stripLeadingBlankLines(content)

 	content = strings.TrimSpace(content)
@@ -277,7 +280,28 @@ func collapseConsecutiveDuplicateBlocks(content string) string {
 	return collapsed
 }

-// --- 7. Strip leading blank lines ---
+// --- 7. Strip MEDIA: paths ---
+
+// stripMediaPaths removes lines that start (after trimming whitespace) with "MEDIA:" or "[[audio_as_voice]]" from LLM output and trims the result.
+// These are tool result artifacts that should not appear in user-facing text
+// (media files are delivered separately via OutboundMessage.Media). Content that contains no "MEDIA:" is returned unchanged.
+func stripMediaPaths(content string) string {
+	if !strings.Contains(content, "MEDIA:") {
+		return content
+	}
+	lines := strings.Split(content, "\n")
+	var result []string
+	for _, line := range lines {
+		trimmed := strings.TrimSpace(line)
+		if strings.HasPrefix(trimmed, "MEDIA:") || strings.HasPrefix(trimmed, "[[audio_as_voice]]") {
+			continue
+		}
+		result = append(result, line)
+	}
+	return strings.TrimSpace(strings.Join(result, "\n"))
+}
+
+// --- 8. Strip leading blank lines ---

 var leadingBlankLinesPattern = regexp.MustCompile(`^(?:[ \t]*\r?\n)+`)

@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"log/slog"
+	"os"
 	"sync"

 	"github.com/nextlevelbuilder/goclaw/internal/bus"
@@ -130,6 +131,16 @@ func (m *Manager) dispatchOutbound(ctx context.Context) {
 					"error", err,
 				)
 			}
+
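+			// Clean up temporary media files after the send attempt, whether it succeeded or failed.
+			// Files are created by tools (create_image, tts) and only needed for the send. NOTE(review): media.URL is removed as a local path — confirm it never names a non-temporary file.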
+			for _, media := range msg.Media {
+				if media.URL != "" {
+					if err := os.Remove(media.URL); err != nil {
+						slog.Debug("failed to clean up media file", "path", media.URL, "error", err)
+					}
+				}
+			}
 		}
 	}
 }
@@ -358,30 +358,49 @@ func (c *Channel) handleMessage(ctx context.Context, update telego.Update) {
 }

 // detectMention checks if a Telegram message mentions the bot.
+// Checks both msg.Text/Entities (text messages) and msg.Caption/CaptionEntities (photo/media messages).
 func (c *Channel) detectMention(msg *telego.Message, botUsername string) bool {
 	if botUsername == "" {
 		return false
 	}
+	lowerBot := strings.ToLower(botUsername)

-	for _, entity := range msg.Entities {
-		if entity.Type == "mention" && msg.Text != "" {
-			mentioned := msg.Text[entity.Offset : entity.Offset+entity.Length]
-			if strings.EqualFold(mentioned, "@"+botUsername) {
-				return true
-			}
+	// Check both text entities and caption entities (photos use Caption, not Text).
+	for _, pair := range []struct {
+		entities []telego.MessageEntity
+		text     string
+	}{
+		{msg.Entities, msg.Text},
+		{msg.CaptionEntities, msg.Caption},
+	} {
+		if pair.text == "" {
+			continue
 		}
-		if entity.Type == "bot_command" && msg.Text != "" {
-			cmdText := msg.Text[entity.Offset : entity.Offset+entity.Length]
-			if strings.Contains(strings.ToLower(cmdText), "@"+strings.ToLower(botUsername)) {
-				return true
+		for _, entity := range pair.entities {
+			if entity.Type == "mention" {
+				mentioned := pair.text[entity.Offset : entity.Offset+entity.Length]
+				if strings.EqualFold(mentioned, "@"+botUsername) {
+					return true
+				}
+			}
+			if entity.Type == "bot_command" {
+				cmdText := pair.text[entity.Offset : entity.Offset+entity.Length]
+				if strings.Contains(strings.ToLower(cmdText), "@"+lowerBot) {
+					return true
+				}
 			}
 		}
 	}

-	if msg.Text != "" && strings.Contains(strings.ToLower(msg.Text), "@"+strings.ToLower(botUsername)) {
+	// Fallback: substring check in both text and caption
+	if msg.Text != "" && strings.Contains(strings.ToLower(msg.Text), "@"+lowerBot) {
+		return true
+	}
+	if msg.Caption != "" && strings.Contains(strings.ToLower(msg.Caption), "@"+lowerBot) {
 		return true
 	}

+	// Reply to bot's message = implicit mention
 	if msg.ReplyToMessage != nil && msg.ReplyToMessage.From != nil {
 		if msg.ReplyToMessage.From.Username == botUsername {
 			return true
@@ -186,6 +186,22 @@ type ToolPolicySpec struct {
 	Deny       []string                   `json:"deny,omitempty"`
 	AlsoAllow  []string                   `json:"alsoAllow,omitempty"`
 	ByProvider map[string]*ToolPolicySpec `json:"byProvider,omitempty"`
+	Vision     *VisionConfig              `json:"vision,omitempty"`   // per-agent vision provider/model override
+	ImageGen   *ImageGenConfig            `json:"imageGen,omitempty"` // per-agent image generation config
+}
+
+// VisionConfig configures the provider and model for vision tools (read_image).
+type VisionConfig struct {
+	Provider string `json:"provider,omitempty"` // e.g. "gemini", "anthropic"
+	Model    string `json:"model,omitempty"`    // e.g. "gemini-2.0-flash"
+}
+
+// ImageGenConfig configures the provider and model for image generation (create_image).
+type ImageGenConfig struct {
+	Provider string `json:"provider,omitempty"` // provider with image gen API (e.g. "openrouter")
+	Model    string `json:"model,omitempty"`    // e.g. "google/gemini-2.5-flash-image-preview"
+	Size     string `json:"size,omitempty"`     // default aspect ratio / size
+	Quality  string `json:"quality,omitempty"`  // "standard" or "hd"
 }

 type WebToolsConfig struct {
@@ -220,10 +220,34 @@ func (p *AnthropicProvider) buildRequestBody(model string, req ChatRequest, stre
 			})

 		case "user":
-			messages = append(messages, map[string]interface{}{
-				"role":    "user",
-				"content": msg.Content,
-			})
+			if len(msg.Images) > 0 {
+				var blocks []map[string]interface{}
+				for _, img := range msg.Images {
+					blocks = append(blocks, map[string]interface{}{
+						"type": "image",
+						"source": map[string]interface{}{
+							"type":       "base64",
+							"media_type": img.MimeType,
+							"data":       img.Data,
+						},
+					})
+				}
+				if msg.Content != "" {
+					blocks = append(blocks, map[string]interface{}{
+						"type": "text",
+						"text": msg.Content,
+					})
+				}
+				messages = append(messages, map[string]interface{}{
+					"role":    "user",
+					"content": blocks,
+				})
+			} else {
+				messages = append(messages, map[string]interface{}{
+					"role":    "user",
+					"content": msg.Content,
+				})
+			}

 		case "assistant":
 			var blocks []map[string]interface{}
@@ -18,6 +18,7 @@ type OpenAIProvider struct {
 	name         string
 	apiKey       string
 	apiBase      string
+	chatPath     string // defaults to "/chat/completions"
 	defaultModel string
 	client       *http.Client
 	retryConfig  RetryConfig
@@ -33,14 +34,23 @@ func NewOpenAIProvider(name, apiKey, apiBase, defaultModel string) *OpenAIProvid
 		name:         name,
 		apiKey:       apiKey,
 		apiBase:      apiBase,
+		chatPath:     "/chat/completions",
 		defaultModel: defaultModel,
 		client:       &http.Client{Timeout: 120 * time.Second},
 		retryConfig:  DefaultRetryConfig(),
 	}
 }

+// WithChatPath sets a custom chat completions path (e.g. "/text/chatcompletion_v2" for MiniMax native API) on p and returns p for chaining; the receiver is modified in place, not copied.
+func (p *OpenAIProvider) WithChatPath(path string) *OpenAIProvider {
+	p.chatPath = path
+	return p
+}
+
 func (p *OpenAIProvider) Name() string        { return p.name }
 func (p *OpenAIProvider) DefaultModel() string { return p.defaultModel }
+func (p *OpenAIProvider) APIKey() string       { return p.apiKey }
+func (p *OpenAIProvider) APIBase() string      { return p.apiBase }

 // resolveModel returns the model ID to use for a request.
 // For OpenRouter, model IDs require a provider prefix (e.g. "anthropic/claude-sonnet-4-5-20250929").
@@ -184,7 +194,24 @@ func (p *OpenAIProvider) buildRequestBody(model string, req ChatRequest, stream

 		// Include content; omit empty content for assistant messages with tool_calls
 		// (Gemini rejects empty content → "must include at least one parts field").
-		if m.Content != "" || len(m.ToolCalls) == 0 {
+		if m.Role == "user" && len(m.Images) > 0 {
+			var parts []map[string]interface{}
+			for _, img := range m.Images {
+				parts = append(parts, map[string]interface{}{
+					"type": "image_url",
+					"image_url": map[string]interface{}{
+						"url": fmt.Sprintf("data:%s;base64,%s", img.MimeType, img.Data),
+					},
+				})
+			}
+			if m.Content != "" {
+				parts = append(parts, map[string]interface{}{
+					"type": "text",
+					"text": m.Content,
+				})
+			}
+			msg["content"] = parts
+		} else if m.Content != "" || len(m.ToolCalls) == 0 {
 			msg["content"] = m.Content
 		}

@@ -247,7 +274,7 @@ func (p *OpenAIProvider) doRequest(ctx context.Context, body interface{}) (io.Re
 		return nil, fmt.Errorf("%s: marshal request: %w", p.name, err)
 	}

-	httpReq, err := http.NewRequestWithContext(ctx, "POST", p.apiBase+"/chat/completions", bytes.NewReader(data))
+	httpReq, err := http.NewRequestWithContext(ctx, "POST", p.apiBase+p.chatPath, bytes.NewReader(data))
 	if err != nil {
 		return nil, fmt.Errorf("%s: create request: %w", p.name, err)
 	}
@@ -42,12 +42,19 @@ type StreamChunk struct {
 	Done      bool   `json:"done,omitempty"`
 }

+// ImageContent represents a base64-encoded image for vision-capable models.
+type ImageContent struct {
+	MimeType string `json:"mime_type"` // e.g. "image/jpeg"
+	Data     string `json:"data"`      // base64-encoded image bytes
+}
+
 // Message represents a conversation message.
 type Message struct {
-	Role       string     `json:"role"`                  // "system", "user", "assistant", "tool"
-	Content    string     `json:"content"`
-	ToolCalls  []ToolCall `json:"tool_calls,omitempty"`
-	ToolCallID string     `json:"tool_call_id,omitempty"` // for role="tool" responses
+	Role       string         `json:"role"`                  // "system", "user", "assistant", "tool"
+	Content    string         `json:"content"`
+	Images     []ImageContent `json:"images,omitempty"`      // vision: base64 images
+	ToolCalls  []ToolCall     `json:"tool_calls,omitempty"`
+	ToolCallID string         `json:"tool_call_id,omitempty"` // for role="tool" responses
 }

 // ToolCall represents a tool invocation requested by the LLM.
@@ -1,6 +1,10 @@
 package tools

-import "context"
+import (
+	"context"
+
+	"github.com/nextlevelbuilder/goclaw/internal/config"
+)

 // Tool execution context keys.
 // These replace mutable setter fields on tool instances, making tools thread-safe
@@ -71,3 +75,28 @@ func ToolWorkspaceFromCtx(ctx context.Context) string {
 	v, _ := ctx.Value(ctxWorkspace).(string)
 	return v
 }
+
+// --- Vision / ImageGen config (per-agent overrides) ---
+
+const (
+	ctxVisionConfig   toolContextKey = "tool_vision_config"
+	ctxImageGenConfig toolContextKey = "tool_imagegen_config"
+)
+
+func WithVisionConfig(ctx context.Context, cfg *config.VisionConfig) context.Context {
+	return context.WithValue(ctx, ctxVisionConfig, cfg)
+}
+
+func VisionConfigFromCtx(ctx context.Context) *config.VisionConfig {
+	v, _ := ctx.Value(ctxVisionConfig).(*config.VisionConfig)
+	return v
+}
+
+func WithImageGenConfig(ctx context.Context, cfg *config.ImageGenConfig) context.Context {
+	return context.WithValue(ctx, ctxImageGenConfig, cfg)
+}
+
+func ImageGenConfigFromCtx(ctx context.Context) *config.ImageGenConfig {
+	v, _ := ctx.Value(ctxImageGenConfig).(*config.ImageGenConfig)
+	return v
+}
@@ -0,0 +1,288 @@
+package tools
+
+import (
+	"bytes"
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/nextlevelbuilder/goclaw/internal/providers"
+)
+
+// credentialProvider is a narrow interface for providers that expose API credentials.
+type credentialProvider interface {
+	APIKey() string
+	APIBase() string
+}
+
+// imageGenProviderPriority is the default order for image generation providers.
+var imageGenProviderPriority = []string{"openrouter", "gemini", "openai"}
+
+// imageGenModelDefaults maps provider names to default image generation models.
+var imageGenModelDefaults = map[string]string{
+	"openrouter": "google/gemini-2.5-flash-image",
+	"openai":     "dall-e-3",
+	"gemini":     "gemini-2.0-flash-exp",
+}
+
+// CreateImageTool generates images using an image generation API.
+// The provider and model come from the per-agent ImageGenConfig, falling back to imageGenProviderPriority and imageGenModelDefaults.
+type CreateImageTool struct {
+	registry *providers.Registry
+}
+
+func NewCreateImageTool(registry *providers.Registry) *CreateImageTool {
+	return &CreateImageTool{registry: registry}
+}
+
+func (t *CreateImageTool) Name() string { return "create_image" }
+
+func (t *CreateImageTool) Description() string {
+	return "Generate an image from a text description using an image generation model. Returns a MEDIA: path to the generated image file."
+}
+
+func (t *CreateImageTool) Parameters() map[string]interface{} {
+	return map[string]interface{}{
+		"type": "object",
+		"properties": map[string]interface{}{
+			"prompt": map[string]interface{}{
+				"type":        "string",
+				"description": "Text description of the image to generate.",
+			},
+			"aspect_ratio": map[string]interface{}{
+				"type":        "string",
+				"description": "Aspect ratio: '1:1' (default), '3:4', '4:3', '9:16', '16:9'.",
+			},
+		},
+		"required": []string{"prompt"},
+	}
+}
+
+func (t *CreateImageTool) Execute(ctx context.Context, args map[string]interface{}) *Result {
+	prompt, _ := args["prompt"].(string)
+	if prompt == "" {
+		return ErrorResult("prompt is required")
+	}
+	aspectRatio, _ := args["aspect_ratio"].(string)
+	if aspectRatio == "" {
+		aspectRatio = "1:1"
+	}
+
+	// Resolve provider from per-agent config or defaults
+	providerName, model := t.resolveConfig(ctx)
+
+	p, err := t.registry.Get(providerName)
+	if err != nil {
+		return ErrorResult(fmt.Sprintf("image generation provider %q not available", providerName))
+	}
+
+	cp, ok := p.(credentialProvider)
+	if !ok {
+		return ErrorResult(fmt.Sprintf("provider %q does not expose API credentials for image generation", providerName))
+	}
+
+	slog.Info("create_image: calling image generation API",
+		"provider", providerName, "model", model, "aspect_ratio", aspectRatio)
+
+	imageBytes, usage, err := t.callImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, aspectRatio)
+	if err != nil {
+		return ErrorResult(fmt.Sprintf("image generation failed: %v", err))
+	}
+
+	// Save to temp file
+	imagePath := filepath.Join(os.TempDir(), fmt.Sprintf("goclaw_gen_%d.png", time.Now().UnixNano()))
+	if err := os.WriteFile(imagePath, imageBytes, 0644); err != nil {
+		return ErrorResult(fmt.Sprintf("failed to save generated image: %v", err))
+	}
+
+	result := &Result{ForLLM: fmt.Sprintf("MEDIA:%s", imagePath)}
+	result.Provider = providerName
+	result.Model = model
+	if usage != nil {
+		result.Usage = usage
+	}
+	return result
+}
+
+// resolveConfig returns the provider name and model to use for image generation.
+func (t *CreateImageTool) resolveConfig(ctx context.Context) (providerName, model string) {
+	// 1. Check per-agent ImageGenConfig from context
+	if cfg := ImageGenConfigFromCtx(ctx); cfg != nil {
+		if cfg.Provider != "" {
+			providerName = cfg.Provider
+		}
+		if cfg.Model != "" {
+			model = cfg.Model
+		}
+	}
+
+	// 2. If provider not set, find first available from priority list
+	if providerName == "" {
+		for _, name := range imageGenProviderPriority {
+			if _, err := t.registry.Get(name); err == nil {
+				providerName = name
+				break
+			}
+		}
+	}
+	if providerName == "" {
+		providerName = "openrouter" // fallback even if unavailable (error handled later)
+	}
+
+	// 3. If model not set, use default for this provider
+	if model == "" {
+		if m, ok := imageGenModelDefaults[providerName]; ok {
+			model = m
+		}
+	}
+
+	return providerName, model
+}
+
+// callImageGenAPI POSTs an OpenAI-compatible chat completions request with modalities ["image","text"] to apiBase + "/chat/completions" and returns what parseImageResponse extracts from the body.
+// NOTE(review): the /images/generations endpoint is never called here — confirm the "openai" / dall-e-3 default works through chat completions.
+func (t *CreateImageTool) callImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt, aspectRatio string) ([]byte, *providers.Usage, error) {
+	// OpenRouter / OpenAI-compat: use chat completions with modalities
+	body := map[string]interface{}{
+		"model": model,
+		"messages": []map[string]interface{}{
+			{"role": "user", "content": prompt},
+		},
+		"modalities": []string{"image", "text"},
+	}
+	if aspectRatio != "" && aspectRatio != "1:1" { // "1:1" is treated as the default and sent without image_config
+		body["image_config"] = map[string]interface{}{
+			"aspect_ratio": aspectRatio,
+		}
+	}
+
+	jsonBody, err := json.Marshal(body)
+	if err != nil {
+		return nil, nil, fmt.Errorf("marshal request: %w", err)
+	}
+
+	url := strings.TrimRight(apiBase, "/") + "/chat/completions"
+	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
+	if err != nil {
+		return nil, nil, fmt.Errorf("create request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+apiKey)
+
+	client := &http.Client{Timeout: 120 * time.Second}
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, nil, fmt.Errorf("http request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, nil, fmt.Errorf("read response: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
+	}
+
+	return t.parseImageResponse(respBody)
+}
+
+// parseImageResponse extracts the first decodable image from an
+// OpenAI-compatible chat-completions response body.
+//
+// Candidate data URLs are taken, in order, from choices[0].message.images
+// and then from "image_url" parts of choices[0].message.content when the
+// content is an array. The first candidate that decodes to a non-empty
+// payload is returned together with the response's usage block (nil when the
+// response has none).
+//
+// Errors: malformed JSON, an empty choices array, no candidates at all, or
+// candidates that all failed to decode (the last decode error is wrapped so
+// the cause is not lost).
+func (t *CreateImageTool) parseImageResponse(respBody []byte) ([]byte, *providers.Usage, error) {
+	var resp struct {
+		Choices []struct {
+			Message struct {
+				Content interface{} `json:"content"`
+				Images  []struct {
+					ImageURL struct {
+						URL string `json:"url"`
+					} `json:"image_url"`
+				} `json:"images"`
+			} `json:"message"`
+		} `json:"choices"`
+		Usage *struct {
+			PromptTokens     int `json:"prompt_tokens"`
+			CompletionTokens int `json:"completion_tokens"`
+			TotalTokens      int `json:"total_tokens"`
+		} `json:"usage"`
+	}
+
+	if err := json.Unmarshal(respBody, &resp); err != nil {
+		return nil, nil, fmt.Errorf("parse response: %w", err)
+	}
+
+	if len(resp.Choices) == 0 {
+		return nil, nil, fmt.Errorf("no choices in response")
+	}
+
+	msg := resp.Choices[0].Message
+
+	// Collect candidates in priority order: the images array first
+	// (OpenRouter format), then multipart content parts.
+	var candidates []string
+	for _, img := range msg.Images {
+		candidates = append(candidates, img.ImageURL.URL)
+	}
+	if parts, ok := msg.Content.([]interface{}); ok {
+		for _, part := range parts {
+			m, ok := part.(map[string]interface{})
+			if !ok || m["type"] != "image_url" {
+				continue
+			}
+			imgURL, ok := m["image_url"].(map[string]interface{})
+			if !ok {
+				continue
+			}
+			if u, ok := imgURL["url"].(string); ok {
+				candidates = append(candidates, u)
+			}
+		}
+	}
+
+	// Remember why candidates were rejected instead of discarding the error.
+	var lastErr error
+	for _, candidate := range candidates {
+		imageBytes, err := decodeDataURL(candidate)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		if len(imageBytes) == 0 {
+			// An empty payload is never a usable image.
+			lastErr = fmt.Errorf("empty image payload")
+			continue
+		}
+		return imageBytes, convertUsage(resp.Usage), nil
+	}
+
+	if lastErr != nil {
+		return nil, nil, fmt.Errorf("no decodable image in response (%d candidate(s)): %w", len(candidates), lastErr)
+	}
+	return nil, nil, fmt.Errorf("no image data found in response")
+}
+
+// decodeDataURL returns the raw bytes carried by a base64 data URL such as
+// "data:image/png;base64,iVBORw0KGgo...".
+//
+// Everything after the first ";base64," marker is decoded with standard
+// (padded) base64. An error is returned when the marker is absent or the
+// payload is not valid base64.
+func decodeDataURL(dataURL string) ([]byte, error) {
+	const marker = ";base64,"
+
+	_, payload, found := strings.Cut(dataURL, marker)
+	if !found {
+		return nil, fmt.Errorf("not a base64 data URL")
+	}
+	return base64.StdEncoding.DecodeString(payload)
+}
+
+// convertUsage copies the token counters of a decoded "usage" JSON object
+// into a providers.Usage. A nil input yields a nil result.
+func convertUsage(u *struct {
+	PromptTokens     int `json:"prompt_tokens"`
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}) *providers.Usage {
+	if u == nil {
+		return nil
+	}
+
+	usage := new(providers.Usage)
+	usage.PromptTokens = u.PromptTokens
+	usage.CompletionTokens = u.CompletionTokens
+	usage.TotalTokens = u.TotalTokens
+	return usage
+}
+
+// truncateBytes renders b as a string of at most max bytes, appending "..."
+// when anything was cut off.
+//
+// The cut never lands inside a multi-byte UTF-8 sequence: when the byte at the
+// cut position is a continuation byte, the cut moves back (by at most three
+// bytes, the longest possible run) to the start of that rune, so the result
+// may be slightly shorter than max. A negative max is treated as 0.
+func truncateBytes(b []byte, max int) string {
+	if max < 0 {
+		max = 0
+	}
+	if len(b) <= max {
+		return string(b)
+	}
+
+	cut := max
+	// 0b10xxxxxx marks a UTF-8 continuation byte; the back-off is bounded so
+	// non-text data cannot drag the cut arbitrarily far.
+	for steps := 0; steps < 3 && cut > 0 && b[cut]&0xC0 == 0x80; steps++ {
+		cut--
+	}
+	return string(b[:cut]) + "..."
+}
@@ -25,7 +25,7 @@ var toolGroups = map[string][]string{
 		"browser", "canvas", "nodes", "cron", "message", "gateway",
 		"agents_list", "sessions_list", "sessions_history", "sessions_send",
 		"sessions_spawn", "subagents", "session_status",
-		"memory_search", "memory_get", "web_search", "web_fetch", "image",
+		"memory_search", "memory_get", "web_search", "web_fetch", "read_image", "create_image",
 	},
 }

@@ -49,7 +49,7 @@ func UnregisterToolGroup(name string) {
 // Tool profiles define preset allow sets.
 var toolProfiles = map[string][]string{
 	"minimal":   {"session_status"},
-	"coding":    {"group:fs", "group:runtime", "group:sessions", "group:memory", "image"},
+	"coding":    {"group:fs", "group:runtime", "group:sessions", "group:memory", "read_image", "create_image"},
 	"messaging": {"group:messaging", "sessions_list", "sessions_history", "sessions_send", "session_status"},
 	"full":      {}, // empty = no restrictions
 }
@@ -0,0 +1,139 @@
+package tools
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+
+	"github.com/nextlevelbuilder/goclaw/internal/providers"
+)
+
+// --- Context helpers for media images ---
+
+// ctxMediaImages is the context key under which WithMediaImages stores the
+// image slice and from which MediaImagesFromCtx reads it back.
+const ctxMediaImages toolContextKey = "tool_media_images"
+
+// WithMediaImages returns a child of ctx that carries images, making them
+// available to the read_image tool through MediaImagesFromCtx.
+func WithMediaImages(ctx context.Context, images []providers.ImageContent) context.Context {
+	withImages := context.WithValue(ctx, ctxMediaImages, images)
+	return withImages
+}
+
+// MediaImagesFromCtx returns the images previously attached with
+// WithMediaImages, or nil when ctx carries none.
+func MediaImagesFromCtx(ctx context.Context) []providers.ImageContent {
+	if images, ok := ctx.Value(ctxMediaImages).([]providers.ImageContent); ok {
+		return images
+	}
+	return nil
+}
+
+// --- ReadImageTool ---
+
+var (
+	// visionProviderPriority lists provider names in the order they are
+	// tried when no vision provider is configured explicitly.
+	visionProviderPriority = []string{"gemini", "anthropic", "openrouter"}
+
+	// visionModelOverrides names the preferred vision model per provider.
+	// A provider without an entry uses its own default model.
+	visionModelOverrides = map[string]string{
+		"openrouter": "google/gemini-2.0-flash-001",
+	}
+)
+
+// ReadImageTool uses a vision-capable provider to describe images attached to the current message.
+// It is exposed to the model under the tool name "read_image".
+type ReadImageTool struct {
+	// registry is where the tool looks up the provider that performs the vision call.
+	registry *providers.Registry
+}
+
+// NewReadImageTool builds a ReadImageTool that resolves its vision provider
+// from registry.
+func NewReadImageTool(registry *providers.Registry) *ReadImageTool {
+	tool := new(ReadImageTool)
+	tool.registry = registry
+	return tool
+}
+
+// Name returns the identifier under which the tool is registered.
+func (t *ReadImageTool) Name() string {
+	const toolName = "read_image"
+	return toolName
+}
+
+// Description returns the model-facing explanation of when to call the tool.
+func (t *ReadImageTool) Description() string {
+	const description = "Analyze images attached to the current message using a vision model. Use this when you see <media:image> tags but cannot view images directly."
+	return description
+}
+
+// Parameters returns the JSON-schema-style description of the tool's
+// arguments: an object with one required string property, "prompt".
+func (t *ReadImageTool) Parameters() map[string]interface{} {
+	promptSchema := map[string]interface{}{
+		"type":        "string",
+		"description": "What you want to know about the image(s). E.g. 'Describe this image in detail' or 'What text is in this image?'",
+	}
+
+	schema := make(map[string]interface{}, 3)
+	schema["type"] = "object"
+	schema["properties"] = map[string]interface{}{"prompt": promptSchema}
+	schema["required"] = []string{"prompt"}
+	return schema
+}
+
+// Execute sends the images stored in ctx, together with the caller's prompt,
+// to a vision provider and returns the provider's textual answer.
+//
+// args["prompt"] is the question to ask; when it is missing, empty, or not a
+// string, a generic "describe" prompt is used. An error result is returned
+// when ctx carries no images, when no vision provider can be resolved, or
+// when the provider call fails. On success the result also records the token
+// usage, provider name and model of the call.
+func (t *ReadImageTool) Execute(ctx context.Context, args map[string]interface{}) *Result {
+	question := "Describe this image in detail."
+	if asked, ok := args["prompt"].(string); ok && asked != "" {
+		question = asked
+	}
+
+	attached := MediaImagesFromCtx(ctx)
+	if len(attached) == 0 {
+		return ErrorResult("No images available in this conversation. The user may not have sent an image.")
+	}
+
+	// Per-agent configuration takes precedence over the built-in priority list.
+	provider, model, err := t.resolveVisionProviderWithConfig(ctx)
+	if err != nil {
+		return ErrorResult(err.Error())
+	}
+
+	slog.Info("read_image: calling vision provider", "provider", provider.Name(), "model", model, "images", len(attached))
+
+	request := providers.ChatRequest{
+		Messages: []providers.Message{
+			{Role: "user", Content: question, Images: attached},
+		},
+		Model: model,
+		Options: map[string]interface{}{
+			"max_tokens":  1024,
+			"temperature": 0.3,
+		},
+	}
+	resp, err := provider.Chat(ctx, request)
+	if err != nil {
+		return ErrorResult(fmt.Sprintf("Vision provider error: %v", err))
+	}
+
+	out := NewResult(resp.Content)
+	out.Usage = resp.Usage
+	out.Provider = provider.Name()
+	out.Model = model
+	return out
+}
+
+// resolveVisionProviderWithConfig selects the provider and model for a vision call.
+//
+// When the per-agent VisionConfig in ctx names a provider, that provider is
+// used and an error is returned if the registry cannot resolve it. The model
+// is then chosen as: the configured model, else the provider's entry in
+// visionModelOverrides, else the provider's default model. Consulting the
+// overrides here keeps this path consistent with resolveVisionProvider, so a
+// configured provider without an explicit model still gets its preferred
+// vision model.
+//
+// When no provider is configured, the built-in priority list decides; a
+// model set in the config without a provider is not used.
+func (t *ReadImageTool) resolveVisionProviderWithConfig(ctx context.Context) (providers.Provider, string, error) {
+	cfg := VisionConfigFromCtx(ctx)
+	if cfg == nil || cfg.Provider == "" {
+		return t.resolveVisionProvider()
+	}
+
+	p, err := t.registry.Get(cfg.Provider)
+	if err != nil {
+		return nil, "", fmt.Errorf("configured vision provider %q not available: %w", cfg.Provider, err)
+	}
+
+	model := cfg.Model
+	if model == "" {
+		model = p.DefaultModel()
+		if override, ok := visionModelOverrides[cfg.Provider]; ok {
+			model = override
+		}
+	}
+	return p, model, nil
+}
+
+// resolveVisionProvider walks visionProviderPriority and returns the first
+// provider the registry can resolve, paired with its entry in
+// visionModelOverrides or, lacking one, the provider's default model.
+// An error listing the candidate names is returned when none is registered.
+func (t *ReadImageTool) resolveVisionProvider() (providers.Provider, string, error) {
+	for _, candidate := range visionProviderPriority {
+		p, err := t.registry.Get(candidate)
+		if err == nil {
+			chosen := p.DefaultModel()
+			if preferred, found := visionModelOverrides[candidate]; found {
+				chosen = preferred
+			}
+			return p, chosen, nil
+		}
+	}
+	return nil, "", fmt.Errorf("no vision-capable provider available (need one of: %v)", visionProviderPriority)
+}
@@ -1,5 +1,7 @@
 package tools

+import "github.com/nextlevelbuilder/goclaw/internal/providers"
+
 // Result is the unified return type from tool execution.
 type Result struct {
 	ForLLM  string `json:"for_llm"`            // content sent to the LLM
@@ -8,6 +10,12 @@ type Result struct {
 	IsError bool   `json:"is_error"`            // marks error
 	Async   bool   `json:"async"`               // running asynchronously
 	Err     error  `json:"-"`                   // internal error (not serialized)
+
+	// Usage holds token usage from tools that make internal LLM calls (e.g. read_image).
+	// When set, the agent loop records these on the tool span for tracing.
+	Usage    *providers.Usage `json:"-"`
+	Provider string           `json:"-"` // provider name (for tool span metadata)
+	Model    string           `json:"-"` // model used (for tool span metadata)
 }

 func NewResult(forLLM string) *Result {
@@ -255,7 +255,7 @@ function SpanTreeNode({ node, depth }: { node: SpanNode; depth: number }) {
            {span.input_preview && (
              <div>
                <p className="text-xs text-muted-foreground">Input:</p>
-                <pre className="mt-1 max-h-[40vh] overflow-y-auto whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
+                <pre className="mt-1 max-h-[40vh] overflow-y-auto break-all whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
                  {span.input_preview}
                </pre>
              </div>
@@ -263,7 +263,7 @@ function SpanTreeNode({ node, depth }: { node: SpanNode; depth: number }) {
            {span.output_preview && (
              <div>
                <p className="text-xs text-muted-foreground">Output:</p>
-                <pre className="mt-1 max-h-[40vh] overflow-y-auto whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
+                <pre className="mt-1 max-h-[40vh] overflow-y-auto break-all whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
                  {span.output_preview}
                </pre>
              </div>