feat: Implement vision capabilities and image generation tools, adding media handling, dedicated configurations, and trace optimization for image data.

This commit is contained in:
viettranx
2026-02-26 22:28:27 +07:00
parent 112fddb401
commit d5cc5a745d
19 changed files with 865 additions and 46 deletions
+4
View File
@@ -182,6 +182,10 @@ func runGateway() {
toolsReg.Register(webFetchTool)
slog.Info("web_fetch tool enabled")
// Vision fallback tool (for non-vision providers like MiniMax)
toolsReg.Register(tools.NewReadImageTool(providerRegistry))
toolsReg.Register(tools.NewCreateImageTool(providerRegistry))
// TTS (text-to-speech) system
ttsMgr := setupTTS(cfg)
if ttsMgr != nil {
+19 -2
View File
@@ -150,6 +150,7 @@ func consumeInboundMessages(ctx context.Context, msgBus *bus.MessageBus, agents
outCh := sched.ScheduleWithOpts(ctx, "main", agent.RunRequest{
SessionKey: sessionKey,
Message: msg.Content,
Media: msg.Media,
Channel: msg.Channel,
ChatID: msg.ChatID,
PeerKind: peerKind,
@@ -225,12 +226,28 @@ func consumeInboundMessages(ctx context.Context, msgBus *bus.MessageBus, agents
}
// Publish response back to the channel
msgBus.PublishOutbound(bus.OutboundMessage{
outMsg := bus.OutboundMessage{
Channel: channel,
ChatID: chatID,
Content: outcome.Result.Content,
Metadata: meta,
})
}
// Convert media results from agent run to outbound media attachments
for _, mr := range outcome.Result.Media {
outMsg.Media = append(outMsg.Media, bus.MediaAttachment{
URL: mr.Path,
ContentType: mr.ContentType,
})
if mr.AsVoice {
if outMsg.Metadata == nil {
outMsg.Metadata = make(map[string]string)
}
outMsg.Metadata["audio_as_voice"] = "true"
}
}
msgBus.PublishOutbound(outMsg)
}(msg.Channel, msg.ChatID, sessionKey, runID, outMeta)
}
+9 -2
View File
@@ -3,6 +3,7 @@ package cmd
import (
"context"
"log/slog"
"strings"
"github.com/nextlevelbuilder/goclaw/internal/config"
"github.com/nextlevelbuilder/goclaw/internal/providers"
@@ -51,7 +52,8 @@ func registerProviders(registry *providers.Registry, cfg *config.Config) {
}
if cfg.Providers.MiniMax.APIKey != "" {
registry.Register(providers.NewOpenAIProvider("minimax", cfg.Providers.MiniMax.APIKey, "https://api.minimax.io/v1", "MiniMax-M2.5"))
registry.Register(providers.NewOpenAIProvider("minimax", cfg.Providers.MiniMax.APIKey, "https://api.minimax.io/v1", "MiniMax-M2.5").
WithChatPath("/text/chatcompletion_v2"))
slog.Info("registered provider", "name", "minimax")
}
@@ -82,7 +84,12 @@ func registerProvidersFromDB(registry *providers.Registry, provStore store.Provi
if p.ProviderType == "anthropic_native" {
registry.Register(providers.NewAnthropicProvider(p.APIKey))
} else {
registry.Register(providers.NewOpenAIProvider(p.Name, p.APIKey, p.APIBase, ""))
prov := providers.NewOpenAIProvider(p.Name, p.APIKey, p.APIBase, "")
// MiniMax native API uses a different chat path for vision support.
if p.Name == "minimax" && strings.Contains(p.APIBase, "minimax.io") {
prov.WithChatPath("/text/chatcompletion_v2")
}
registry.Register(prov)
}
slog.Info("registered provider from DB", "name", p.Name)
}
+114 -8
View File
@@ -220,6 +220,7 @@ func NewLoop(cfg LoopConfig) *Loop {
type RunRequest struct {
SessionKey string // composite key: agent:{agentId}:{channel}:{peerKind}:{chatId}
Message string // user message
Media []string // local file paths to images (already sanitized)
Channel string // source channel
ChatID string // source chat ID
PeerKind string // "direct" or "group" (for session key building and tool context)
@@ -235,10 +236,18 @@ type RunRequest struct {
// RunResult is the output of a completed agent run.
type RunResult struct {
Content string `json:"content"`
RunID string `json:"runId"`
Iterations int `json:"iterations"`
Content string `json:"content"`
RunID string `json:"runId"`
Iterations int `json:"iterations"`
Usage *providers.Usage `json:"usage,omitempty"`
Media []MediaResult `json:"media,omitempty"` // media files from tool results (MEDIA: prefix)
}
// MediaResult represents a media file produced by a tool during the agent run.
type MediaResult struct {
Path string `json:"path"` // local file path
ContentType string `json:"content_type,omitempty"` // MIME type
AsVoice bool `json:"as_voice,omitempty"` // send as voice message (Telegram OGG)
}
// Run processes a single message through the agent loop.
@@ -351,6 +360,15 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
if req.SenderID != "" {
ctx = store.WithSenderID(ctx, req.SenderID)
}
// Inject per-agent vision/imagegen config for read_image/create_image tools
if l.agentToolPolicy != nil {
if l.agentToolPolicy.Vision != nil {
ctx = tools.WithVisionConfig(ctx, l.agentToolPolicy.Vision)
}
if l.agentToolPolicy.ImageGen != nil {
ctx = tools.WithImageGenConfig(ctx, l.agentToolPolicy.ImageGen)
}
}
// Per-user workspace isolation.
// Each user gets a subdirectory within the agent's workspace.
@@ -430,19 +448,37 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
// (hadBootstrap) — no extra DB roundtrip needed for bootstrap detection.
messages, hadBootstrap := l.buildMessages(ctx, history, summary, req.Message, req.ExtraSystemPrompt, req.SessionKey, req.Channel, req.UserID, req.HistoryLimit)
// 2. Buffer new messages — write to session only AFTER the run completes.
// 2. Attach vision images to the current user message (last in messages slice).
// Images are only attached to the live request, NOT persisted in session history.
if len(req.Media) > 0 {
if images := loadImages(req.Media); len(images) > 0 {
messages[len(messages)-1].Images = images
ctx = tools.WithMediaImages(ctx, images) // make images available to read_image tool
slog.Info("vision: attached images to user message", "count", len(images), "agent", l.id, "session", req.SessionKey)
}
// Clean up temp media files — they're now base64-encoded in memory.
for _, p := range req.Media {
if err := os.Remove(p); err != nil {
slog.Debug("vision: failed to clean temp media file", "path", p, "error", err)
}
}
}
// 3. Buffer new messages — write to session only AFTER the run completes.
// This prevents concurrent runs from seeing each other's in-progress messages.
// NOTE: pendingMsgs stores TEXT ONLY (no images) to avoid bloating session storage.
var pendingMsgs []providers.Message
pendingMsgs = append(pendingMsgs, providers.Message{
Role: "user",
Content: req.Message,
})
// 3. Run LLM iteration loop
// 4. Run LLM iteration loop
var totalUsage providers.Usage
iteration := 0
var finalContent string
var asyncToolCalls []string // track async spawn tool names for fallback
var asyncToolCalls []string // track async spawn tool names for fallback
var mediaResults []MediaResult // media files from tool MEDIA: results
for iteration < l.maxIterations {
iteration++
@@ -533,7 +569,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
toolSpanStart := time.Now().UTC()
result := l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)
l.emitToolSpan(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON), result.ForLLM, result.IsError)
l.emitToolSpan(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON), result)
if result.Async {
asyncToolCalls = append(asyncToolCalls, tc.Name)
@@ -558,6 +594,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
},
})
// Collect MEDIA: paths from tool results
if mr := parseMediaResult(result.ForLLM); mr != nil {
mediaResults = append(mediaResults, *mr)
}
toolMsg := providers.Message{
Role: "tool",
Content: result.ForLLM,
@@ -619,7 +660,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
// 5. Process results sequentially: emit events, append messages, save to session
for _, r := range collected {
l.emitToolSpan(ctx, r.spanStart, r.tc.Name, r.tc.ID, r.argsJSON, r.result.ForLLM, r.result.IsError)
l.emitToolSpan(ctx, r.spanStart, r.tc.Name, r.tc.ID, r.argsJSON, r.result)
if r.result.Async {
asyncToolCalls = append(asyncToolCalls, r.tc.Name)
@@ -644,6 +685,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
},
})
// Collect MEDIA: paths from tool results
if mr := parseMediaResult(r.result.ForLLM); mr != nil {
mediaResults = append(mediaResults, *mr)
}
toolMsg := providers.Message{
Role: "tool",
Content: r.result.ForLLM,
@@ -723,9 +769,69 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
RunID: req.RunID,
Iterations: iteration,
Usage: &totalUsage,
Media: mediaResults,
}, nil
}
// parseMediaResult extracts a MediaResult from a tool result string.
//
// A media reference is a line that, once surrounding whitespace is trimmed,
// starts with "MEDIA:"; the rest of that line is the file path. An
// "[[audio_as_voice]]" tag anywhere in the output marks the file as a voice
// message. Typical shapes are "MEDIA:/path/to/file" and
// "[[audio_as_voice]]\nMEDIA:/path/to/file".
//
// Matching is deliberately line-anchored: "MEDIA:" appearing in the middle of
// arbitrary tool text is not treated as an attachment, and a bare "MEDIA:"
// never borrows its path from the following line. The first line carrying a
// non-empty path wins. Returns nil when there is no such line.
//
// NOTE(review): tool output can contain untrusted text (e.g. fetched web
// content). Callers that send or delete the returned path should additionally
// verify it points at a file the tool actually produced.
func parseMediaResult(toolOutput string) *MediaResult {
	const (
		voiceTag    = "[[audio_as_voice]]"
		mediaPrefix = "MEDIA:"
	)

	s := toolOutput
	asVoice := strings.Contains(s, voiceTag)
	if asVoice {
		// Drop the tag first so "[[audio_as_voice]] MEDIA:/x" on a single line
		// still leaves the prefix at the start of its (trimmed) line.
		s = strings.ReplaceAll(s, voiceTag, "")
	}

	for _, line := range strings.Split(s, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, mediaPrefix) {
			continue
		}
		path := strings.TrimSpace(line[len(mediaPrefix):])
		if path == "" {
			// Prefix with nothing after it on the same line: not a reference.
			continue
		}
		return &MediaResult{
			Path:        path,
			ContentType: mimeFromExt(filepath.Ext(path)),
			AsVoice:     asVoice,
		}
	}
	return nil
}
// mediaMIMEByExt maps lower-case file extensions (leading dot included) to the
// MIME type used for media attachments.
var mediaMIMEByExt = map[string]string{
	".png":  "image/png",
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".gif":  "image/gif",
	".webp": "image/webp",
	".mp4":  "video/mp4",
	".ogg":  "audio/ogg",
	".opus": "audio/ogg",
	".mp3":  "audio/mpeg",
	".wav":  "audio/wav",
}

// mimeFromExt reports the MIME type for a media file extension such as ".png".
// The comparison ignores case; any extension that is not in the table maps to
// "application/octet-stream".
func mimeFromExt(ext string) string {
	if mimeType, known := mediaMIMEByExt[strings.ToLower(ext)]; known {
		return mimeType
	}
	return "application/octet-stream"
}
// sanitizePathSegment makes a userID safe for use as a directory name.
// Replaces colons, spaces, and other unsafe chars with underscores.
func sanitizePathSegment(s string) string {
+37 -6
View File
@@ -12,6 +12,7 @@ import (
"github.com/nextlevelbuilder/goclaw/internal/providers"
"github.com/nextlevelbuilder/goclaw/internal/store"
"github.com/nextlevelbuilder/goclaw/internal/tools"
"github.com/nextlevelbuilder/goclaw/internal/tracing"
)
@@ -61,10 +62,22 @@ func (l *Loop) emitLLMSpan(ctx context.Context, start time.Time, iteration int,
span.AgentID = &l.agentUUID
}
// Verbose mode: serialize full messages and output
// Verbose mode: serialize full messages and output.
// Strip base64 image data to avoid bloating traces and PostgreSQL encoding issues.
verbose := collector.Verbose()
if verbose && len(messages) > 0 {
if b, err := json.Marshal(messages); err == nil {
stripped := make([]providers.Message, len(messages))
copy(stripped, messages)
for i := range stripped {
if len(stripped[i].Images) > 0 {
placeholder := make([]providers.ImageContent, len(stripped[i].Images))
for j, img := range stripped[i].Images {
placeholder[j] = providers.ImageContent{MimeType: img.MimeType, Data: fmt.Sprintf("[base64 %s, %d bytes]", img.MimeType, len(img.Data))}
}
stripped[i].Images = placeholder
}
}
if b, err := json.Marshal(stripped); err == nil {
span.InputPreview = truncateStr(string(b), 100000)
}
}
@@ -98,7 +111,8 @@ func (l *Loop) emitLLMSpan(ctx context.Context, start time.Time, iteration int,
}
// emitToolSpan records a tool call span if tracing is active.
func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, toolCallID, input, output string, isError bool) {
// result is the full tool execution result, which may contain Usage from inner LLM calls.
func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, toolCallID, input string, result *tools.Result) {
traceID := tracing.TraceIDFromContext(ctx)
collector := tracing.CollectorFromContext(ctx)
if collector == nil || traceID == uuid.Nil {
@@ -121,7 +135,7 @@ func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, tool
ToolName: toolName,
ToolCallID: toolCallID,
InputPreview: truncateStr(input, previewLimit),
OutputPreview: truncateStr(output, previewLimit),
OutputPreview: truncateStr(result.ForLLM, previewLimit),
Status: store.SpanStatusCompleted,
Level: "DEFAULT",
CreatedAt: now,
@@ -132,9 +146,26 @@ func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, tool
if l.agentUUID != uuid.Nil {
span.AgentID = &l.agentUUID
}
if isError {
if result.IsError {
span.Status = store.SpanStatusError
span.Error = truncateStr(output, 200)
span.Error = truncateStr(result.ForLLM, 200)
}
// Record token usage from tools that make internal LLM calls (e.g. read_image).
if result.Usage != nil {
span.InputTokens = result.Usage.PromptTokens
span.OutputTokens = result.Usage.CompletionTokens
span.Provider = result.Provider
span.Model = result.Model
if result.Usage.CacheCreationTokens > 0 || result.Usage.CacheReadTokens > 0 {
meta := map[string]int{
"cache_creation_tokens": result.Usage.CacheCreationTokens,
"cache_read_tokens": result.Usage.CacheReadTokens,
}
if b, err := json.Marshal(meta); err == nil {
span.Metadata = b
}
}
}
collector.EmitSpan(span)
+62
View File
@@ -0,0 +1,62 @@
package agent
import (
"encoding/base64"
"log/slog"
"os"
"path/filepath"
"strings"
"github.com/nextlevelbuilder/goclaw/internal/providers"
)
// maxImageBytes is the safety limit for reading image files (10MB).
const maxImageBytes = 10 * 1024 * 1024
// loadImages reads local image files and returns them as base64-encoded
// providers.ImageContent values, preserving the order of paths.
//
// Paths whose extension is not a supported image type are skipped silently.
// Files that cannot be inspected or read, and files larger than maxImageBytes,
// are skipped with a warning log. The result is nil when nothing was loaded.
func loadImages(paths []string) []providers.ImageContent {
	if len(paths) == 0 {
		return nil
	}

	var images []providers.ImageContent
	for _, p := range paths {
		mime := inferImageMime(p)
		if mime == "" {
			continue
		}

		// Check the size on disk first so an oversized file is rejected
		// without pulling its entire content into memory.
		info, err := os.Stat(p)
		if err != nil {
			slog.Warn("vision: failed to read image file", "path", p, "error", err)
			continue
		}
		if info.Size() > maxImageBytes {
			slog.Warn("vision: image file too large, skipping", "path", p, "size", info.Size())
			continue
		}

		data, err := os.ReadFile(p)
		if err != nil {
			slog.Warn("vision: failed to read image file", "path", p, "error", err)
			continue
		}
		// Re-check after reading: the file may have grown since the stat.
		if len(data) > maxImageBytes {
			slog.Warn("vision: image file too large, skipping", "path", p, "size", len(data))
			continue
		}

		images = append(images, providers.ImageContent{
			MimeType: mime,
			Data:     base64.StdEncoding.EncodeToString(data),
		})
	}
	return images
}
// inferImageMime maps a file path to an image MIME type based solely on its
// extension (case-insensitive). It returns "" for anything that is not one of
// the supported image extensions: .jpg, .jpeg, .png, .gif, .webp.
func inferImageMime(path string) string {
	ext := strings.ToLower(filepath.Ext(path))
	for _, known := range [...]struct{ ext, mime string }{
		{".jpg", "image/jpeg"},
		{".jpeg", "image/jpeg"},
		{".png", "image/png"},
		{".gif", "image/gif"},
		{".webp", "image/webp"},
	} {
		if ext == known.ext {
			return known.mime
		}
	}
	return ""
}
+26 -2
View File
@@ -53,7 +53,10 @@ func SanitizeAssistantContent(content string) string {
// 6. Collapse consecutive duplicate blocks
content = collapseConsecutiveDuplicateBlocks(content)
// 7. Strip leading blank lines (preserve indentation)
// 7. Strip MEDIA: paths from LLM output (media delivered separately)
content = stripMediaPaths(content)
// 8. Strip leading blank lines (preserve indentation)
content = stripLeadingBlankLines(content)
content = strings.TrimSpace(content)
@@ -277,7 +280,28 @@ func collapseConsecutiveDuplicateBlocks(content string) string {
return collapsed
}
// --- 7. Strip leading blank lines ---
// --- 7. Strip MEDIA: paths ---
// stripMediaPaths drops tool-result artifacts from LLM output: every line
// that, ignoring surrounding whitespace, starts with "MEDIA:" or with
// "[[audio_as_voice]]". Media is delivered separately via
// OutboundMessage.Media, so these markers must not reach user-facing text.
//
// Content without any "MEDIA:" substring is returned untouched (including its
// whitespace); otherwise the remaining lines are re-joined and the result is
// trimmed.
func stripMediaPaths(content string) string {
	if !strings.Contains(content, "MEDIA:") {
		return content
	}

	lines := strings.Split(content, "\n")
	// Filter in place: the write index never overtakes the read index.
	kept := lines[:0]
	for _, line := range lines {
		probe := strings.TrimSpace(line)
		isArtifact := strings.HasPrefix(probe, "MEDIA:") ||
			strings.HasPrefix(probe, "[[audio_as_voice]]")
		if !isArtifact {
			kept = append(kept, line)
		}
	}
	return strings.TrimSpace(strings.Join(kept, "\n"))
}
// --- 8. Strip leading blank lines ---
var leadingBlankLinesPattern = regexp.MustCompile(`^(?:[ \t]*\r?\n)+`)
+11
View File
@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"log/slog"
"os"
"sync"
"github.com/nextlevelbuilder/goclaw/internal/bus"
@@ -130,6 +131,16 @@ func (m *Manager) dispatchOutbound(ctx context.Context) {
"error", err,
)
}
// Clean up temporary media files after successful (or failed) send.
// Files are created by tools (create_image, tts) and only needed for the send.
for _, media := range msg.Media {
if media.URL != "" {
if err := os.Remove(media.URL); err != nil {
slog.Debug("failed to clean up media file", "path", media.URL, "error", err)
}
}
}
}
}
}
+30 -11
View File
@@ -358,30 +358,49 @@ func (c *Channel) handleMessage(ctx context.Context, update telego.Update) {
}
// detectMention checks if a Telegram message mentions the bot.
// Checks both msg.Text/Entities (text messages) and msg.Caption/CaptionEntities (photo/media messages).
func (c *Channel) detectMention(msg *telego.Message, botUsername string) bool {
if botUsername == "" {
return false
}
lowerBot := strings.ToLower(botUsername)
for _, entity := range msg.Entities {
if entity.Type == "mention" && msg.Text != "" {
mentioned := msg.Text[entity.Offset : entity.Offset+entity.Length]
if strings.EqualFold(mentioned, "@"+botUsername) {
return true
}
// Check both text entities and caption entities (photos use Caption, not Text).
for _, pair := range []struct {
entities []telego.MessageEntity
text string
}{
{msg.Entities, msg.Text},
{msg.CaptionEntities, msg.Caption},
} {
if pair.text == "" {
continue
}
if entity.Type == "bot_command" && msg.Text != "" {
cmdText := msg.Text[entity.Offset : entity.Offset+entity.Length]
if strings.Contains(strings.ToLower(cmdText), "@"+strings.ToLower(botUsername)) {
return true
for _, entity := range pair.entities {
if entity.Type == "mention" {
mentioned := pair.text[entity.Offset : entity.Offset+entity.Length]
if strings.EqualFold(mentioned, "@"+botUsername) {
return true
}
}
if entity.Type == "bot_command" {
cmdText := pair.text[entity.Offset : entity.Offset+entity.Length]
if strings.Contains(strings.ToLower(cmdText), "@"+lowerBot) {
return true
}
}
}
}
if msg.Text != "" && strings.Contains(strings.ToLower(msg.Text), "@"+strings.ToLower(botUsername)) {
// Fallback: substring check in both text and caption
if msg.Text != "" && strings.Contains(strings.ToLower(msg.Text), "@"+lowerBot) {
return true
}
if msg.Caption != "" && strings.Contains(strings.ToLower(msg.Caption), "@"+lowerBot) {
return true
}
// Reply to bot's message = implicit mention
if msg.ReplyToMessage != nil && msg.ReplyToMessage.From != nil {
if msg.ReplyToMessage.From.Username == botUsername {
return true
+16
View File
@@ -186,6 +186,22 @@ type ToolPolicySpec struct {
Deny []string `json:"deny,omitempty"`
AlsoAllow []string `json:"alsoAllow,omitempty"`
ByProvider map[string]*ToolPolicySpec `json:"byProvider,omitempty"`
Vision *VisionConfig `json:"vision,omitempty"` // per-agent vision provider/model override
ImageGen *ImageGenConfig `json:"imageGen,omitempty"` // per-agent image generation config
}
// VisionConfig configures the provider and model for vision tools (read_image).
type VisionConfig struct {
Provider string `json:"provider,omitempty"` // e.g. "gemini", "anthropic"
Model string `json:"model,omitempty"` // e.g. "gemini-2.0-flash"
}
// ImageGenConfig configures the provider and model for image generation (create_image).
type ImageGenConfig struct {
Provider string `json:"provider,omitempty"` // provider with image gen API (e.g. "openrouter")
Model string `json:"model,omitempty"` // e.g. "google/gemini-2.5-flash-image-preview"
Size string `json:"size,omitempty"` // default aspect ratio / size
Quality string `json:"quality,omitempty"` // "standard" or "hd"
}
type WebToolsConfig struct {
+28 -4
View File
@@ -220,10 +220,34 @@ func (p *AnthropicProvider) buildRequestBody(model string, req ChatRequest, stre
})
case "user":
messages = append(messages, map[string]interface{}{
"role": "user",
"content": msg.Content,
})
if len(msg.Images) > 0 {
var blocks []map[string]interface{}
for _, img := range msg.Images {
blocks = append(blocks, map[string]interface{}{
"type": "image",
"source": map[string]interface{}{
"type": "base64",
"media_type": img.MimeType,
"data": img.Data,
},
})
}
if msg.Content != "" {
blocks = append(blocks, map[string]interface{}{
"type": "text",
"text": msg.Content,
})
}
messages = append(messages, map[string]interface{}{
"role": "user",
"content": blocks,
})
} else {
messages = append(messages, map[string]interface{}{
"role": "user",
"content": msg.Content,
})
}
case "assistant":
var blocks []map[string]interface{}
+29 -2
View File
@@ -18,6 +18,7 @@ type OpenAIProvider struct {
name string
apiKey string
apiBase string
chatPath string // defaults to "/chat/completions"
defaultModel string
client *http.Client
retryConfig RetryConfig
@@ -33,14 +34,23 @@ func NewOpenAIProvider(name, apiKey, apiBase, defaultModel string) *OpenAIProvid
name: name,
apiKey: apiKey,
apiBase: apiBase,
chatPath: "/chat/completions",
defaultModel: defaultModel,
client: &http.Client{Timeout: 120 * time.Second},
retryConfig: DefaultRetryConfig(),
}
}
// WithChatPath sets a custom chat completions path (e.g. "/text/chatcompletion_v2" for the MiniMax native API)
// and returns the same provider so the call can be chained onto a constructor.
// It mutates the receiver in place — no copy is made — so the return value may be ignored.
func (p *OpenAIProvider) WithChatPath(path string) *OpenAIProvider {
p.chatPath = path
return p
}
func (p *OpenAIProvider) Name() string { return p.name }
func (p *OpenAIProvider) DefaultModel() string { return p.defaultModel }
func (p *OpenAIProvider) APIKey() string { return p.apiKey }
func (p *OpenAIProvider) APIBase() string { return p.apiBase }
// resolveModel returns the model ID to use for a request.
// For OpenRouter, model IDs require a provider prefix (e.g. "anthropic/claude-sonnet-4-5-20250929").
@@ -184,7 +194,24 @@ func (p *OpenAIProvider) buildRequestBody(model string, req ChatRequest, stream
// Include content; omit empty content for assistant messages with tool_calls
// (Gemini rejects empty content → "must include at least one parts field").
if m.Content != "" || len(m.ToolCalls) == 0 {
if m.Role == "user" && len(m.Images) > 0 {
var parts []map[string]interface{}
for _, img := range m.Images {
parts = append(parts, map[string]interface{}{
"type": "image_url",
"image_url": map[string]interface{}{
"url": fmt.Sprintf("data:%s;base64,%s", img.MimeType, img.Data),
},
})
}
if m.Content != "" {
parts = append(parts, map[string]interface{}{
"type": "text",
"text": m.Content,
})
}
msg["content"] = parts
} else if m.Content != "" || len(m.ToolCalls) == 0 {
msg["content"] = m.Content
}
@@ -247,7 +274,7 @@ func (p *OpenAIProvider) doRequest(ctx context.Context, body interface{}) (io.Re
return nil, fmt.Errorf("%s: marshal request: %w", p.name, err)
}
httpReq, err := http.NewRequestWithContext(ctx, "POST", p.apiBase+"/chat/completions", bytes.NewReader(data))
httpReq, err := http.NewRequestWithContext(ctx, "POST", p.apiBase+p.chatPath, bytes.NewReader(data))
if err != nil {
return nil, fmt.Errorf("%s: create request: %w", p.name, err)
}
+11 -4
View File
@@ -42,12 +42,19 @@ type StreamChunk struct {
Done bool `json:"done,omitempty"`
}
// ImageContent represents a base64-encoded image for vision-capable models.
type ImageContent struct {
MimeType string `json:"mime_type"` // e.g. "image/jpeg"
Data string `json:"data"` // base64-encoded image bytes
}
// Message represents a conversation message.
type Message struct {
Role string `json:"role"` // "system", "user", "assistant", "tool"
Content string `json:"content"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"` // for role="tool" responses
Role string `json:"role"` // "system", "user", "assistant", "tool"
Content string `json:"content"`
Images []ImageContent `json:"images,omitempty"` // vision: base64 images
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"` // for role="tool" responses
}
// ToolCall represents a tool invocation requested by the LLM.
+30 -1
View File
@@ -1,6 +1,10 @@
package tools
import "context"
import (
"context"
"github.com/nextlevelbuilder/goclaw/internal/config"
)
// Tool execution context keys.
// These replace mutable setter fields on tool instances, making tools thread-safe
@@ -71,3 +75,28 @@ func ToolWorkspaceFromCtx(ctx context.Context) string {
v, _ := ctx.Value(ctxWorkspace).(string)
return v
}
// --- Vision / ImageGen config (per-agent overrides) ---
// Context keys under which the per-agent vision and image generation
// configurations are stored.
const (
ctxVisionConfig toolContextKey = "tool_vision_config"
ctxImageGenConfig toolContextKey = "tool_imagegen_config"
)
// WithVisionConfig returns a child context carrying cfg as the vision configuration.
func WithVisionConfig(ctx context.Context, cfg *config.VisionConfig) context.Context {
return context.WithValue(ctx, ctxVisionConfig, cfg)
}
// VisionConfigFromCtx returns the vision configuration stored in ctx,
// or nil when none was attached.
func VisionConfigFromCtx(ctx context.Context) *config.VisionConfig {
v, _ := ctx.Value(ctxVisionConfig).(*config.VisionConfig)
return v
}
// WithImageGenConfig returns a child context carrying cfg as the image generation configuration.
func WithImageGenConfig(ctx context.Context, cfg *config.ImageGenConfig) context.Context {
return context.WithValue(ctx, ctxImageGenConfig, cfg)
}
// ImageGenConfigFromCtx returns the image generation configuration stored in ctx,
// or nil when none was attached.
func ImageGenConfigFromCtx(ctx context.Context) *config.ImageGenConfig {
v, _ := ctx.Value(ctxImageGenConfig).(*config.ImageGenConfig)
return v
}
+288
View File
@@ -0,0 +1,288 @@
package tools
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/nextlevelbuilder/goclaw/internal/providers"
)
// credentialProvider is a narrow interface for providers that expose API credentials.
type credentialProvider interface {
APIKey() string
APIBase() string
}
// imageGenProviderPriority is the default order for image generation providers.
var imageGenProviderPriority = []string{"openrouter", "gemini", "openai"}
// imageGenModelDefaults maps provider names to default image generation models.
var imageGenModelDefaults = map[string]string{
"openrouter": "google/gemini-2.5-flash-image",
"openai": "dall-e-3",
"gemini": "gemini-2.0-flash-exp",
}
// CreateImageTool generates images using an image generation API.
// Uses OpenRouter (Gemini image model) or OpenAI (DALL-E) via per-agent ImageGenConfig.
type CreateImageTool struct {
registry *providers.Registry
}
func NewCreateImageTool(registry *providers.Registry) *CreateImageTool {
return &CreateImageTool{registry: registry}
}
func (t *CreateImageTool) Name() string { return "create_image" }
func (t *CreateImageTool) Description() string {
return "Generate an image from a text description using an image generation model. Returns a MEDIA: path to the generated image file."
}
func (t *CreateImageTool) Parameters() map[string]interface{} {
return map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
"prompt": map[string]interface{}{
"type": "string",
"description": "Text description of the image to generate.",
},
"aspect_ratio": map[string]interface{}{
"type": "string",
"description": "Aspect ratio: '1:1' (default), '3:4', '4:3', '9:16', '16:9'.",
},
},
"required": []string{"prompt"},
}
}
// Execute generates an image from args["prompt"] and returns a Result whose
// ForLLM text is "MEDIA:<path>", pointing at the saved image file.
//
// Recognized arguments:
//   - "prompt" (string, required): description of the image to generate.
//   - "aspect_ratio" (string, optional): defaults to "1:1".
//
// The provider and model are chosen by resolveConfig. Every failure (missing
// prompt, unavailable provider, no model, API or file error) is reported as an
// ErrorResult. On success the Result also records the provider, the model and,
// when the API reported it, token usage.
func (t *CreateImageTool) Execute(ctx context.Context, args map[string]interface{}) *Result {
	prompt, _ := args["prompt"].(string)
	if prompt == "" {
		return ErrorResult("prompt is required")
	}
	aspectRatio, _ := args["aspect_ratio"].(string)
	if aspectRatio == "" {
		aspectRatio = "1:1"
	}

	// Resolve provider from per-agent config or defaults.
	providerName, model := t.resolveConfig(ctx)
	p, err := t.registry.Get(providerName)
	if err != nil {
		return ErrorResult(fmt.Sprintf("image generation provider %q not available: %v", providerName, err))
	}
	cp, ok := p.(credentialProvider)
	if !ok {
		return ErrorResult(fmt.Sprintf("provider %q does not expose API credentials for image generation", providerName))
	}
	// A provider without a built-in default and without a configured model
	// would otherwise send an empty "model" and fail with an opaque API error.
	if model == "" {
		return ErrorResult(fmt.Sprintf("no image generation model configured for provider %q", providerName))
	}

	slog.Info("create_image: calling image generation API",
		"provider", providerName, "model", model, "aspect_ratio", aspectRatio)

	imageBytes, usage, err := t.callImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, aspectRatio)
	if err != nil {
		return ErrorResult(fmt.Sprintf("image generation failed: %v", err))
	}

	// Save to a temp file. The temp directory is shared, so keep the generated
	// image readable by the owning user only.
	imagePath := filepath.Join(os.TempDir(), fmt.Sprintf("goclaw_gen_%d.png", time.Now().UnixNano()))
	if err := os.WriteFile(imagePath, imageBytes, 0o600); err != nil {
		// Best effort: do not leave a partially written file behind.
		_ = os.Remove(imagePath)
		return ErrorResult(fmt.Sprintf("failed to save generated image: %v", err))
	}

	return &Result{
		ForLLM:   fmt.Sprintf("MEDIA:%s", imagePath),
		Provider: providerName,
		Model:    model,
		Usage:    usage, // nil when the API reported no usage
	}
}
// resolveConfig picks the provider name and model for image generation.
//
// Precedence: values from the per-agent ImageGenConfig in ctx win. A missing
// provider falls back to the first registered entry of
// imageGenProviderPriority, or to "openrouter" when none is registered (the
// caller reports the resulting lookup error). A missing model falls back to
// imageGenModelDefaults for the chosen provider and stays "" when that
// provider has no default.
func (t *CreateImageTool) resolveConfig(ctx context.Context) (providerName, model string) {
	if override := ImageGenConfigFromCtx(ctx); override != nil {
		providerName, model = override.Provider, override.Model
	}

	if providerName == "" {
		providerName = "openrouter" // fallback even if unavailable (error handled later)
		for _, candidate := range imageGenProviderPriority {
			if _, err := t.registry.Get(candidate); err == nil {
				providerName = candidate
				break
			}
		}
	}

	if model == "" {
		// A map miss yields "", i.e. "no default for this provider".
		model = imageGenModelDefaults[providerName]
	}
	return providerName, model
}
// callImageGenAPI requests an image through an OpenAI-compatible chat
// completions endpoint.
//
// It POSTs to apiBase + "/chat/completions" with modalities ["image","text"]
// and the prompt as a single user message; a non-default aspect ratio is sent
// as image_config.aspect_ratio. The response is handed to parseImageResponse,
// which yields the decoded image bytes and optional token usage.
//
// Errors are returned for marshalling/transport failures, non-200 statuses
// (with a truncated body excerpt) and responses larger than the in-memory cap.
//
// NOTE(review): only the chat-completions flow is implemented. Providers that
// serve image generation solely through a dedicated endpoint (such as
// /images/generations) are not handled here — confirm which configured
// providers actually accept this request shape.
func (t *CreateImageTool) callImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt, aspectRatio string) ([]byte, *providers.Usage, error) {
	// Cap on the response held in memory. Images arrive base64-encoded inside
	// JSON, so the limit is generous, but a misbehaving endpoint must not be
	// able to make us buffer an unbounded body.
	const maxResponseBytes = 64 << 20

	body := map[string]interface{}{
		"model": model,
		"messages": []map[string]interface{}{
			{"role": "user", "content": prompt},
		},
		"modalities": []string{"image", "text"},
	}
	if aspectRatio != "" && aspectRatio != "1:1" {
		body["image_config"] = map[string]interface{}{
			"aspect_ratio": aspectRatio,
		}
	}

	jsonBody, err := json.Marshal(body)
	if err != nil {
		return nil, nil, fmt.Errorf("marshal request: %w", err)
	}

	url := strings.TrimRight(apiBase, "/") + "/chat/completions"
	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
	if err != nil {
		return nil, nil, fmt.Errorf("create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+apiKey)

	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, nil, fmt.Errorf("http request: %w", err)
	}
	defer resp.Body.Close()

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	respBody, err := io.ReadAll(io.LimitReader(resp.Body, maxResponseBytes+1))
	if err != nil {
		return nil, nil, fmt.Errorf("read response: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
	}
	if len(respBody) > maxResponseBytes {
		return nil, nil, fmt.Errorf("response exceeds %d bytes", maxResponseBytes)
	}

	return t.parseImageResponse(respBody)
}
// parseImageResponse extracts the first decodable image from an
// OpenAI-compatible chat response, together with the reported token usage
// (nil when the response has none).
//
// Candidate data URLs are tried in this order:
//  1. choices[0].message.images[*].image_url.url (OpenRouter format);
//  2. "image_url" parts of choices[0].message.content when content is an array.
//
// Candidates that are not base64 data URLs, fail to decode, or decode to zero
// bytes are skipped. If every candidate is rejected, the last rejection reason
// is wrapped into the returned error so the failure can be diagnosed.
func (t *CreateImageTool) parseImageResponse(respBody []byte) ([]byte, *providers.Usage, error) {
	var resp struct {
		Choices []struct {
			Message struct {
				Content interface{} `json:"content"`
				Images  []struct {
					ImageURL struct {
						URL string `json:"url"`
					} `json:"image_url"`
				} `json:"images"`
			} `json:"message"`
		} `json:"choices"`
		Usage *struct {
			PromptTokens     int `json:"prompt_tokens"`
			CompletionTokens int `json:"completion_tokens"`
			TotalTokens      int `json:"total_tokens"`
		} `json:"usage"`
	}
	if err := json.Unmarshal(respBody, &resp); err != nil {
		return nil, nil, fmt.Errorf("parse response: %w", err)
	}

	if len(resp.Choices) == 0 {
		return nil, nil, fmt.Errorf("no choices in response")
	}

	msg := resp.Choices[0].Message

	// Collect candidate URLs in precedence order: images array first, then
	// multipart content (some providers return content as an array of parts).
	var candidates []string
	for _, img := range msg.Images {
		candidates = append(candidates, img.ImageURL.URL)
	}
	if parts, ok := msg.Content.([]interface{}); ok {
		for _, part := range parts {
			m, ok := part.(map[string]interface{})
			if !ok || m["type"] != "image_url" {
				continue
			}
			imgURL, ok := m["image_url"].(map[string]interface{})
			if !ok {
				continue
			}
			if url, ok := imgURL["url"].(string); ok {
				candidates = append(candidates, url)
			}
		}
	}

	var lastErr error
	for _, candidate := range candidates {
		imageBytes, err := decodeDataURL(candidate)
		if err != nil {
			lastErr = err
			continue
		}
		if len(imageBytes) == 0 {
			// A well-formed data URL with no payload is not a usable image.
			lastErr = fmt.Errorf("empty image payload")
			continue
		}
		return imageBytes, convertUsage(resp.Usage), nil
	}

	if lastErr != nil {
		return nil, nil, fmt.Errorf("no decodable image data in response: %w", lastErr)
	}
	return nil, nil, fmt.Errorf("no image data found in response")
}
// decodeDataURL decodes a base64 data URL such as
// "data:image/png;base64,iVBORw0KGgo..." into the raw bytes of its payload.
//
// Everything up to and including the first ";base64," marker is treated as the
// header and discarded. Whitespace surrounding the payload is ignored.
//
// An error is returned when the marker is missing, when the payload is empty
// (so callers never receive a zero-byte "image" as a success), or when the
// payload is not valid standard base64.
func decodeDataURL(dataURL string) ([]byte, error) {
	_, payload, found := strings.Cut(dataURL, ";base64,")
	if !found {
		return nil, fmt.Errorf("not a base64 data URL")
	}

	payload = strings.TrimSpace(payload)
	if payload == "" {
		// DecodeString("") succeeds with zero bytes, which would look like a
		// valid (but empty) image to the caller.
		return nil, fmt.Errorf("empty base64 payload in data URL")
	}

	decoded, err := base64.StdEncoding.DecodeString(payload)
	if err != nil {
		return nil, fmt.Errorf("decode base64 payload: %w", err)
	}
	return decoded, nil
}
// convertUsage maps the anonymous usage object decoded from an image response
// onto a providers.Usage value, copying the prompt, completion and total token
// counts. A nil input yields a nil result.
func convertUsage(u *struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}) *providers.Usage {
	if u == nil {
		return nil
	}

	converted := providers.Usage{
		PromptTokens:     u.PromptTokens,
		CompletionTokens: u.CompletionTokens,
		TotalTokens:      u.TotalTokens,
	}
	return &converted
}
// truncateBytes renders b as a string of at most limit bytes, appending "..."
// when anything was cut off (for example when embedding a response body in an
// error message).
//
// A negative limit is treated as zero instead of panicking on the slice
// expression. When the cut would land inside a multi-byte UTF-8 sequence, it
// is moved back to the start of that sequence so the result does not end in a
// partial character.
func truncateBytes(b []byte, limit int) string {
	if limit < 0 {
		limit = 0
	}
	if len(b) <= limit {
		return string(b)
	}

	cut := limit
	// b[cut] is the first dropped byte. If it is a UTF-8 continuation byte
	// (10xxxxxx), the character it belongs to started earlier; back up to its
	// lead byte. A UTF-8 sequence has at most 3 continuation bytes, so the
	// walk is bounded even for non-text input.
	for steps := 0; steps < 3 && cut > 0 && b[cut]&0xC0 == 0x80; steps++ {
		cut--
	}
	return string(b[:cut]) + "..."
}
+2 -2
View File
@@ -25,7 +25,7 @@ var toolGroups = map[string][]string{
"browser", "canvas", "nodes", "cron", "message", "gateway",
"agents_list", "sessions_list", "sessions_history", "sessions_send",
"sessions_spawn", "subagents", "session_status",
"memory_search", "memory_get", "web_search", "web_fetch", "image",
"memory_search", "memory_get", "web_search", "web_fetch", "read_image", "create_image",
},
}
@@ -49,7 +49,7 @@ func UnregisterToolGroup(name string) {
// toolProfiles defines the preset allow sets, keyed by profile name.
//
// Each value lists the tool names allowed by that profile. Entries with a
// "group:" prefix presumably expand to the members of the named tool group —
// confirm against the code that resolves profiles. An empty list ("full")
// means no restrictions.
//
// The "coding" profile allows the read_image and create_image tools; the map
// must contain each profile key only once, since duplicate keys in a map
// literal do not compile.
var toolProfiles = map[string][]string{
	"minimal":   {"session_status"},
	"coding":    {"group:fs", "group:runtime", "group:sessions", "group:memory", "read_image", "create_image"},
	"messaging": {"group:messaging", "sessions_list", "sessions_history", "sessions_send", "session_status"},
	"full":      {}, // empty = no restrictions
}
+139
View File
@@ -0,0 +1,139 @@
package tools
import (
"context"
"fmt"
"log/slog"
"github.com/nextlevelbuilder/goclaw/internal/providers"
)
// --- Context helpers for media images ---
// ctxMediaImages is the context key under which WithMediaImages stores the
// image slice and from which MediaImagesFromCtx reads it back.
const ctxMediaImages toolContextKey = "tool_media_images"
// WithMediaImages returns a child context carrying the given base64-encoded
// images so that the read_image tool can retrieve them via MediaImagesFromCtx.
func WithMediaImages(ctx context.Context, images []providers.ImageContent) context.Context {
	withImages := context.WithValue(ctx, ctxMediaImages, images)
	return withImages
}
// MediaImagesFromCtx returns the images previously attached to ctx with
// WithMediaImages. It returns nil when the context holds no images under that
// key (or holds a value of a different type).
func MediaImagesFromCtx(ctx context.Context) []providers.ImageContent {
	if stored, ok := ctx.Value(ctxMediaImages).([]providers.ImageContent); ok {
		return stored
	}
	return nil
}
// --- ReadImageTool ---
var (
	// visionProviderPriority lists provider names in the order they are tried
	// when looking for a vision provider; the first one available wins.
	visionProviderPriority = []string{
		"gemini",
		"anthropic",
		"openrouter",
	}

	// visionModelOverrides holds, per provider name, the vision model to use
	// in place of that provider's default model. Providers without an entry
	// keep their default model.
	visionModelOverrides = map[string]string{
		"openrouter": "google/gemini-2.0-flash-001",
	}
)
// ReadImageTool uses a vision-capable provider to describe images attached to the current message.
// The images are read from the execution context (see MediaImagesFromCtx), not
// from the tool arguments.
type ReadImageTool struct {
	// registry is used to look up providers by name when resolving which
	// vision provider to call.
	registry *providers.Registry
}
// NewReadImageTool creates a ReadImageTool that resolves its vision provider
// from the given registry.
func NewReadImageTool(registry *providers.Registry) *ReadImageTool {
	tool := new(ReadImageTool)
	tool.registry = registry
	return tool
}
// Name returns the tool's name, "read_image".
func (t *ReadImageTool) Name() string {
	const toolName = "read_image"
	return toolName
}
// Description returns the human/LLM-facing summary of what the tool does and
// when it should be used.
func (t *ReadImageTool) Description() string {
	const summary = "Analyze images attached to the current message using a vision model. Use this when you see <media:image> tags but cannot view images directly."
	return summary
}
// Parameters returns the JSON-schema-style description of the tool's
// arguments: an object with a single required string property, "prompt".
func (t *ReadImageTool) Parameters() map[string]interface{} {
	promptSchema := map[string]interface{}{
		"type":        "string",
		"description": "What you want to know about the image(s). E.g. 'Describe this image in detail' or 'What text is in this image?'",
	}

	schema := make(map[string]interface{}, 3)
	schema["type"] = "object"
	schema["properties"] = map[string]interface{}{"prompt": promptSchema}
	schema["required"] = []string{"prompt"}
	return schema
}
// Execute asks a vision provider about the images stored in ctx.
//
// The "prompt" argument is the question to ask; when it is missing, empty, or
// not a string, a generic "describe" prompt is used. The images come from
// MediaImagesFromCtx; with none available an error result is returned without
// contacting any provider. The provider and model are chosen by
// resolveVisionProviderWithConfig.
//
// On success the result carries the provider's reply text plus the usage,
// provider name and model of the call. Provider resolution failures and chat
// errors are returned as error results.
func (t *ReadImageTool) Execute(ctx context.Context, args map[string]interface{}) *Result {
	question, isString := args["prompt"].(string)
	if !isString || question == "" {
		question = "Describe this image in detail."
	}

	attached := MediaImagesFromCtx(ctx)
	if len(attached) == 0 {
		return ErrorResult("No images available in this conversation. The user may not have sent an image.")
	}

	// Find a vision-capable provider (per-agent config > hardcoded priority).
	visionProvider, visionModel, err := t.resolveVisionProviderWithConfig(ctx)
	if err != nil {
		return ErrorResult(err.Error())
	}

	slog.Info("read_image: calling vision provider",
		"provider", visionProvider.Name(),
		"model", visionModel,
		"images", len(attached),
	)

	// Single user turn: the question plus every attached image.
	request := providers.ChatRequest{
		Messages: []providers.Message{
			{Role: "user", Content: question, Images: attached},
		},
		Model: visionModel,
		Options: map[string]interface{}{
			"max_tokens":  1024,
			"temperature": 0.3,
		},
	}

	reply, err := visionProvider.Chat(ctx, request)
	if err != nil {
		return ErrorResult(fmt.Sprintf("Vision provider error: %v", err))
	}

	// Attach usage/provider/model to the result alongside the reply text.
	out := NewResult(reply.Content)
	out.Usage = reply.Usage
	out.Provider = visionProvider.Name()
	out.Model = visionModel
	return out
}
// resolveVisionProviderWithConfig picks the provider and model for a vision
// call. A VisionConfig in ctx with a non-empty Provider takes precedence:
// that provider is looked up in the registry (a lookup failure is returned as
// an error, with no fallback), and its model is the configured Model or, when
// that is empty, the provider's default model. Without such a config the
// hardcoded priority list is used via resolveVisionProvider.
//
// NOTE(review): on the configured path visionModelOverrides is not consulted,
// so a configured provider with an empty Model gets its default model even if
// an override exists for it — confirm this is intended.
func (t *ReadImageTool) resolveVisionProviderWithConfig(ctx context.Context) (providers.Provider, string, error) {
	cfg := VisionConfigFromCtx(ctx)
	if cfg == nil || cfg.Provider == "" {
		return t.resolveVisionProvider()
	}

	configured, err := t.registry.Get(cfg.Provider)
	if err != nil {
		return nil, "", fmt.Errorf("configured vision provider %q not available: %w", cfg.Provider, err)
	}

	if cfg.Model != "" {
		return configured, cfg.Model, nil
	}
	return configured, configured.DefaultModel(), nil
}
// resolveVisionProvider walks visionProviderPriority in order and returns the
// first provider the registry can supply, together with the model to use: the
// entry from visionModelOverrides when one exists for that provider, otherwise
// the provider's default model. An error is returned when none of the listed
// providers is available.
func (t *ReadImageTool) resolveVisionProvider() (providers.Provider, string, error) {
	for i := range visionProviderPriority {
		candidate := visionProviderPriority[i]

		found, err := t.registry.Get(candidate)
		if err == nil {
			chosen := found.DefaultModel()
			if preferred, has := visionModelOverrides[candidate]; has {
				chosen = preferred
			}
			return found, chosen, nil
		}
		// Provider not registered/available: try the next one in priority order.
	}

	return nil, "", fmt.Errorf("no vision-capable provider available (need one of: %v)", visionProviderPriority)
}
+8
View File
@@ -1,5 +1,7 @@
package tools
import "github.com/nextlevelbuilder/goclaw/internal/providers"
// Result is the unified return type from tool execution.
type Result struct {
ForLLM string `json:"for_llm"` // content sent to the LLM
@@ -8,6 +10,12 @@ type Result struct {
IsError bool `json:"is_error"` // marks error
Async bool `json:"async"` // running asynchronously
Err error `json:"-"` // internal error (not serialized)
// Usage holds token usage from tools that make internal LLM calls (e.g. read_image).
// When set, the agent loop records these on the tool span for tracing.
Usage *providers.Usage `json:"-"`
Provider string `json:"-"` // provider name (for tool span metadata)
Model string `json:"-"` // model used (for tool span metadata)
}
func NewResult(forLLM string) *Result {
@@ -255,7 +255,7 @@ function SpanTreeNode({ node, depth }: { node: SpanNode; depth: number }) {
{span.input_preview && (
<div>
<p className="text-xs text-muted-foreground">Input:</p>
<pre className="mt-1 max-h-[40vh] overflow-y-auto whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
<pre className="mt-1 max-h-[40vh] overflow-y-auto break-all whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
{span.input_preview}
</pre>
</div>
@@ -263,7 +263,7 @@ function SpanTreeNode({ node, depth }: { node: SpanNode; depth: number }) {
{span.output_preview && (
<div>
<p className="text-xs text-muted-foreground">Output:</p>
<pre className="mt-1 max-h-[40vh] overflow-y-auto whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
<pre className="mt-1 max-h-[40vh] overflow-y-auto break-all whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
{span.output_preview}
</pre>
</div>