mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-11 14:11:29 +00:00
feat: Implement vision capabilities and image generation tools, adding media handling, dedicated configurations, and trace optimization for image data.
This commit is contained in:
@@ -182,6 +182,10 @@ func runGateway() {
|
||||
toolsReg.Register(webFetchTool)
|
||||
slog.Info("web_fetch tool enabled")
|
||||
|
||||
// Vision fallback tool (for non-vision providers like MiniMax)
|
||||
toolsReg.Register(tools.NewReadImageTool(providerRegistry))
|
||||
toolsReg.Register(tools.NewCreateImageTool(providerRegistry))
|
||||
|
||||
// TTS (text-to-speech) system
|
||||
ttsMgr := setupTTS(cfg)
|
||||
if ttsMgr != nil {
|
||||
|
||||
+19
-2
@@ -150,6 +150,7 @@ func consumeInboundMessages(ctx context.Context, msgBus *bus.MessageBus, agents
|
||||
outCh := sched.ScheduleWithOpts(ctx, "main", agent.RunRequest{
|
||||
SessionKey: sessionKey,
|
||||
Message: msg.Content,
|
||||
Media: msg.Media,
|
||||
Channel: msg.Channel,
|
||||
ChatID: msg.ChatID,
|
||||
PeerKind: peerKind,
|
||||
@@ -225,12 +226,28 @@ func consumeInboundMessages(ctx context.Context, msgBus *bus.MessageBus, agents
|
||||
}
|
||||
|
||||
// Publish response back to the channel
|
||||
msgBus.PublishOutbound(bus.OutboundMessage{
|
||||
outMsg := bus.OutboundMessage{
|
||||
Channel: channel,
|
||||
ChatID: chatID,
|
||||
Content: outcome.Result.Content,
|
||||
Metadata: meta,
|
||||
})
|
||||
}
|
||||
|
||||
// Convert media results from agent run to outbound media attachments
|
||||
for _, mr := range outcome.Result.Media {
|
||||
outMsg.Media = append(outMsg.Media, bus.MediaAttachment{
|
||||
URL: mr.Path,
|
||||
ContentType: mr.ContentType,
|
||||
})
|
||||
if mr.AsVoice {
|
||||
if outMsg.Metadata == nil {
|
||||
outMsg.Metadata = make(map[string]string)
|
||||
}
|
||||
outMsg.Metadata["audio_as_voice"] = "true"
|
||||
}
|
||||
}
|
||||
|
||||
msgBus.PublishOutbound(outMsg)
|
||||
}(msg.Channel, msg.ChatID, sessionKey, runID, outMeta)
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ package cmd
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"strings"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/config"
|
||||
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
||||
@@ -51,7 +52,8 @@ func registerProviders(registry *providers.Registry, cfg *config.Config) {
|
||||
}
|
||||
|
||||
if cfg.Providers.MiniMax.APIKey != "" {
|
||||
registry.Register(providers.NewOpenAIProvider("minimax", cfg.Providers.MiniMax.APIKey, "https://api.minimax.io/v1", "MiniMax-M2.5"))
|
||||
registry.Register(providers.NewOpenAIProvider("minimax", cfg.Providers.MiniMax.APIKey, "https://api.minimax.io/v1", "MiniMax-M2.5").
|
||||
WithChatPath("/text/chatcompletion_v2"))
|
||||
slog.Info("registered provider", "name", "minimax")
|
||||
}
|
||||
|
||||
@@ -82,7 +84,12 @@ func registerProvidersFromDB(registry *providers.Registry, provStore store.Provi
|
||||
if p.ProviderType == "anthropic_native" {
|
||||
registry.Register(providers.NewAnthropicProvider(p.APIKey))
|
||||
} else {
|
||||
registry.Register(providers.NewOpenAIProvider(p.Name, p.APIKey, p.APIBase, ""))
|
||||
prov := providers.NewOpenAIProvider(p.Name, p.APIKey, p.APIBase, "")
|
||||
// MiniMax native API uses a different chat path for vision support.
|
||||
if p.Name == "minimax" && strings.Contains(p.APIBase, "minimax.io") {
|
||||
prov.WithChatPath("/text/chatcompletion_v2")
|
||||
}
|
||||
registry.Register(prov)
|
||||
}
|
||||
slog.Info("registered provider from DB", "name", p.Name)
|
||||
}
|
||||
|
||||
+114
-8
@@ -220,6 +220,7 @@ func NewLoop(cfg LoopConfig) *Loop {
|
||||
type RunRequest struct {
|
||||
SessionKey string // composite key: agent:{agentId}:{channel}:{peerKind}:{chatId}
|
||||
Message string // user message
|
||||
Media []string // local file paths to images (already sanitized)
|
||||
Channel string // source channel
|
||||
ChatID string // source chat ID
|
||||
PeerKind string // "direct" or "group" (for session key building and tool context)
|
||||
@@ -235,10 +236,18 @@ type RunRequest struct {
|
||||
|
||||
// RunResult is the output of a completed agent run.
|
||||
type RunResult struct {
|
||||
Content string `json:"content"`
|
||||
RunID string `json:"runId"`
|
||||
Iterations int `json:"iterations"`
|
||||
Content string `json:"content"`
|
||||
RunID string `json:"runId"`
|
||||
Iterations int `json:"iterations"`
|
||||
Usage *providers.Usage `json:"usage,omitempty"`
|
||||
Media []MediaResult `json:"media,omitempty"` // media files from tool results (MEDIA: prefix)
|
||||
}
|
||||
|
||||
// MediaResult describes one media file that a tool produced while the agent
// was running. It is built from a tool's "MEDIA:" output line.
type MediaResult struct {
	// Path is the local file path of the media file.
	Path string `json:"path"`
	// ContentType is the MIME type of the file.
	ContentType string `json:"content_type,omitempty"`
	// AsVoice requests delivery as a voice message (Telegram OGG).
	AsVoice bool `json:"as_voice,omitempty"`
}
|
||||
|
||||
// Run processes a single message through the agent loop.
|
||||
@@ -351,6 +360,15 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
if req.SenderID != "" {
|
||||
ctx = store.WithSenderID(ctx, req.SenderID)
|
||||
}
|
||||
// Inject per-agent vision/imagegen config for read_image/create_image tools
|
||||
if l.agentToolPolicy != nil {
|
||||
if l.agentToolPolicy.Vision != nil {
|
||||
ctx = tools.WithVisionConfig(ctx, l.agentToolPolicy.Vision)
|
||||
}
|
||||
if l.agentToolPolicy.ImageGen != nil {
|
||||
ctx = tools.WithImageGenConfig(ctx, l.agentToolPolicy.ImageGen)
|
||||
}
|
||||
}
|
||||
|
||||
// Per-user workspace isolation.
|
||||
// Each user gets a subdirectory within the agent's workspace.
|
||||
@@ -430,19 +448,37 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
// (hadBootstrap) — no extra DB roundtrip needed for bootstrap detection.
|
||||
messages, hadBootstrap := l.buildMessages(ctx, history, summary, req.Message, req.ExtraSystemPrompt, req.SessionKey, req.Channel, req.UserID, req.HistoryLimit)
|
||||
|
||||
// 2. Buffer new messages — write to session only AFTER the run completes.
|
||||
// 2. Attach vision images to the current user message (last in messages slice).
|
||||
// Images are only attached to the live request, NOT persisted in session history.
|
||||
if len(req.Media) > 0 {
|
||||
if images := loadImages(req.Media); len(images) > 0 {
|
||||
messages[len(messages)-1].Images = images
|
||||
ctx = tools.WithMediaImages(ctx, images) // make images available to read_image tool
|
||||
slog.Info("vision: attached images to user message", "count", len(images), "agent", l.id, "session", req.SessionKey)
|
||||
}
|
||||
// Clean up temp media files — they're now base64-encoded in memory.
|
||||
for _, p := range req.Media {
|
||||
if err := os.Remove(p); err != nil {
|
||||
slog.Debug("vision: failed to clean temp media file", "path", p, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Buffer new messages — write to session only AFTER the run completes.
|
||||
// This prevents concurrent runs from seeing each other's in-progress messages.
|
||||
// NOTE: pendingMsgs stores TEXT ONLY (no images) to avoid bloating session storage.
|
||||
var pendingMsgs []providers.Message
|
||||
pendingMsgs = append(pendingMsgs, providers.Message{
|
||||
Role: "user",
|
||||
Content: req.Message,
|
||||
})
|
||||
|
||||
// 3. Run LLM iteration loop
|
||||
// 4. Run LLM iteration loop
|
||||
var totalUsage providers.Usage
|
||||
iteration := 0
|
||||
var finalContent string
|
||||
var asyncToolCalls []string // track async spawn tool names for fallback
|
||||
var asyncToolCalls []string // track async spawn tool names for fallback
|
||||
var mediaResults []MediaResult // media files from tool MEDIA: results
|
||||
|
||||
for iteration < l.maxIterations {
|
||||
iteration++
|
||||
@@ -533,7 +569,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
toolSpanStart := time.Now().UTC()
|
||||
result := l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)
|
||||
|
||||
l.emitToolSpan(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON), result.ForLLM, result.IsError)
|
||||
l.emitToolSpan(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON), result)
|
||||
|
||||
if result.Async {
|
||||
asyncToolCalls = append(asyncToolCalls, tc.Name)
|
||||
@@ -558,6 +594,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
},
|
||||
})
|
||||
|
||||
// Collect MEDIA: paths from tool results
|
||||
if mr := parseMediaResult(result.ForLLM); mr != nil {
|
||||
mediaResults = append(mediaResults, *mr)
|
||||
}
|
||||
|
||||
toolMsg := providers.Message{
|
||||
Role: "tool",
|
||||
Content: result.ForLLM,
|
||||
@@ -619,7 +660,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
|
||||
// 5. Process results sequentially: emit events, append messages, save to session
|
||||
for _, r := range collected {
|
||||
l.emitToolSpan(ctx, r.spanStart, r.tc.Name, r.tc.ID, r.argsJSON, r.result.ForLLM, r.result.IsError)
|
||||
l.emitToolSpan(ctx, r.spanStart, r.tc.Name, r.tc.ID, r.argsJSON, r.result)
|
||||
|
||||
if r.result.Async {
|
||||
asyncToolCalls = append(asyncToolCalls, r.tc.Name)
|
||||
@@ -644,6 +685,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
},
|
||||
})
|
||||
|
||||
// Collect MEDIA: paths from tool results
|
||||
if mr := parseMediaResult(r.result.ForLLM); mr != nil {
|
||||
mediaResults = append(mediaResults, *mr)
|
||||
}
|
||||
|
||||
toolMsg := providers.Message{
|
||||
Role: "tool",
|
||||
Content: r.result.ForLLM,
|
||||
@@ -723,9 +769,69 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
RunID: req.RunID,
|
||||
Iterations: iteration,
|
||||
Usage: &totalUsage,
|
||||
Media: mediaResults,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parseMediaResult extracts a MediaResult from a tool result string containing "MEDIA:" prefix.
|
||||
// Handles formats: "MEDIA:/path/to/file" and "[[audio_as_voice]]\nMEDIA:/path/to/file".
|
||||
// Returns nil if no MEDIA: prefix is found.
|
||||
func parseMediaResult(toolOutput string) *MediaResult {
|
||||
s := toolOutput
|
||||
asVoice := false
|
||||
|
||||
// Check for [[audio_as_voice]] tag (TTS voice messages)
|
||||
if strings.Contains(s, "[[audio_as_voice]]") {
|
||||
asVoice = true
|
||||
s = strings.ReplaceAll(s, "[[audio_as_voice]]", "")
|
||||
s = strings.TrimSpace(s)
|
||||
}
|
||||
|
||||
// Find MEDIA: prefix
|
||||
idx := strings.Index(s, "MEDIA:")
|
||||
if idx < 0 {
|
||||
return nil
|
||||
}
|
||||
path := strings.TrimSpace(s[idx+6:])
|
||||
if path == "" {
|
||||
return nil
|
||||
}
|
||||
// Take only the first line (in case there's trailing text)
|
||||
if nl := strings.IndexByte(path, '\n'); nl >= 0 {
|
||||
path = strings.TrimSpace(path[:nl])
|
||||
}
|
||||
|
||||
return &MediaResult{
|
||||
Path: path,
|
||||
ContentType: mimeFromExt(filepath.Ext(path)),
|
||||
AsVoice: asVoice,
|
||||
}
|
||||
}
|
||||
|
||||
// mediaMIMETypes maps lower-case file extensions (with the leading dot) to
// the MIME type reported for media attachments.
var mediaMIMETypes = map[string]string{
	".png":  "image/png",
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".gif":  "image/gif",
	".webp": "image/webp",
	".mp4":  "video/mp4",
	".ogg":  "audio/ogg",
	".opus": "audio/ogg",
	".mp3":  "audio/mpeg",
	".wav":  "audio/wav",
}

// mimeFromExt maps a file extension such as ".png" to its MIME type.
// Matching is case-insensitive; extensions that are not in the table yield
// "application/octet-stream".
func mimeFromExt(ext string) string {
	if mimeType, known := mediaMIMETypes[strings.ToLower(ext)]; known {
		return mimeType
	}
	return "application/octet-stream"
}
|
||||
|
||||
// sanitizePathSegment makes a userID safe for use as a directory name.
|
||||
// Replaces colons, spaces, and other unsafe chars with underscores.
|
||||
func sanitizePathSegment(s string) string {
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
||||
"github.com/nextlevelbuilder/goclaw/internal/store"
|
||||
"github.com/nextlevelbuilder/goclaw/internal/tools"
|
||||
"github.com/nextlevelbuilder/goclaw/internal/tracing"
|
||||
)
|
||||
|
||||
@@ -61,10 +62,22 @@ func (l *Loop) emitLLMSpan(ctx context.Context, start time.Time, iteration int,
|
||||
span.AgentID = &l.agentUUID
|
||||
}
|
||||
|
||||
// Verbose mode: serialize full messages and output
|
||||
// Verbose mode: serialize full messages and output.
|
||||
// Strip base64 image data to avoid bloating traces and PostgreSQL encoding issues.
|
||||
verbose := collector.Verbose()
|
||||
if verbose && len(messages) > 0 {
|
||||
if b, err := json.Marshal(messages); err == nil {
|
||||
stripped := make([]providers.Message, len(messages))
|
||||
copy(stripped, messages)
|
||||
for i := range stripped {
|
||||
if len(stripped[i].Images) > 0 {
|
||||
placeholder := make([]providers.ImageContent, len(stripped[i].Images))
|
||||
for j, img := range stripped[i].Images {
|
||||
placeholder[j] = providers.ImageContent{MimeType: img.MimeType, Data: fmt.Sprintf("[base64 %s, %d bytes]", img.MimeType, len(img.Data))}
|
||||
}
|
||||
stripped[i].Images = placeholder
|
||||
}
|
||||
}
|
||||
if b, err := json.Marshal(stripped); err == nil {
|
||||
span.InputPreview = truncateStr(string(b), 100000)
|
||||
}
|
||||
}
|
||||
@@ -98,7 +111,8 @@ func (l *Loop) emitLLMSpan(ctx context.Context, start time.Time, iteration int,
|
||||
}
|
||||
|
||||
// emitToolSpan records a tool call span if tracing is active.
|
||||
func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, toolCallID, input, output string, isError bool) {
|
||||
// result is the full tool execution result, which may contain Usage from inner LLM calls.
|
||||
func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, toolCallID, input string, result *tools.Result) {
|
||||
traceID := tracing.TraceIDFromContext(ctx)
|
||||
collector := tracing.CollectorFromContext(ctx)
|
||||
if collector == nil || traceID == uuid.Nil {
|
||||
@@ -121,7 +135,7 @@ func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, tool
|
||||
ToolName: toolName,
|
||||
ToolCallID: toolCallID,
|
||||
InputPreview: truncateStr(input, previewLimit),
|
||||
OutputPreview: truncateStr(output, previewLimit),
|
||||
OutputPreview: truncateStr(result.ForLLM, previewLimit),
|
||||
Status: store.SpanStatusCompleted,
|
||||
Level: "DEFAULT",
|
||||
CreatedAt: now,
|
||||
@@ -132,9 +146,26 @@ func (l *Loop) emitToolSpan(ctx context.Context, start time.Time, toolName, tool
|
||||
if l.agentUUID != uuid.Nil {
|
||||
span.AgentID = &l.agentUUID
|
||||
}
|
||||
if isError {
|
||||
if result.IsError {
|
||||
span.Status = store.SpanStatusError
|
||||
span.Error = truncateStr(output, 200)
|
||||
span.Error = truncateStr(result.ForLLM, 200)
|
||||
}
|
||||
|
||||
// Record token usage from tools that make internal LLM calls (e.g. read_image).
|
||||
if result.Usage != nil {
|
||||
span.InputTokens = result.Usage.PromptTokens
|
||||
span.OutputTokens = result.Usage.CompletionTokens
|
||||
span.Provider = result.Provider
|
||||
span.Model = result.Model
|
||||
if result.Usage.CacheCreationTokens > 0 || result.Usage.CacheReadTokens > 0 {
|
||||
meta := map[string]int{
|
||||
"cache_creation_tokens": result.Usage.CacheCreationTokens,
|
||||
"cache_read_tokens": result.Usage.CacheReadTokens,
|
||||
}
|
||||
if b, err := json.Marshal(meta); err == nil {
|
||||
span.Metadata = b
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
collector.EmitSpan(span)
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
||||
)
|
||||
|
||||
// maxImageBytes is the safety limit for reading image files (10MB).
|
||||
const maxImageBytes = 10 * 1024 * 1024
|
||||
|
||||
// loadImages reads local image files and returns base64-encoded ImageContent slices.
|
||||
// Non-image files and files that fail to read are skipped with a warning log.
|
||||
func loadImages(paths []string) []providers.ImageContent {
|
||||
if len(paths) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var images []providers.ImageContent
|
||||
for _, p := range paths {
|
||||
mime := inferImageMime(p)
|
||||
if mime == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(p)
|
||||
if err != nil {
|
||||
slog.Warn("vision: failed to read image file", "path", p, "error", err)
|
||||
continue
|
||||
}
|
||||
if len(data) > maxImageBytes {
|
||||
slog.Warn("vision: image file too large, skipping", "path", p, "size", len(data))
|
||||
continue
|
||||
}
|
||||
|
||||
images = append(images, providers.ImageContent{
|
||||
MimeType: mime,
|
||||
Data: base64.StdEncoding.EncodeToString(data),
|
||||
})
|
||||
}
|
||||
return images
|
||||
}
|
||||
|
||||
// inferImageMime reports the MIME type implied by path's file extension.
// Only JPEG, PNG, GIF and WebP are recognised (case-insensitively); any other
// extension, or no extension at all, yields "".
func inferImageMime(path string) string {
	ext := strings.ToLower(filepath.Ext(path))
	if ext == ".jpg" || ext == ".jpeg" {
		return "image/jpeg"
	}
	if ext == ".png" {
		return "image/png"
	}
	if ext == ".gif" {
		return "image/gif"
	}
	if ext == ".webp" {
		return "image/webp"
	}
	return ""
}
|
||||
@@ -53,7 +53,10 @@ func SanitizeAssistantContent(content string) string {
|
||||
// 6. Collapse consecutive duplicate blocks
|
||||
content = collapseConsecutiveDuplicateBlocks(content)
|
||||
|
||||
// 7. Strip leading blank lines (preserve indentation)
|
||||
// 7. Strip MEDIA: paths from LLM output (media delivered separately)
|
||||
content = stripMediaPaths(content)
|
||||
|
||||
// 8. Strip leading blank lines (preserve indentation)
|
||||
content = stripLeadingBlankLines(content)
|
||||
|
||||
content = strings.TrimSpace(content)
|
||||
@@ -277,7 +280,28 @@ func collapseConsecutiveDuplicateBlocks(content string) string {
|
||||
return collapsed
|
||||
}
|
||||
|
||||
// --- 7. Strip leading blank lines ---
|
||||
// --- 7. Strip MEDIA: paths ---
|
||||
|
||||
// stripMediaPaths drops tool-result artifacts from LLM output: every line
// that, ignoring leading whitespace, starts with "MEDIA:" or with
// "[[audio_as_voice]]". Such lines should not reach user-facing text because
// media files are delivered separately via OutboundMessage.Media.
//
// Content that does not contain "MEDIA:" at all is returned untouched;
// otherwise the surviving lines are re-joined and the result is trimmed of
// surrounding whitespace.
func stripMediaPaths(content string) string {
	if !strings.Contains(content, "MEDIA:") {
		return content
	}

	var kept []string
	for _, raw := range strings.Split(content, "\n") {
		switch probe := strings.TrimSpace(raw); {
		case strings.HasPrefix(probe, "MEDIA:"),
			strings.HasPrefix(probe, "[[audio_as_voice]]"):
			// Artifact line: leave it out of the output.
		default:
			kept = append(kept, raw)
		}
	}
	return strings.TrimSpace(strings.Join(kept, "\n"))
}
|
||||
|
||||
// --- 8. Strip leading blank lines ---
|
||||
|
||||
var leadingBlankLinesPattern = regexp.MustCompile(`^(?:[ \t]*\r?\n)+`)
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/bus"
|
||||
@@ -130,6 +131,16 @@ func (m *Manager) dispatchOutbound(ctx context.Context) {
|
||||
"error", err,
|
||||
)
|
||||
}
|
||||
|
||||
// Clean up temporary media files after successful (or failed) send.
|
||||
// Files are created by tools (create_image, tts) and only needed for the send.
|
||||
for _, media := range msg.Media {
|
||||
if media.URL != "" {
|
||||
if err := os.Remove(media.URL); err != nil {
|
||||
slog.Debug("failed to clean up media file", "path", media.URL, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -358,30 +358,49 @@ func (c *Channel) handleMessage(ctx context.Context, update telego.Update) {
|
||||
}
|
||||
|
||||
// detectMention checks if a Telegram message mentions the bot.
|
||||
// Checks both msg.Text/Entities (text messages) and msg.Caption/CaptionEntities (photo/media messages).
|
||||
func (c *Channel) detectMention(msg *telego.Message, botUsername string) bool {
|
||||
if botUsername == "" {
|
||||
return false
|
||||
}
|
||||
lowerBot := strings.ToLower(botUsername)
|
||||
|
||||
for _, entity := range msg.Entities {
|
||||
if entity.Type == "mention" && msg.Text != "" {
|
||||
mentioned := msg.Text[entity.Offset : entity.Offset+entity.Length]
|
||||
if strings.EqualFold(mentioned, "@"+botUsername) {
|
||||
return true
|
||||
}
|
||||
// Check both text entities and caption entities (photos use Caption, not Text).
|
||||
for _, pair := range []struct {
|
||||
entities []telego.MessageEntity
|
||||
text string
|
||||
}{
|
||||
{msg.Entities, msg.Text},
|
||||
{msg.CaptionEntities, msg.Caption},
|
||||
} {
|
||||
if pair.text == "" {
|
||||
continue
|
||||
}
|
||||
if entity.Type == "bot_command" && msg.Text != "" {
|
||||
cmdText := msg.Text[entity.Offset : entity.Offset+entity.Length]
|
||||
if strings.Contains(strings.ToLower(cmdText), "@"+strings.ToLower(botUsername)) {
|
||||
return true
|
||||
for _, entity := range pair.entities {
|
||||
if entity.Type == "mention" {
|
||||
mentioned := pair.text[entity.Offset : entity.Offset+entity.Length]
|
||||
if strings.EqualFold(mentioned, "@"+botUsername) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if entity.Type == "bot_command" {
|
||||
cmdText := pair.text[entity.Offset : entity.Offset+entity.Length]
|
||||
if strings.Contains(strings.ToLower(cmdText), "@"+lowerBot) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if msg.Text != "" && strings.Contains(strings.ToLower(msg.Text), "@"+strings.ToLower(botUsername)) {
|
||||
// Fallback: substring check in both text and caption
|
||||
if msg.Text != "" && strings.Contains(strings.ToLower(msg.Text), "@"+lowerBot) {
|
||||
return true
|
||||
}
|
||||
if msg.Caption != "" && strings.Contains(strings.ToLower(msg.Caption), "@"+lowerBot) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Reply to bot's message = implicit mention
|
||||
if msg.ReplyToMessage != nil && msg.ReplyToMessage.From != nil {
|
||||
if msg.ReplyToMessage.From.Username == botUsername {
|
||||
return true
|
||||
|
||||
@@ -186,6 +186,22 @@ type ToolPolicySpec struct {
|
||||
Deny []string `json:"deny,omitempty"`
|
||||
AlsoAllow []string `json:"alsoAllow,omitempty"`
|
||||
ByProvider map[string]*ToolPolicySpec `json:"byProvider,omitempty"`
|
||||
Vision *VisionConfig `json:"vision,omitempty"` // per-agent vision provider/model override
|
||||
ImageGen *ImageGenConfig `json:"imageGen,omitempty"` // per-agent image generation config
|
||||
}
|
||||
|
||||
// VisionConfig selects which provider and model the vision tools (read_image)
// use. Both fields are optional and omitted from JSON when empty.
type VisionConfig struct {
	// Provider names the provider, e.g. "gemini" or "anthropic".
	Provider string `json:"provider,omitempty"`
	// Model names the model, e.g. "gemini-2.0-flash".
	Model string `json:"model,omitempty"`
}
|
||||
|
||||
// ImageGenConfig selects the provider and model used for image generation
// (create_image), plus default output options. All fields are optional and
// omitted from JSON when empty.
type ImageGenConfig struct {
	// Provider is a provider with an image generation API (e.g. "openrouter").
	Provider string `json:"provider,omitempty"`
	// Model is the model name, e.g. "google/gemini-2.5-flash-image-preview".
	Model string `json:"model,omitempty"`
	// Size is the default aspect ratio / size.
	Size string `json:"size,omitempty"`
	// Quality is "standard" or "hd".
	Quality string `json:"quality,omitempty"`
}
|
||||
|
||||
type WebToolsConfig struct {
|
||||
|
||||
@@ -220,10 +220,34 @@ func (p *AnthropicProvider) buildRequestBody(model string, req ChatRequest, stre
|
||||
})
|
||||
|
||||
case "user":
|
||||
messages = append(messages, map[string]interface{}{
|
||||
"role": "user",
|
||||
"content": msg.Content,
|
||||
})
|
||||
if len(msg.Images) > 0 {
|
||||
var blocks []map[string]interface{}
|
||||
for _, img := range msg.Images {
|
||||
blocks = append(blocks, map[string]interface{}{
|
||||
"type": "image",
|
||||
"source": map[string]interface{}{
|
||||
"type": "base64",
|
||||
"media_type": img.MimeType,
|
||||
"data": img.Data,
|
||||
},
|
||||
})
|
||||
}
|
||||
if msg.Content != "" {
|
||||
blocks = append(blocks, map[string]interface{}{
|
||||
"type": "text",
|
||||
"text": msg.Content,
|
||||
})
|
||||
}
|
||||
messages = append(messages, map[string]interface{}{
|
||||
"role": "user",
|
||||
"content": blocks,
|
||||
})
|
||||
} else {
|
||||
messages = append(messages, map[string]interface{}{
|
||||
"role": "user",
|
||||
"content": msg.Content,
|
||||
})
|
||||
}
|
||||
|
||||
case "assistant":
|
||||
var blocks []map[string]interface{}
|
||||
|
||||
@@ -18,6 +18,7 @@ type OpenAIProvider struct {
|
||||
name string
|
||||
apiKey string
|
||||
apiBase string
|
||||
chatPath string // defaults to "/chat/completions"
|
||||
defaultModel string
|
||||
client *http.Client
|
||||
retryConfig RetryConfig
|
||||
@@ -33,14 +34,23 @@ func NewOpenAIProvider(name, apiKey, apiBase, defaultModel string) *OpenAIProvid
|
||||
name: name,
|
||||
apiKey: apiKey,
|
||||
apiBase: apiBase,
|
||||
chatPath: "/chat/completions",
|
||||
defaultModel: defaultModel,
|
||||
client: &http.Client{Timeout: 120 * time.Second},
|
||||
retryConfig: DefaultRetryConfig(),
|
||||
}
|
||||
}
|
||||
|
||||
// WithChatPath returns a copy with a custom chat completions path (e.g. "/text/chatcompletion_v2" for MiniMax native API).
|
||||
func (p *OpenAIProvider) WithChatPath(path string) *OpenAIProvider {
|
||||
p.chatPath = path
|
||||
return p
|
||||
}
|
||||
|
||||
func (p *OpenAIProvider) Name() string { return p.name }
|
||||
func (p *OpenAIProvider) DefaultModel() string { return p.defaultModel }
|
||||
func (p *OpenAIProvider) APIKey() string { return p.apiKey }
|
||||
func (p *OpenAIProvider) APIBase() string { return p.apiBase }
|
||||
|
||||
// resolveModel returns the model ID to use for a request.
|
||||
// For OpenRouter, model IDs require a provider prefix (e.g. "anthropic/claude-sonnet-4-5-20250929").
|
||||
@@ -184,7 +194,24 @@ func (p *OpenAIProvider) buildRequestBody(model string, req ChatRequest, stream
|
||||
|
||||
// Include content; omit empty content for assistant messages with tool_calls
|
||||
// (Gemini rejects empty content → "must include at least one parts field").
|
||||
if m.Content != "" || len(m.ToolCalls) == 0 {
|
||||
if m.Role == "user" && len(m.Images) > 0 {
|
||||
var parts []map[string]interface{}
|
||||
for _, img := range m.Images {
|
||||
parts = append(parts, map[string]interface{}{
|
||||
"type": "image_url",
|
||||
"image_url": map[string]interface{}{
|
||||
"url": fmt.Sprintf("data:%s;base64,%s", img.MimeType, img.Data),
|
||||
},
|
||||
})
|
||||
}
|
||||
if m.Content != "" {
|
||||
parts = append(parts, map[string]interface{}{
|
||||
"type": "text",
|
||||
"text": m.Content,
|
||||
})
|
||||
}
|
||||
msg["content"] = parts
|
||||
} else if m.Content != "" || len(m.ToolCalls) == 0 {
|
||||
msg["content"] = m.Content
|
||||
}
|
||||
|
||||
@@ -247,7 +274,7 @@ func (p *OpenAIProvider) doRequest(ctx context.Context, body interface{}) (io.Re
|
||||
return nil, fmt.Errorf("%s: marshal request: %w", p.name, err)
|
||||
}
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, "POST", p.apiBase+"/chat/completions", bytes.NewReader(data))
|
||||
httpReq, err := http.NewRequestWithContext(ctx, "POST", p.apiBase+p.chatPath, bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%s: create request: %w", p.name, err)
|
||||
}
|
||||
|
||||
@@ -42,12 +42,19 @@ type StreamChunk struct {
|
||||
Done bool `json:"done,omitempty"`
|
||||
}
|
||||
|
||||
// ImageContent is a single base64-encoded image passed to vision-capable
// models.
type ImageContent struct {
	// MimeType is the image's MIME type, e.g. "image/jpeg".
	MimeType string `json:"mime_type"`
	// Data holds the base64-encoded image bytes.
	Data string `json:"data"`
}
|
||||
|
||||
// Message represents a conversation message.
|
||||
type Message struct {
|
||||
Role string `json:"role"` // "system", "user", "assistant", "tool"
|
||||
Content string `json:"content"`
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
|
||||
ToolCallID string `json:"tool_call_id,omitempty"` // for role="tool" responses
|
||||
Role string `json:"role"` // "system", "user", "assistant", "tool"
|
||||
Content string `json:"content"`
|
||||
Images []ImageContent `json:"images,omitempty"` // vision: base64 images
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
|
||||
ToolCallID string `json:"tool_call_id,omitempty"` // for role="tool" responses
|
||||
}
|
||||
|
||||
// ToolCall represents a tool invocation requested by the LLM.
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package tools
|
||||
|
||||
import "context"
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/config"
|
||||
)
|
||||
|
||||
// Tool execution context keys.
|
||||
// These replace mutable setter fields on tool instances, making tools thread-safe
|
||||
@@ -71,3 +75,28 @@ func ToolWorkspaceFromCtx(ctx context.Context) string {
|
||||
v, _ := ctx.Value(ctxWorkspace).(string)
|
||||
return v
|
||||
}
|
||||
|
||||
// --- Vision / ImageGen config (per-agent overrides) ---
|
||||
|
||||
const (
|
||||
ctxVisionConfig toolContextKey = "tool_vision_config"
|
||||
ctxImageGenConfig toolContextKey = "tool_imagegen_config"
|
||||
)
|
||||
|
||||
func WithVisionConfig(ctx context.Context, cfg *config.VisionConfig) context.Context {
|
||||
return context.WithValue(ctx, ctxVisionConfig, cfg)
|
||||
}
|
||||
|
||||
func VisionConfigFromCtx(ctx context.Context) *config.VisionConfig {
|
||||
v, _ := ctx.Value(ctxVisionConfig).(*config.VisionConfig)
|
||||
return v
|
||||
}
|
||||
|
||||
func WithImageGenConfig(ctx context.Context, cfg *config.ImageGenConfig) context.Context {
|
||||
return context.WithValue(ctx, ctxImageGenConfig, cfg)
|
||||
}
|
||||
|
||||
func ImageGenConfigFromCtx(ctx context.Context) *config.ImageGenConfig {
|
||||
v, _ := ctx.Value(ctxImageGenConfig).(*config.ImageGenConfig)
|
||||
return v
|
||||
}
|
||||
|
||||
@@ -0,0 +1,288 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
||||
)
|
||||
|
||||
// credentialProvider is the small capability create_image requires from a
// registered provider: access to its API credentials. Execute type-asserts
// the provider to this interface and hands both values to the image
// generation call.
type credentialProvider interface {
	// APIBase returns the provider's API base URL.
	APIBase() string
	// APIKey returns the provider's API key.
	APIKey() string
}
|
||||
|
||||
var (
	// imageGenProviderPriority lists image generation providers in default
	// order of preference.
	imageGenProviderPriority = []string{
		"openrouter",
		"gemini",
		"openai",
	}

	// imageGenModelDefaults gives the default image generation model for each
	// provider name.
	imageGenModelDefaults = map[string]string{
		"gemini":     "gemini-2.0-flash-exp",
		"openai":     "dall-e-3",
		"openrouter": "google/gemini-2.5-flash-image",
	}
)
|
||||
|
||||
// CreateImageTool generates images using an image generation API.
|
||||
// Uses OpenRouter (Gemini image model) or OpenAI (DALL-E) via per-agent ImageGenConfig.
|
||||
type CreateImageTool struct {
|
||||
registry *providers.Registry
|
||||
}
|
||||
|
||||
func NewCreateImageTool(registry *providers.Registry) *CreateImageTool {
|
||||
return &CreateImageTool{registry: registry}
|
||||
}
|
||||
|
||||
// Name returns the tool's name, "create_image".
func (t *CreateImageTool) Name() string {
	const toolName = "create_image"
	return toolName
}
|
||||
|
||||
func (t *CreateImageTool) Description() string {
|
||||
return "Generate an image from a text description using an image generation model. Returns a MEDIA: path to the generated image file."
|
||||
}
|
||||
|
||||
func (t *CreateImageTool) Parameters() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"prompt": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Text description of the image to generate.",
|
||||
},
|
||||
"aspect_ratio": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Aspect ratio: '1:1' (default), '3:4', '4:3', '9:16', '16:9'.",
|
||||
},
|
||||
},
|
||||
"required": []string{"prompt"},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *CreateImageTool) Execute(ctx context.Context, args map[string]interface{}) *Result {
|
||||
prompt, _ := args["prompt"].(string)
|
||||
if prompt == "" {
|
||||
return ErrorResult("prompt is required")
|
||||
}
|
||||
aspectRatio, _ := args["aspect_ratio"].(string)
|
||||
if aspectRatio == "" {
|
||||
aspectRatio = "1:1"
|
||||
}
|
||||
|
||||
// Resolve provider from per-agent config or defaults
|
||||
providerName, model := t.resolveConfig(ctx)
|
||||
|
||||
p, err := t.registry.Get(providerName)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("image generation provider %q not available", providerName))
|
||||
}
|
||||
|
||||
cp, ok := p.(credentialProvider)
|
||||
if !ok {
|
||||
return ErrorResult(fmt.Sprintf("provider %q does not expose API credentials for image generation", providerName))
|
||||
}
|
||||
|
||||
slog.Info("create_image: calling image generation API",
|
||||
"provider", providerName, "model", model, "aspect_ratio", aspectRatio)
|
||||
|
||||
imageBytes, usage, err := t.callImageGenAPI(ctx, cp.APIKey(), cp.APIBase(), model, prompt, aspectRatio)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("image generation failed: %v", err))
|
||||
}
|
||||
|
||||
// Save to temp file
|
||||
imagePath := filepath.Join(os.TempDir(), fmt.Sprintf("goclaw_gen_%d.png", time.Now().UnixNano()))
|
||||
if err := os.WriteFile(imagePath, imageBytes, 0644); err != nil {
|
||||
return ErrorResult(fmt.Sprintf("failed to save generated image: %v", err))
|
||||
}
|
||||
|
||||
result := &Result{ForLLM: fmt.Sprintf("MEDIA:%s", imagePath)}
|
||||
result.Provider = providerName
|
||||
result.Model = model
|
||||
if usage != nil {
|
||||
result.Usage = usage
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// resolveConfig returns the provider name and model to use for image generation.
|
||||
func (t *CreateImageTool) resolveConfig(ctx context.Context) (providerName, model string) {
|
||||
// 1. Check per-agent ImageGenConfig from context
|
||||
if cfg := ImageGenConfigFromCtx(ctx); cfg != nil {
|
||||
if cfg.Provider != "" {
|
||||
providerName = cfg.Provider
|
||||
}
|
||||
if cfg.Model != "" {
|
||||
model = cfg.Model
|
||||
}
|
||||
}
|
||||
|
||||
// 2. If provider not set, find first available from priority list
|
||||
if providerName == "" {
|
||||
for _, name := range imageGenProviderPriority {
|
||||
if _, err := t.registry.Get(name); err == nil {
|
||||
providerName = name
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if providerName == "" {
|
||||
providerName = "openrouter" // fallback even if unavailable (error handled later)
|
||||
}
|
||||
|
||||
// 3. If model not set, use default for this provider
|
||||
if model == "" {
|
||||
if m, ok := imageGenModelDefaults[providerName]; ok {
|
||||
model = m
|
||||
}
|
||||
}
|
||||
|
||||
return providerName, model
|
||||
}
|
||||
|
||||
// callImageGenAPI calls the OpenAI-compatible image generation endpoint.
|
||||
// Works with OpenRouter (modalities: ["image","text"]) and OpenAI (/images/generations).
|
||||
func (t *CreateImageTool) callImageGenAPI(ctx context.Context, apiKey, apiBase, model, prompt, aspectRatio string) ([]byte, *providers.Usage, error) {
|
||||
// OpenRouter / OpenAI-compat: use chat completions with modalities
|
||||
body := map[string]interface{}{
|
||||
"model": model,
|
||||
"messages": []map[string]interface{}{
|
||||
{"role": "user", "content": prompt},
|
||||
},
|
||||
"modalities": []string{"image", "text"},
|
||||
}
|
||||
if aspectRatio != "" && aspectRatio != "1:1" {
|
||||
body["image_config"] = map[string]interface{}{
|
||||
"aspect_ratio": aspectRatio,
|
||||
}
|
||||
}
|
||||
|
||||
jsonBody, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("marshal request: %w", err)
|
||||
}
|
||||
|
||||
url := strings.TrimRight(apiBase, "/") + "/chat/completions"
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+apiKey)
|
||||
|
||||
client := &http.Client{Timeout: 120 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("http request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("read response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, nil, fmt.Errorf("API error %d: %s", resp.StatusCode, truncateBytes(respBody, 500))
|
||||
}
|
||||
|
||||
return t.parseImageResponse(respBody)
|
||||
}
|
||||
|
||||
// parseImageResponse extracts base64 image data from the OpenAI-compat chat response.
|
||||
// Looks for images in choices[0].message.content (multipart) or choices[0].message.images.
|
||||
func (t *CreateImageTool) parseImageResponse(respBody []byte) ([]byte, *providers.Usage, error) {
|
||||
var resp struct {
|
||||
Choices []struct {
|
||||
Message struct {
|
||||
Content interface{} `json:"content"`
|
||||
Images []struct {
|
||||
ImageURL struct {
|
||||
URL string `json:"url"`
|
||||
} `json:"image_url"`
|
||||
} `json:"images"`
|
||||
} `json:"message"`
|
||||
} `json:"choices"`
|
||||
Usage *struct {
|
||||
PromptTokens int `json:"prompt_tokens"`
|
||||
CompletionTokens int `json:"completion_tokens"`
|
||||
TotalTokens int `json:"total_tokens"`
|
||||
} `json:"usage"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(respBody, &resp); err != nil {
|
||||
return nil, nil, fmt.Errorf("parse response: %w", err)
|
||||
}
|
||||
|
||||
if len(resp.Choices) == 0 {
|
||||
return nil, nil, fmt.Errorf("no choices in response")
|
||||
}
|
||||
|
||||
msg := resp.Choices[0].Message
|
||||
|
||||
// Try images array first (OpenRouter format)
|
||||
for _, img := range msg.Images {
|
||||
if imageBytes, err := decodeDataURL(img.ImageURL.URL); err == nil {
|
||||
return imageBytes, convertUsage(resp.Usage), nil
|
||||
}
|
||||
}
|
||||
|
||||
// Try multipart content array (some providers return content as array of parts)
|
||||
if parts, ok := msg.Content.([]interface{}); ok {
|
||||
for _, part := range parts {
|
||||
if m, ok := part.(map[string]interface{}); ok {
|
||||
if m["type"] == "image_url" {
|
||||
if imgURL, ok := m["image_url"].(map[string]interface{}); ok {
|
||||
if url, ok := imgURL["url"].(string); ok {
|
||||
if imageBytes, err := decodeDataURL(url); err == nil {
|
||||
return imageBytes, convertUsage(resp.Usage), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil, fmt.Errorf("no image data found in response")
|
||||
}
|
||||
|
||||
// decodeDataURL decodes a base64 data URL such as
// "data:image/png;base64,iVBORw0KGgo..." into raw bytes.
//
// Everything up to and including the first ";base64," marker is discarded and
// the remainder is decoded with the standard base64 alphabet. Surrounding
// whitespace is ignored and trailing '=' padding is optional, so both padded
// and unpadded payloads are accepted.
//
// An error is returned when the marker is missing, the payload is empty, or
// the payload is not valid base64.
func decodeDataURL(dataURL string) ([]byte, error) {
	_, payload, found := strings.Cut(dataURL, ";base64,")
	if !found {
		return nil, fmt.Errorf("not a base64 data URL")
	}

	// Normalize to the unpadded form so one decoder handles both variants.
	payload = strings.TrimRight(strings.TrimSpace(payload), "=")
	if payload == "" {
		return nil, fmt.Errorf("empty base64 payload in data URL")
	}
	return base64.RawStdEncoding.DecodeString(payload)
}
|
||||
|
||||
func convertUsage(u *struct {
|
||||
PromptTokens int `json:"prompt_tokens"`
|
||||
CompletionTokens int `json:"completion_tokens"`
|
||||
TotalTokens int `json:"total_tokens"`
|
||||
}) *providers.Usage {
|
||||
if u == nil {
|
||||
return nil
|
||||
}
|
||||
return &providers.Usage{
|
||||
PromptTokens: u.PromptTokens,
|
||||
CompletionTokens: u.CompletionTokens,
|
||||
TotalTokens: u.TotalTokens,
|
||||
}
|
||||
}
|
||||
|
||||
// truncateBytes returns b as a string, shortened to at most max bytes with
// "..." appended when anything was cut off.
//
// The cut point is moved back (by up to three bytes) to the start of a UTF-8
// sequence so a multi-byte character is never split in half. A negative max is
// treated as zero instead of panicking.
func truncateBytes(b []byte, max int) string {
	if max < 0 {
		max = 0
	}
	if len(b) <= max {
		return string(b)
	}

	// b[cut] is the first byte that would be dropped. While it is a UTF-8
	// continuation byte (10xxxxxx) the cut sits inside a character, so step
	// back. The step count is bounded because a valid sequence has at most
	// three continuation bytes.
	cut := max
	for steps := 0; steps < 3 && cut > 0 && b[cut]&0xC0 == 0x80; steps++ {
		cut--
	}
	return string(b[:cut]) + "..."
}
|
||||
@@ -25,7 +25,7 @@ var toolGroups = map[string][]string{
|
||||
"browser", "canvas", "nodes", "cron", "message", "gateway",
|
||||
"agents_list", "sessions_list", "sessions_history", "sessions_send",
|
||||
"sessions_spawn", "subagents", "session_status",
|
||||
"memory_search", "memory_get", "web_search", "web_fetch", "image",
|
||||
"memory_search", "memory_get", "web_search", "web_fetch", "read_image", "create_image",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ func UnregisterToolGroup(name string) {
|
||||
// Tool profiles define preset allow sets.
|
||||
var toolProfiles = map[string][]string{
|
||||
"minimal": {"session_status"},
|
||||
"coding": {"group:fs", "group:runtime", "group:sessions", "group:memory", "image"},
|
||||
"coding": {"group:fs", "group:runtime", "group:sessions", "group:memory", "read_image", "create_image"},
|
||||
"messaging": {"group:messaging", "sessions_list", "sessions_history", "sessions_send", "session_status"},
|
||||
"full": {}, // empty = no restrictions
|
||||
}
|
||||
|
||||
@@ -0,0 +1,139 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
||||
)
|
||||
|
||||
// --- Context helpers for media images ---
|
||||
|
||||
// ctxMediaImages is the context key under which the current message's images
// are stored (see WithMediaImages / MediaImagesFromCtx).
const ctxMediaImages = toolContextKey("tool_media_images")
|
||||
|
||||
// WithMediaImages stores base64-encoded images in context for read_image tool access.
|
||||
func WithMediaImages(ctx context.Context, images []providers.ImageContent) context.Context {
|
||||
return context.WithValue(ctx, ctxMediaImages, images)
|
||||
}
|
||||
|
||||
// MediaImagesFromCtx retrieves stored images from context.
|
||||
func MediaImagesFromCtx(ctx context.Context) []providers.ImageContent {
|
||||
v, _ := ctx.Value(ctxMediaImages).([]providers.ImageContent)
|
||||
return v
|
||||
}
|
||||
|
||||
// --- ReadImageTool ---
|
||||
|
||||
// visionProviderPriority lists provider names in the order they are tried when
// looking for a provider to handle a vision request.
var visionProviderPriority = []string{
	"gemini",
	"anthropic",
	"openrouter",
}
|
||||
|
||||
// visionModelOverrides names, per provider, the model to use for vision
// requests in place of the provider's default model. Providers without an
// entry keep their default.
var visionModelOverrides = map[string]string{"openrouter": "google/gemini-2.0-flash-001"}
|
||||
|
||||
// ReadImageTool uses a vision-capable provider to describe images attached to the current message.
|
||||
type ReadImageTool struct {
|
||||
registry *providers.Registry
|
||||
}
|
||||
|
||||
func NewReadImageTool(registry *providers.Registry) *ReadImageTool {
|
||||
return &ReadImageTool{registry: registry}
|
||||
}
|
||||
|
||||
// Name returns the identifier under which the tool is registered.
func (t *ReadImageTool) Name() string {
	const toolName = "read_image"
	return toolName
}
|
||||
|
||||
func (t *ReadImageTool) Description() string {
|
||||
return "Analyze images attached to the current message using a vision model. Use this when you see <media:image> tags but cannot view images directly."
|
||||
}
|
||||
|
||||
func (t *ReadImageTool) Parameters() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"prompt": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "What you want to know about the image(s). E.g. 'Describe this image in detail' or 'What text is in this image?'",
|
||||
},
|
||||
},
|
||||
"required": []string{"prompt"},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *ReadImageTool) Execute(ctx context.Context, args map[string]interface{}) *Result {
|
||||
prompt, _ := args["prompt"].(string)
|
||||
if prompt == "" {
|
||||
prompt = "Describe this image in detail."
|
||||
}
|
||||
|
||||
images := MediaImagesFromCtx(ctx)
|
||||
if len(images) == 0 {
|
||||
return ErrorResult("No images available in this conversation. The user may not have sent an image.")
|
||||
}
|
||||
|
||||
// Find a vision-capable provider (per-agent config > hardcoded priority)
|
||||
provider, model, err := t.resolveVisionProviderWithConfig(ctx)
|
||||
if err != nil {
|
||||
return ErrorResult(err.Error())
|
||||
}
|
||||
|
||||
slog.Info("read_image: calling vision provider", "provider", provider.Name(), "model", model, "images", len(images))
|
||||
|
||||
resp, err := provider.Chat(ctx, providers.ChatRequest{
|
||||
Messages: []providers.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: prompt,
|
||||
Images: images,
|
||||
},
|
||||
},
|
||||
Model: model,
|
||||
Options: map[string]interface{}{
|
||||
"max_tokens": 1024,
|
||||
"temperature": 0.3,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("Vision provider error: %v", err))
|
||||
}
|
||||
|
||||
result := NewResult(resp.Content)
|
||||
result.Usage = resp.Usage
|
||||
result.Provider = provider.Name()
|
||||
result.Model = model
|
||||
return result
|
||||
}
|
||||
|
||||
// resolveVisionProviderWithConfig checks per-agent VisionConfig first, then falls back to hardcoded priority.
|
||||
func (t *ReadImageTool) resolveVisionProviderWithConfig(ctx context.Context) (providers.Provider, string, error) {
|
||||
if cfg := VisionConfigFromCtx(ctx); cfg != nil && cfg.Provider != "" {
|
||||
p, err := t.registry.Get(cfg.Provider)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("configured vision provider %q not available: %w", cfg.Provider, err)
|
||||
}
|
||||
model := cfg.Model
|
||||
if model == "" {
|
||||
model = p.DefaultModel()
|
||||
}
|
||||
return p, model, nil
|
||||
}
|
||||
return t.resolveVisionProvider()
|
||||
}
|
||||
|
||||
// resolveVisionProvider finds the first available vision-capable provider.
|
||||
func (t *ReadImageTool) resolveVisionProvider() (providers.Provider, string, error) {
|
||||
for _, name := range visionProviderPriority {
|
||||
p, err := t.registry.Get(name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
model := p.DefaultModel()
|
||||
if override, ok := visionModelOverrides[name]; ok {
|
||||
model = override
|
||||
}
|
||||
return p, model, nil
|
||||
}
|
||||
return nil, "", fmt.Errorf("no vision-capable provider available (need one of: %v)", visionProviderPriority)
|
||||
}
|
||||
@@ -1,5 +1,7 @@
|
||||
package tools
|
||||
|
||||
import "github.com/nextlevelbuilder/goclaw/internal/providers"
|
||||
|
||||
// Result is the unified return type from tool execution.
|
||||
type Result struct {
|
||||
ForLLM string `json:"for_llm"` // content sent to the LLM
|
||||
@@ -8,6 +10,12 @@ type Result struct {
|
||||
IsError bool `json:"is_error"` // marks error
|
||||
Async bool `json:"async"` // running asynchronously
|
||||
Err error `json:"-"` // internal error (not serialized)
|
||||
|
||||
// Usage holds token usage from tools that make internal LLM calls (e.g. read_image).
|
||||
// When set, the agent loop records these on the tool span for tracing.
|
||||
Usage *providers.Usage `json:"-"`
|
||||
Provider string `json:"-"` // provider name (for tool span metadata)
|
||||
Model string `json:"-"` // model used (for tool span metadata)
|
||||
}
|
||||
|
||||
func NewResult(forLLM string) *Result {
|
||||
|
||||
@@ -255,7 +255,7 @@ function SpanTreeNode({ node, depth }: { node: SpanNode; depth: number }) {
|
||||
{span.input_preview && (
|
||||
<div>
|
||||
<p className="text-xs text-muted-foreground">Input:</p>
|
||||
<pre className="mt-1 max-h-[40vh] overflow-y-auto whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
|
||||
<pre className="mt-1 max-h-[40vh] overflow-y-auto break-all whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
|
||||
{span.input_preview}
|
||||
</pre>
|
||||
</div>
|
||||
@@ -263,7 +263,7 @@ function SpanTreeNode({ node, depth }: { node: SpanNode; depth: number }) {
|
||||
{span.output_preview && (
|
||||
<div>
|
||||
<p className="text-xs text-muted-foreground">Output:</p>
|
||||
<pre className="mt-1 max-h-[40vh] overflow-y-auto whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
|
||||
<pre className="mt-1 max-h-[40vh] overflow-y-auto break-all whitespace-pre-wrap rounded bg-muted/50 p-2 text-xs">
|
||||
{span.output_preview}
|
||||
</pre>
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user