Files
goclaw/internal/tools/read_document_resolve.go
T
Luan Vu 405a753239 fix: resolve media provider type from DB instead of guessing from name (#154)
Media tools (create_image, create_video, create_audio, read_audio,
read_video, read_document) routed API calls based on provider name
pattern matching (e.g. strings.HasPrefix(name, "gemini")). This breaks
when users give custom names to DB providers — a Gemini provider named
"chatgpt-sap-het" would be misrouted to the OpenAI-compat endpoint,
causing 404 errors.

Fix: carry the DB provider_type through OpenAIProvider, resolve it via
typedProvider interface in ExecuteWithChain, and inject as _provider_type
param for callProvider routing. Name-based heuristic kept as fallback
for config-file providers that don't have a DB type.

Co-authored-by: Luvu182 <208665161+Luvu182@users.noreply.github.com>
2026-03-11 18:32:51 +07:00

121 lines
3.9 KiB
Go

package tools
import (
"context"
"encoding/base64"
"fmt"
"log/slog"
"path/filepath"
"strings"
"github.com/nextlevelbuilder/goclaw/internal/providers"
)
// resolveDocumentFile finds the document file path from context MediaRefs.
func (t *ReadDocumentTool) resolveDocumentFile(ctx context.Context, mediaID string) (path, mime string, err error) {
if t.mediaLoader == nil {
return "", "", fmt.Errorf("no media storage configured — cannot access document files")
}
refs := MediaDocRefsFromCtx(ctx)
if len(refs) == 0 {
return "", "", fmt.Errorf("no documents available in this conversation. The user may not have sent a document.")
}
// Find specific media_id or use most recent document.
var ref *providers.MediaRef
if mediaID != "" {
for i := range refs {
if refs[i].ID == mediaID {
ref = &refs[i]
break
}
}
if ref == nil {
return "", "", fmt.Errorf("document with media_id %q not found in conversation", mediaID)
}
} else {
// Use the last (most recent) document ref.
ref = &refs[len(refs)-1]
}
p, err := t.mediaLoader.LoadPath(ref.ID)
if err != nil {
return "", "", fmt.Errorf("document file not found: %v", err)
}
// Determine MIME type: prefer ref's stored MIME, fall back to extension.
mime = ref.MimeType
if mime == "" || mime == "application/octet-stream" {
mime = mimeFromDocExt(filepath.Ext(p))
}
return p, mime, nil
}
// callProvider dispatches document analysis to the appropriate provider API.
// For Gemini: uses native generateContent API (supports PDF natively).
// For others: uses standard Chat API with base64 document.
func (t *ReadDocumentTool) callProvider(ctx context.Context, cp credentialProvider, providerName, model string, params map[string]any) ([]byte, *providers.Usage, error) {
prompt := GetParamString(params, "prompt", "Analyze this document and describe its contents.")
data, _ := params["data"].([]byte)
mime := GetParamString(params, "mime", "application/octet-stream")
// Gemini: use native API (requires credentials; OpenAI-compat endpoint doesn't support non-image MIME types).
ptype := GetParamString(params, "_provider_type", providerTypeFromName(providerName))
if cp != nil && ptype == "gemini" {
slog.Info("read_document: using gemini native API",
"provider", providerName, "model", model,
"doc_size", len(data), "mime", mime)
resp, err := geminiNativeDocumentCall(ctx, cp.APIKey(), model, prompt, data, mime)
if err != nil {
return nil, nil, fmt.Errorf("gemini native call: %w", err)
}
return []byte(resp.Content), resp.Usage, nil
}
// Other providers: use standard Chat API with document as base64 image_url.
p, err := t.registry.Get(providerName)
if err != nil {
return nil, nil, fmt.Errorf("provider %q not available: %w", providerName, err)
}
slog.Info("read_document: using chat API", "provider", providerName, "model", model, "doc_size", len(data))
resp, err := p.Chat(ctx, providers.ChatRequest{
Messages: []providers.Message{
{
Role: "user",
Content: prompt,
Images: []providers.ImageContent{{MimeType: mime, Data: base64.StdEncoding.EncodeToString(data)}},
},
},
Model: model,
Options: map[string]any{
"max_tokens": 16384,
"temperature": 0.2,
},
})
if err != nil {
return nil, nil, fmt.Errorf("chat call: %w", err)
}
return []byte(resp.Content), resp.Usage, nil
}
// mimeFromDocExt returns MIME type for document file extensions.
func mimeFromDocExt(ext string) string {
switch strings.ToLower(ext) {
case ".pdf":
return "application/pdf"
case ".doc", ".docx":
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
case ".xls", ".xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
case ".ppt", ".pptx":
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
case ".csv":
return "text/csv"
default:
return "application/octet-stream"
}
}