mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 06:10:46 +00:00
c25e770d43
* feat(ui): multi-skill upload with client-side validation
Allow uploading multiple skill ZIP files at once with pre-upload
validation. JSZip parses each ZIP client-side to verify SKILL.md
presence, frontmatter format, and slug validity before upload.
- Add JSZip dependency (lazy-loaded, code-split ~30KB gzip)
- Create validate-skill-zip.ts mirroring server-side checks
- Rewrite skill-upload-dialog for multi-file with status badges
- Add concurrent validation, sequential upload with per-file progress
- Add empty SKILL.md check to backend upload handler
- Add i18n keys for all new UI strings (en/vi/zh)
* fix(ui): duplicate entries and validation hang in multi-skill upload
- Move pending list construction to assignment inside updater return
to prevent StrictMode double-invoke from pushing duplicates
- Wrap per-file validateSkillZip in try/catch so one failure doesn't
block Promise.all and leave entries stuck in "validating" state
* fix(ui): use static import for JSZip instead of dynamic import
Dynamic import("jszip") fails in browser - bare module specifiers
don't resolve at runtime. Use static import which Vite handles
via its module graph and code-splits automatically.
* feat(ui): add inline visibility toggle on skills table
Click the visibility badge on managed skills to cycle through
private → internal → public. File-based skills stay read-only.
* fix(ui): move dedup logic outside state updater in upload dialog
Avoids reading stale entries inside functional updater. Builds
pending list from current entries state before calling setEntries.
* fix(ui): auto-select first active agent when current agent unavailable
When agents load from API, if the current selected agent is not in the active agents list, automatically select the first available active agent instead of remaining unset. Prevents chat page from being unable to send messages when default agent selection is invalid.
* feat(ui): make agent display name editable in setup wizard
Allow users to customize the agent display name during onboarding instead of keeping it hardcoded to "GoClaw". Removed read-only state from the display name input and added a placeholder for guidance.
* feat: add document path enrichment and media filename support
Backend changes:
- enrichDocumentPaths() in agent/media.go: injects persisted file paths into <media:document> tags
- Document paths allow skills (e.g. pdf skill via exec) to access files directly
- chat.go: support new media format {path, filename} alongside legacy string paths
- Updated read_document tool description to guide agent on using path attribute
- Docker: add pypdf to Python dependencies for PDF processing
- Softened MUST language in read_* tool descriptions (changed to Call this)
Frontend changes:
- chat-input.tsx: attach filename with each uploaded file in media payload
- use-chat-send.ts: send media as {path, filename} objects instead of just paths
- i18n: add "uploaded_files" text in en, vi, zh locales
- chat-page.tsx: minor adjustment for media handling
Enables skills to process uploaded documents directly without intermediate copying.
158 lines
5.3 KiB
Go
158 lines
5.3 KiB
Go
package tools
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"strings"
|
|
|
|
"github.com/nextlevelbuilder/goclaw/internal/providers"
|
|
)
|
|
|
|
// textReadableMIMEs is the set of MIME types whose raw bytes are handed back
// to the caller as-is, skipping LLM analysis entirely.
var textReadableMIMEs = map[string]bool{
	// application/* types that are plain text in practice
	"application/javascript": true,
	"application/json":       true,
	"application/xml":        true,
	"application/yaml":       true,

	// text/* types
	"text/css":      true,
	"text/csv":      true,
	"text/html":     true,
	"text/markdown": true,
	"text/plain":    true,
	"text/xml":      true,
	"text/yaml":     true,
}
|
|
|
|
// documentMaxTextBytes caps how many bytes of a text document are returned
// directly (500KB).
const documentMaxTextBytes = 500 << 10
|
|
|
|
// --- Context helpers for media documents ---
|
|
|
|
// ctxMediaDocRefs is the context key under which document MediaRefs are stored.
const ctxMediaDocRefs = toolContextKey("tool_media_doc_refs")
|
|
|
|
// WithMediaDocRefs stores document MediaRefs in context for read_document tool access.
|
|
func WithMediaDocRefs(ctx context.Context, refs []providers.MediaRef) context.Context {
|
|
return context.WithValue(ctx, ctxMediaDocRefs, refs)
|
|
}
|
|
|
|
// MediaDocRefsFromCtx retrieves stored document MediaRefs from context.
|
|
func MediaDocRefsFromCtx(ctx context.Context) []providers.MediaRef {
|
|
v, _ := ctx.Value(ctxMediaDocRefs).([]providers.MediaRef)
|
|
return v
|
|
}
|
|
|
|
// --- ReadDocumentTool ---
|
|
|
|
// documentMaxBytes is the largest file, in bytes, accepted for document
// analysis (20MB).
const documentMaxBytes = 20 << 20
|
|
|
|
// documentProviderPriority lists provider names in the order they are tried
// for document analysis. Gemini goes first because it has the best native PDF
// support (50MB, 258 tokens/page).
//
// NOTE(review): the previous comment stated that "alibaba" is included as an
// alias for dashscope (a common DB registration name), but no such entry is in
// the list. Confirm whether it should be added or is resolved elsewhere.
var documentProviderPriority = []string{
	"gemini",
	"anthropic",
	"openrouter",
	"dashscope",
}
|
|
|
|
// documentModelDefaults gives, per provider name, the model preferred for
// document work. Providers without an entry here have no default in this map.
var documentModelDefaults = map[string]string{
	"dashscope":  "qwen-vl-max",
	"gemini":     "gemini-2.5-flash",
	"openrouter": "google/gemini-2.5-flash",
}
|
|
|
|
// ReadDocumentTool uses a document-capable provider to analyze files
|
|
// attached to the current conversation. Follows same pattern as ReadImageTool.
|
|
type ReadDocumentTool struct {
|
|
registry *providers.Registry
|
|
mediaLoader MediaPathLoader
|
|
}
|
|
|
|
func NewReadDocumentTool(registry *providers.Registry, mediaLoader MediaPathLoader) *ReadDocumentTool {
|
|
return &ReadDocumentTool{registry: registry, mediaLoader: mediaLoader}
|
|
}
|
|
|
|
// Name returns the identifier under which this tool is exposed.
func (t *ReadDocumentTool) Name() string {
	const toolName = "read_document"
	return toolName
}
|
|
|
|
func (t *ReadDocumentTool) Description() string {
|
|
return "Analyze documents (PDF, DOCX, images of documents, etc.) attached to the conversation. " +
|
|
"Use when you see <media:document> tags and need to extract or analyze document content. " +
|
|
"Specify what you want to extract or analyze."
|
|
}
|
|
|
|
func (t *ReadDocumentTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"prompt": map[string]any{
|
|
"type": "string",
|
|
"description": "What to analyze. E.g. 'Extract all tables', 'Summarize key findings', 'What does page 3 say?'",
|
|
},
|
|
"media_id": map[string]any{
|
|
"type": "string",
|
|
"description": "Optional: specific media_id from <media:document> tag. If omitted, uses most recent document.",
|
|
},
|
|
},
|
|
"required": []string{"prompt"},
|
|
}
|
|
}
|
|
|
|
func (t *ReadDocumentTool) Execute(ctx context.Context, args map[string]any) *Result {
|
|
prompt, _ := args["prompt"].(string)
|
|
if prompt == "" {
|
|
prompt = "Analyze this document and describe its contents."
|
|
}
|
|
mediaID, _ := args["media_id"].(string)
|
|
|
|
// Resolve document file path from MediaRefs in context.
|
|
docPath, docMime, err := t.resolveDocumentFile(ctx, mediaID)
|
|
if err != nil {
|
|
return ErrorResult(err.Error())
|
|
}
|
|
|
|
slog.Info("read_document: resolved file", "path", docPath, "mime", docMime, "media_id", mediaID)
|
|
|
|
// Read document file.
|
|
data, err := os.ReadFile(docPath)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("Failed to read document file: %v", err))
|
|
}
|
|
slog.Info("read_document: file loaded", "size_bytes", len(data))
|
|
if len(data) > documentMaxBytes {
|
|
return ErrorResult(fmt.Sprintf("Document too large: %d bytes (max %d)", len(data), documentMaxBytes))
|
|
}
|
|
|
|
// Fast path: text-readable files — return content directly without LLM.
|
|
if textReadableMIMEs[docMime] || strings.HasPrefix(docMime, "text/") {
|
|
content := string(data)
|
|
if len(data) > documentMaxTextBytes {
|
|
content = content[:documentMaxTextBytes] + "\n\n[... truncated at 500KB ...]"
|
|
}
|
|
slog.Info("read_document: returning text content directly", "mime", docMime, "size", len(data))
|
|
return NewResult(content)
|
|
}
|
|
|
|
chain := ResolveMediaProviderChain(ctx, "read_document", "", "",
|
|
documentProviderPriority, documentModelDefaults, t.registry)
|
|
|
|
// Inject prompt, data, and mime into each chain entry's params
|
|
for i := range chain {
|
|
if chain[i].Params == nil {
|
|
chain[i].Params = make(map[string]any)
|
|
}
|
|
chain[i].Params["prompt"] = prompt
|
|
chain[i].Params["data"] = data
|
|
chain[i].Params["mime"] = docMime
|
|
}
|
|
|
|
chainResult, err := ExecuteWithChain(ctx, chain, t.registry, t.callProvider)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("Document analysis failed: %v", err))
|
|
}
|
|
|
|
result := NewResult(string(chainResult.Data))
|
|
result.Usage = chainResult.Usage
|
|
result.Provider = chainResult.Provider
|
|
result.Model = chainResult.Model
|
|
return result
|
|
}
|