mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 00:13:42 +00:00
da98a54393
Screenshots saved to {workspace}/screenshots/, TTS audio to {workspace}/tts/.
Agents can now access these files via read_file. Falls back to os.TempDir()
when workspace is unavailable.
425 lines
13 KiB
Go
425 lines
13 KiB
Go
package browser
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
|
|
"github.com/nextlevelbuilder/goclaw/internal/store"
|
|
"github.com/nextlevelbuilder/goclaw/internal/tools"
|
|
)
|
|
|
|
// BrowserTool implements tools.Tool for browser automation.
|
|
type BrowserTool struct {
|
|
manager *Manager
|
|
}
|
|
|
|
// NewBrowserTool creates a BrowserTool wrapping a Manager.
|
|
func NewBrowserTool(manager *Manager) *BrowserTool {
|
|
return &BrowserTool{manager: manager}
|
|
}
|
|
|
|
func (t *BrowserTool) Name() string { return "browser" }
|
|
|
|
func (t *BrowserTool) Description() string {
|
|
return `Control a browser to navigate web pages, take accessibility snapshots, and interact with elements.
|
|
|
|
Actions:
|
|
- status: Get browser status
|
|
- start: Launch browser
|
|
- stop: Close browser
|
|
- tabs: List open tabs
|
|
- open: Open a new tab (requires targetUrl)
|
|
- close: Close a tab (requires targetId)
|
|
- snapshot: Get page accessibility tree with element refs (use targetId, maxChars, interactive, compact, depth)
|
|
- screenshot: Capture page screenshot (use targetId, fullPage)
|
|
- navigate: Navigate tab to URL (requires targetId, targetUrl)
|
|
- console: Get browser console messages (requires targetId)
|
|
- act: Interact with elements (requires request object with kind, ref, etc.)
|
|
|
|
Act kinds: click, type, press, hover, wait, evaluate
|
|
- click: Click element (request: {kind:"click", ref:"e1"})
|
|
- type: Type text (request: {kind:"type", ref:"e1", text:"hello"})
|
|
- press: Press key (request: {kind:"press", key:"Enter"})
|
|
- hover: Hover element (request: {kind:"hover", ref:"e1"})
|
|
- wait: Wait for condition (request: {kind:"wait", timeMs:1000} or {kind:"wait", text:"loaded"})
|
|
- evaluate: Run JavaScript (request: {kind:"evaluate", fn:"document.title"})
|
|
|
|
Workflow: start → open URL → snapshot (get refs) → act (use refs) → snapshot again`
|
|
}
|
|
|
|
func (t *BrowserTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"action": map[string]any{
|
|
"type": "string",
|
|
"enum": []string{"status", "start", "stop", "tabs", "open", "close", "snapshot", "screenshot", "navigate", "console", "act"},
|
|
"description": "The browser action to perform",
|
|
},
|
|
"targetUrl": map[string]any{
|
|
"type": "string",
|
|
"description": "URL for open/navigate actions",
|
|
},
|
|
"targetId": map[string]any{
|
|
"type": "string",
|
|
"description": "Tab target ID (omit for current tab)",
|
|
},
|
|
"maxChars": map[string]any{
|
|
"type": "number",
|
|
"description": "Max characters for snapshot (default 8000)",
|
|
},
|
|
"interactive": map[string]any{
|
|
"type": "boolean",
|
|
"description": "Only show interactive elements in snapshot",
|
|
},
|
|
"compact": map[string]any{
|
|
"type": "boolean",
|
|
"description": "Remove empty structural elements from snapshot",
|
|
},
|
|
"depth": map[string]any{
|
|
"type": "number",
|
|
"description": "Max depth for snapshot tree",
|
|
},
|
|
"fullPage": map[string]any{
|
|
"type": "boolean",
|
|
"description": "Capture full page screenshot",
|
|
},
|
|
"timeoutMs": map[string]any{
|
|
"type": "number",
|
|
"description": "Timeout in milliseconds for actions",
|
|
},
|
|
"request": map[string]any{
|
|
"type": "object",
|
|
"description": "Action request for 'act' command",
|
|
"properties": map[string]any{
|
|
"kind": map[string]any{
|
|
"type": "string",
|
|
"enum": []string{"click", "type", "press", "hover", "wait", "evaluate"},
|
|
"description": "The interaction kind",
|
|
},
|
|
"ref": map[string]any{
|
|
"type": "string",
|
|
"description": "Element ref from snapshot (e.g. e1, e2)",
|
|
},
|
|
"text": map[string]any{
|
|
"type": "string",
|
|
"description": "Text to type",
|
|
},
|
|
"key": map[string]any{
|
|
"type": "string",
|
|
"description": "Key to press (e.g. Enter, Tab, Escape)",
|
|
},
|
|
"submit": map[string]any{
|
|
"type": "boolean",
|
|
"description": "Press Enter after typing",
|
|
},
|
|
"fn": map[string]any{
|
|
"type": "string",
|
|
"description": "JavaScript to evaluate",
|
|
},
|
|
"timeMs": map[string]any{
|
|
"type": "number",
|
|
"description": "Wait time in milliseconds",
|
|
},
|
|
},
|
|
},
|
|
},
|
|
"required": []string{"action"},
|
|
}
|
|
}
|
|
|
|
func (t *BrowserTool) Execute(ctx context.Context, args map[string]any) *tools.Result {
|
|
action, _ := args["action"].(string)
|
|
if action == "" {
|
|
return tools.ErrorResult("action is required")
|
|
}
|
|
|
|
// Propagate tenant ID from store context to browser context for page isolation.
|
|
if tid := store.TenantIDFromContext(ctx); tid.String() != "00000000-0000-0000-0000-000000000000" {
|
|
ctx = WithTenantID(ctx, tid.String())
|
|
}
|
|
|
|
// Auto-start browser for actions that need it
|
|
switch action {
|
|
case "open", "snapshot", "screenshot", "navigate", "act", "tabs":
|
|
if err := t.manager.Start(ctx); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("failed to start browser: %v", err))
|
|
}
|
|
}
|
|
|
|
// Apply per-action timeout for heavy operations
|
|
switch action {
|
|
case "open", "navigate", "snapshot", "screenshot", "act":
|
|
timeout := t.manager.ActionTimeout()
|
|
if ms, ok := args["timeoutMs"].(float64); ok && ms > 0 {
|
|
timeout = time.Duration(ms) * time.Millisecond
|
|
}
|
|
var cancel context.CancelFunc
|
|
ctx, cancel = context.WithTimeout(ctx, timeout)
|
|
defer cancel()
|
|
}
|
|
|
|
switch action {
|
|
case "status":
|
|
return t.handleStatus()
|
|
case "start":
|
|
return t.handleStart(ctx)
|
|
case "stop":
|
|
return t.handleStop(ctx)
|
|
case "tabs":
|
|
return t.handleTabs(ctx)
|
|
case "open":
|
|
return t.handleOpen(ctx, args)
|
|
case "close":
|
|
return t.handleClose(ctx, args)
|
|
case "snapshot":
|
|
return t.handleSnapshot(ctx, args)
|
|
case "screenshot":
|
|
return t.handleScreenshot(ctx, args)
|
|
case "navigate":
|
|
return t.handleNavigate(ctx, args)
|
|
case "console":
|
|
return t.handleConsole(ctx, args)
|
|
case "act":
|
|
return t.handleAct(ctx, args)
|
|
default:
|
|
return tools.ErrorResult(fmt.Sprintf("unknown action: %s", action))
|
|
}
|
|
}
|
|
|
|
func (t *BrowserTool) handleStatus() *tools.Result {
|
|
status := t.manager.Status()
|
|
return jsonResult(status)
|
|
}
|
|
|
|
func (t *BrowserTool) handleStart(ctx context.Context) *tools.Result {
|
|
if err := t.manager.Start(ctx); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("failed to start browser: %v", err))
|
|
}
|
|
return tools.NewResult("Browser started successfully.")
|
|
}
|
|
|
|
func (t *BrowserTool) handleStop(ctx context.Context) *tools.Result {
|
|
if err := t.manager.Stop(ctx); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("failed to stop browser: %v", err))
|
|
}
|
|
return tools.NewResult("Browser stopped.")
|
|
}
|
|
|
|
func (t *BrowserTool) handleTabs(ctx context.Context) *tools.Result {
|
|
tabs, err := t.manager.ListTabs(ctx)
|
|
if err != nil {
|
|
return tools.ErrorResult(err.Error())
|
|
}
|
|
return jsonResult(tabs)
|
|
}
|
|
|
|
func (t *BrowserTool) handleOpen(ctx context.Context, args map[string]any) *tools.Result {
|
|
url, _ := args["targetUrl"].(string)
|
|
if url == "" {
|
|
return tools.ErrorResult("targetUrl is required for open action")
|
|
}
|
|
tab, err := t.manager.OpenTab(ctx, url)
|
|
if err != nil {
|
|
return tools.ErrorResult(err.Error())
|
|
}
|
|
return jsonResult(tab)
|
|
}
|
|
|
|
func (t *BrowserTool) handleClose(ctx context.Context, args map[string]any) *tools.Result {
|
|
targetID, _ := args["targetId"].(string)
|
|
if err := t.manager.CloseTab(ctx, targetID); err != nil {
|
|
return tools.ErrorResult(err.Error())
|
|
}
|
|
return tools.NewResult("Tab closed.")
|
|
}
|
|
|
|
func (t *BrowserTool) handleSnapshot(ctx context.Context, args map[string]any) *tools.Result {
|
|
targetID, _ := args["targetId"].(string)
|
|
opts := DefaultSnapshotOptions()
|
|
|
|
if mc, ok := args["maxChars"].(float64); ok {
|
|
opts.MaxChars = int(mc)
|
|
}
|
|
if inter, ok := args["interactive"].(bool); ok {
|
|
opts.Interactive = inter
|
|
}
|
|
if comp, ok := args["compact"].(bool); ok {
|
|
opts.Compact = comp
|
|
}
|
|
if d, ok := args["depth"].(float64); ok {
|
|
opts.MaxDepth = int(d)
|
|
}
|
|
|
|
snap, err := t.manager.Snapshot(ctx, targetID, opts)
|
|
if err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("snapshot failed: %v", err))
|
|
}
|
|
|
|
// Return snapshot text directly (optimized for LLM consumption)
|
|
header := fmt.Sprintf("Page: %s\nURL: %s\nTargetID: %s\nStats: %d refs, %d interactive\n\n",
|
|
snap.Title, snap.URL, snap.TargetID, snap.Stats.Refs, snap.Stats.Interactive)
|
|
return tools.NewResult(header + snap.Snapshot)
|
|
}
|
|
|
|
func (t *BrowserTool) handleScreenshot(ctx context.Context, args map[string]any) *tools.Result {
|
|
targetID, _ := args["targetId"].(string)
|
|
fullPage, _ := args["fullPage"].(bool)
|
|
|
|
data, err := t.manager.Screenshot(ctx, targetID, fullPage)
|
|
if err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("screenshot failed: %v", err))
|
|
}
|
|
|
|
// Save to workspace/screenshots/ so the agent can access the file.
|
|
// Falls back to os.TempDir() if workspace is not available.
|
|
screenshotDir := filepath.Join(os.TempDir(), "goclaw_screenshots")
|
|
if ws := tools.ToolWorkspaceFromCtx(ctx); ws != "" {
|
|
screenshotDir = filepath.Join(ws, "screenshots")
|
|
}
|
|
if err := os.MkdirAll(screenshotDir, 0755); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("failed to create screenshots directory: %v", err))
|
|
}
|
|
imagePath := filepath.Join(screenshotDir, fmt.Sprintf("screenshot_%d.png", time.Now().UnixNano()))
|
|
if err := os.WriteFile(imagePath, data, 0644); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("failed to save screenshot: %v", err))
|
|
}
|
|
|
|
return &tools.Result{ForLLM: fmt.Sprintf("MEDIA:%s", imagePath)}
|
|
}
|
|
|
|
func (t *BrowserTool) handleNavigate(ctx context.Context, args map[string]any) *tools.Result {
|
|
targetID, _ := args["targetId"].(string)
|
|
url, _ := args["targetUrl"].(string)
|
|
if url == "" {
|
|
return tools.ErrorResult("targetUrl is required for navigate action")
|
|
}
|
|
|
|
if err := t.manager.Navigate(ctx, targetID, url); err != nil {
|
|
return tools.ErrorResult(err.Error())
|
|
}
|
|
return tools.NewResult(fmt.Sprintf("Navigated to %s", url))
|
|
}
|
|
|
|
func (t *BrowserTool) handleConsole(ctx context.Context, args map[string]any) *tools.Result {
|
|
targetID, _ := args["targetId"].(string)
|
|
msgs := t.manager.ConsoleMessages(ctx, targetID)
|
|
return jsonResult(msgs)
|
|
}
|
|
|
|
func (t *BrowserTool) handleAct(ctx context.Context, args map[string]any) *tools.Result {
|
|
req, ok := args["request"].(map[string]any)
|
|
if !ok {
|
|
return tools.ErrorResult("request object is required for act action")
|
|
}
|
|
|
|
kind, _ := req["kind"].(string)
|
|
if kind == "" {
|
|
return tools.ErrorResult("request.kind is required")
|
|
}
|
|
|
|
targetID, _ := args["targetId"].(string)
|
|
|
|
switch kind {
|
|
case "click":
|
|
ref, _ := req["ref"].(string)
|
|
if ref == "" {
|
|
return tools.ErrorResult("request.ref is required for click")
|
|
}
|
|
opts := ClickOpts{}
|
|
if dc, ok := req["doubleClick"].(bool); ok {
|
|
opts.DoubleClick = dc
|
|
}
|
|
if btn, ok := req["button"].(string); ok {
|
|
opts.Button = btn
|
|
}
|
|
if err := t.manager.Click(ctx, targetID, ref, opts); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("click failed: %v", err))
|
|
}
|
|
return tools.NewResult("Clicked successfully.")
|
|
|
|
case "type":
|
|
ref, _ := req["ref"].(string)
|
|
if ref == "" {
|
|
return tools.ErrorResult("request.ref is required for type")
|
|
}
|
|
text, _ := req["text"].(string)
|
|
opts := TypeOpts{}
|
|
if sub, ok := req["submit"].(bool); ok {
|
|
opts.Submit = sub
|
|
}
|
|
if sl, ok := req["slowly"].(bool); ok {
|
|
opts.Slowly = sl
|
|
}
|
|
if err := t.manager.Type(ctx, targetID, ref, text, opts); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("type failed: %v", err))
|
|
}
|
|
return tools.NewResult("Typed successfully.")
|
|
|
|
case "press":
|
|
key, _ := req["key"].(string)
|
|
if key == "" {
|
|
return tools.ErrorResult("request.key is required for press")
|
|
}
|
|
if err := t.manager.Press(ctx, targetID, key); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("press failed: %v", err))
|
|
}
|
|
return tools.NewResult(fmt.Sprintf("Pressed %s.", key))
|
|
|
|
case "hover":
|
|
ref, _ := req["ref"].(string)
|
|
if ref == "" {
|
|
return tools.ErrorResult("request.ref is required for hover")
|
|
}
|
|
if err := t.manager.Hover(ctx, targetID, ref); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("hover failed: %v", err))
|
|
}
|
|
return tools.NewResult("Hovered successfully.")
|
|
|
|
case "wait":
|
|
opts := WaitOpts{}
|
|
if ms, ok := req["timeMs"].(float64); ok {
|
|
opts.TimeMs = int(ms)
|
|
}
|
|
if txt, ok := req["text"].(string); ok {
|
|
opts.Text = txt
|
|
}
|
|
if tg, ok := req["textGone"].(string); ok {
|
|
opts.TextGone = tg
|
|
}
|
|
if u, ok := req["url"].(string); ok {
|
|
opts.URL = u
|
|
}
|
|
if fn, ok := req["fn"].(string); ok {
|
|
opts.Fn = fn
|
|
}
|
|
if err := t.manager.Wait(ctx, targetID, opts); err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("wait failed: %v", err))
|
|
}
|
|
return tools.NewResult("Wait condition met.")
|
|
|
|
case "evaluate":
|
|
fn, _ := req["fn"].(string)
|
|
if fn == "" {
|
|
return tools.ErrorResult("request.fn is required for evaluate")
|
|
}
|
|
result, err := t.manager.Evaluate(ctx, targetID, fn)
|
|
if err != nil {
|
|
return tools.ErrorResult(fmt.Sprintf("evaluate failed: %v", err))
|
|
}
|
|
return tools.NewResult(result)
|
|
|
|
default:
|
|
return tools.ErrorResult(fmt.Sprintf("unknown act kind: %s", kind))
|
|
}
|
|
}
|
|
|
|
func jsonResult(v any) *tools.Result {
|
|
data, _ := json.MarshalIndent(v, "", " ")
|
|
return tools.NewResult(string(data))
|
|
}
|