mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 08:11:23 +00:00
feat(agent): adaptive tool timing with slow tool notification
Track per-tool execution time statistics in session metadata. When a tool call exceeds its adaptive threshold (2x historical max, min 120s default), send a direct outbound notification to the user. - ToolTimingMap: parse/serialize/record/threshold from session metadata - StartSlowTimer: fires once per tool call, auto-cancels on completion - Team config: slow_tool toggle (default on, always direct, never leader) - UI: toggle in team settings with i18n (en/vi/zh) - Store: add GetSessionMetadata to session store interface
This commit is contained in:
@@ -164,8 +164,12 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
// Auto-resolve team workspace for agents not dispatched via team task.
|
||||
// Lead agents default to team workspace (primary job is team coordination).
|
||||
// Non-lead members keep own workspace; team workspace is accessible via absolute path.
|
||||
// resolvedTeamSettings caches team settings from workspace resolution
|
||||
// to avoid re-querying when checking slow_tool notification config.
|
||||
var resolvedTeamSettings json.RawMessage
|
||||
if req.TeamWorkspace == "" && l.teamStore != nil && l.agentUUID != uuid.Nil {
|
||||
if team, _ := l.teamStore.GetTeamForAgent(ctx, l.agentUUID); team != nil {
|
||||
resolvedTeamSettings = team.Settings
|
||||
// Shared workspace: scope by teamID only. Isolated (default): scope by chatID too.
|
||||
wsChat := req.ChatID
|
||||
if wsChat == "" {
|
||||
@@ -243,6 +247,12 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
l.sessions.SetContextWindow(req.SessionKey, l.contextWindow)
|
||||
}
|
||||
|
||||
// 0b. Load adaptive tool timing from session metadata.
|
||||
toolTiming := ParseToolTiming(l.sessions.GetSessionMetadata(req.SessionKey))
|
||||
|
||||
// Resolve slow_tool notification config from already-loaded team settings (no extra DB query).
|
||||
slowToolEnabled := tools.ParseTeamNotifyConfig(resolvedTeamSettings).SlowTool
|
||||
|
||||
// 1. Build messages from session history
|
||||
history := l.sessions.GetHistory(req.SessionKey)
|
||||
summary := l.sessions.GetSummary(req.SessionKey)
|
||||
@@ -869,6 +879,8 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
|
||||
toolSpanStart := time.Now().UTC()
|
||||
toolSpanID := l.emitToolSpanStart(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON))
|
||||
|
||||
stopSlowTimer := toolTiming.StartSlowTimer(tc.Name, l.id, req.RunID, slowToolEnabled, emitRun)
|
||||
var result *tools.Result
|
||||
if allowedTools != nil && !allowedTools[tc.Name] {
|
||||
// Attempt lazy activation: deferred MCP tools can be activated on first call
|
||||
@@ -890,9 +902,13 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
if result == nil {
|
||||
result = l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)
|
||||
}
|
||||
stopSlowTimer()
|
||||
|
||||
l.emitToolSpanEnd(ctx, toolSpanID, toolSpanStart, result)
|
||||
|
||||
// Record tool execution time for adaptive thresholds.
|
||||
toolTiming.Record(tc.Name, time.Since(toolSpanStart).Milliseconds())
|
||||
|
||||
// Record result for loop detection.
|
||||
loopDetector.recordResult(argsHash, result.ForLLM)
|
||||
|
||||
@@ -1009,6 +1025,8 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
// Emit running span inside goroutine — goroutine-safe (channel send only).
|
||||
// End is also emitted here to prevent orphans on ctx cancellation.
|
||||
spanID := l.emitToolSpanStart(ctx, spanStart, tc.Name, tc.ID, string(argsJSON))
|
||||
|
||||
stopSlowTimer := toolTiming.StartSlowTimer(tc.Name, l.id, req.RunID, slowToolEnabled, emitRun)
|
||||
var result *tools.Result
|
||||
if allowedTools != nil && !allowedTools[tc.Name] {
|
||||
// Attempt lazy activation for deferred MCP tools.
|
||||
@@ -1030,6 +1048,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
if result == nil {
|
||||
result = l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)
|
||||
}
|
||||
stopSlowTimer()
|
||||
l.emitToolSpanEnd(ctx, spanID, spanStart, result)
|
||||
resultCh <- indexedResult{idx: idx, tc: tc, result: result, argsJSON: string(argsJSON), spanStart: spanStart}
|
||||
}(i, tc)
|
||||
@@ -1053,6 +1072,8 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
// Note: tool span start/end already emitted inside goroutines above.
|
||||
var loopStuck bool
|
||||
for _, r := range collected {
|
||||
// Record tool execution time for adaptive thresholds.
|
||||
toolTiming.Record(r.tc.Name, time.Since(r.spanStart).Milliseconds())
|
||||
|
||||
// Record for loop detection.
|
||||
argsHash := loopDetector.record(r.tc.Name, r.tc.Arguments)
|
||||
@@ -1222,6 +1243,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
|
||||
l.sessions.AddMessage(req.SessionKey, msg)
|
||||
}
|
||||
|
||||
// Persist adaptive tool timing to session metadata.
|
||||
if serialized := toolTiming.Serialize(); serialized != "" {
|
||||
l.sessions.SetSessionMetadata(req.SessionKey, map[string]string{"tool_timing": serialized})
|
||||
}
|
||||
|
||||
// Write session metadata (matching TS session entry updates)
|
||||
l.sessions.UpdateMetadata(req.SessionKey, l.model, l.provider.Name(), req.Channel)
|
||||
l.sessions.AccumulateTokens(req.SessionKey, int64(totalUsage.PromptTokens), int64(totalUsage.CompletionTokens))
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/nextlevelbuilder/goclaw/pkg/protocol"
|
||||
)
|
||||
|
||||
// defaultSlowToolThreshold is used when no historical data is available for a tool.
|
||||
const defaultSlowToolThreshold = 120 * time.Second
|
||||
|
||||
// toolTimingMultiplier determines how much slower than the historical max
|
||||
// a tool call must be before it's considered abnormally slow.
|
||||
const toolTimingMultiplier = 2.0
|
||||
|
||||
// minTimingSamples is the minimum number of samples needed before using
|
||||
// adaptive thresholds instead of the default.
|
||||
const minTimingSamples = 3
|
||||
|
||||
// ToolTimingStat tracks execution time statistics for a single tool.
|
||||
type ToolTimingStat struct {
|
||||
Min int64 `json:"min"` // minimum duration in ms
|
||||
Max int64 `json:"max"` // maximum duration in ms
|
||||
Sum int64 `json:"sum"` // total duration in ms (for avg calculation)
|
||||
Count int `json:"n"` // number of samples
|
||||
}
|
||||
|
||||
// ToolTimingMap maps tool names to their timing statistics.
|
||||
// Concurrency contract: SlowThreshold (read) may be called from goroutines,
|
||||
// but Record (write) must only be called sequentially after parallel tools complete.
|
||||
type ToolTimingMap map[string]*ToolTimingStat
|
||||
|
||||
// ParseToolTiming reads tool timing data from session metadata.
|
||||
// Returns an empty map if the key is missing or malformed.
|
||||
func ParseToolTiming(metadata map[string]string) ToolTimingMap {
|
||||
raw, ok := metadata["tool_timing"]
|
||||
if !ok || raw == "" {
|
||||
return make(ToolTimingMap)
|
||||
}
|
||||
var m ToolTimingMap
|
||||
if err := json.Unmarshal([]byte(raw), &m); err != nil {
|
||||
return make(ToolTimingMap)
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// Serialize returns the JSON string for storage in session metadata.
|
||||
func (m ToolTimingMap) Serialize() string {
|
||||
if len(m) == 0 {
|
||||
return ""
|
||||
}
|
||||
data, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
// Record adds a new timing sample for the given tool.
|
||||
func (m ToolTimingMap) Record(toolName string, durationMs int64) {
|
||||
stat, ok := m[toolName]
|
||||
if !ok {
|
||||
m[toolName] = &ToolTimingStat{
|
||||
Min: durationMs,
|
||||
Max: durationMs,
|
||||
Sum: durationMs,
|
||||
Count: 1,
|
||||
}
|
||||
return
|
||||
}
|
||||
if durationMs < stat.Min {
|
||||
stat.Min = durationMs
|
||||
}
|
||||
if durationMs > stat.Max {
|
||||
stat.Max = durationMs
|
||||
}
|
||||
stat.Sum += durationMs
|
||||
stat.Count++
|
||||
}
|
||||
|
||||
// SlowThreshold returns the duration after which a tool call is considered
|
||||
// abnormally slow. Uses adaptive threshold if enough samples exist,
|
||||
// otherwise falls back to the default.
|
||||
func (m ToolTimingMap) SlowThreshold(toolName string) time.Duration {
|
||||
stat, ok := m[toolName]
|
||||
if !ok || stat.Count < minTimingSamples {
|
||||
return defaultSlowToolThreshold
|
||||
}
|
||||
threshold := time.Duration(float64(stat.Max)*toolTimingMultiplier) * time.Millisecond
|
||||
// Never go below the default — short tools shouldn't trigger on tiny spikes.
|
||||
if threshold < defaultSlowToolThreshold {
|
||||
return defaultSlowToolThreshold
|
||||
}
|
||||
return threshold
|
||||
}
|
||||
|
||||
// StartSlowTimer starts a timer that emits a tool_slow activity event if the
|
||||
// tool call exceeds the adaptive threshold. Returns a stop function that MUST
|
||||
// be called after tool execution to cancel the timer.
|
||||
// If enabled is false, returns a no-op stop function (no timer started).
|
||||
func (m ToolTimingMap) StartSlowTimer(toolName, agentID, runID string, enabled bool, emitRun func(AgentEvent)) func() {
|
||||
if !enabled {
|
||||
return func() {}
|
||||
}
|
||||
threshold := m.SlowThreshold(toolName)
|
||||
timer := time.AfterFunc(threshold, func() {
|
||||
slog.Warn("tool.slow", "agent", agentID, "tool", toolName, "threshold_ms", threshold.Milliseconds())
|
||||
emitRun(AgentEvent{
|
||||
Type: protocol.AgentEventActivity,
|
||||
AgentID: agentID,
|
||||
RunID: runID,
|
||||
Payload: map[string]any{
|
||||
"phase": "tool_slow",
|
||||
"tool": toolName,
|
||||
"threshold_ms": threshold.Milliseconds(),
|
||||
},
|
||||
})
|
||||
})
|
||||
return func() { timer.Stop() }
|
||||
}
|
||||
|
||||
@@ -142,6 +142,17 @@ func (s *PGSessionStore) SetLabel(key, label string) {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PGSessionStore) GetSessionMetadata(key string) map[string]string {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
if data, ok := s.cache[key]; ok && data.Metadata != nil {
|
||||
out := make(map[string]string, len(data.Metadata))
|
||||
maps.Copy(out, data.Metadata)
|
||||
return out
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *PGSessionStore) SetSessionMetadata(key string, metadata map[string]string) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
@@ -97,6 +97,7 @@ type SessionStore interface {
|
||||
GetCompactionCount(key string) int
|
||||
GetMemoryFlushCompactionCount(key string) int
|
||||
SetMemoryFlushDone(key string)
|
||||
GetSessionMetadata(key string) map[string]string
|
||||
SetSessionMetadata(key string, metadata map[string]string)
|
||||
SetSpawnInfo(key, spawnedBy string, depth int)
|
||||
SetContextWindow(key string, cw int)
|
||||
|
||||
@@ -8,6 +8,7 @@ type TeamNotifyConfig struct {
|
||||
Progress bool `json:"progress"` // member updates progress
|
||||
Failed bool `json:"failed"` // task failed
|
||||
Completed bool `json:"completed"` // task completed
|
||||
SlowTool bool `json:"slow_tool"` // system alert when tool call exceeds adaptive threshold (always direct, never through leader)
|
||||
Mode string `json:"mode"` // "direct" (outbound) or "leader" (through leader agent)
|
||||
}
|
||||
|
||||
@@ -18,6 +19,7 @@ func DefaultTeamNotifyConfig() TeamNotifyConfig {
|
||||
Progress: true,
|
||||
Failed: true,
|
||||
Completed: true,
|
||||
SlowTool: true,
|
||||
Mode: "direct",
|
||||
}
|
||||
}
|
||||
@@ -35,6 +37,7 @@ func ParseTeamNotifyConfig(settings json.RawMessage) TeamNotifyConfig {
|
||||
Progress *bool `json:"progress"`
|
||||
Failed *bool `json:"failed"`
|
||||
Completed *bool `json:"completed"`
|
||||
SlowTool *bool `json:"slow_tool"`
|
||||
Mode string `json:"mode"`
|
||||
} `json:"notifications"`
|
||||
}
|
||||
@@ -54,6 +57,9 @@ func ParseTeamNotifyConfig(settings json.RawMessage) TeamNotifyConfig {
|
||||
if n.Completed != nil {
|
||||
cfg.Completed = *n.Completed
|
||||
}
|
||||
if n.SlowTool != nil {
|
||||
cfg.SlowTool = *n.SlowTool
|
||||
}
|
||||
if n.Mode == "leader" {
|
||||
cfg.Mode = "leader"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user