feat(agent): adaptive tool timing with slow tool notification

Track per-tool execution time statistics in session metadata. When a tool
call exceeds its adaptive threshold (2x historical max, min 120s default),
send a direct outbound notification to the user.

- ToolTimingMap: parse/serialize/record/threshold from session metadata
- StartSlowTimer: fires once per tool call, auto-cancels on completion
- Team config: slow_tool toggle (default on, always direct, never leader)
- UI: toggle in team settings with i18n (en/vi/zh)
- Store: add GetSessionMetadata to session store interface
This commit is contained in:
viettranx
2026-03-19 11:17:32 +07:00
parent 0df619023c
commit 4e9f155a4c
12 changed files with 236 additions and 1 deletions
+26
View File
@@ -164,8 +164,12 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
// Auto-resolve team workspace for agents not dispatched via team task.
// Lead agents default to team workspace (primary job is team coordination).
// Non-lead members keep own workspace; team workspace is accessible via absolute path.
// resolvedTeamSettings caches team settings from workspace resolution
// to avoid re-querying when checking slow_tool notification config.
var resolvedTeamSettings json.RawMessage
if req.TeamWorkspace == "" && l.teamStore != nil && l.agentUUID != uuid.Nil {
if team, _ := l.teamStore.GetTeamForAgent(ctx, l.agentUUID); team != nil {
resolvedTeamSettings = team.Settings
// Shared workspace: scope by teamID only. Isolated (default): scope by chatID too.
wsChat := req.ChatID
if wsChat == "" {
@@ -243,6 +247,12 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
l.sessions.SetContextWindow(req.SessionKey, l.contextWindow)
}
// 0b. Load adaptive tool timing from session metadata.
toolTiming := ParseToolTiming(l.sessions.GetSessionMetadata(req.SessionKey))
// Resolve slow_tool notification config from already-loaded team settings (no extra DB query).
slowToolEnabled := tools.ParseTeamNotifyConfig(resolvedTeamSettings).SlowTool
// 1. Build messages from session history
history := l.sessions.GetHistory(req.SessionKey)
summary := l.sessions.GetSummary(req.SessionKey)
@@ -869,6 +879,8 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
toolSpanStart := time.Now().UTC()
toolSpanID := l.emitToolSpanStart(ctx, toolSpanStart, tc.Name, tc.ID, string(argsJSON))
stopSlowTimer := toolTiming.StartSlowTimer(tc.Name, l.id, req.RunID, slowToolEnabled, emitRun)
var result *tools.Result
if allowedTools != nil && !allowedTools[tc.Name] {
// Attempt lazy activation: deferred MCP tools can be activated on first call
@@ -890,9 +902,13 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
if result == nil {
result = l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)
}
stopSlowTimer()
l.emitToolSpanEnd(ctx, toolSpanID, toolSpanStart, result)
// Record tool execution time for adaptive thresholds.
toolTiming.Record(tc.Name, time.Since(toolSpanStart).Milliseconds())
// Record result for loop detection.
loopDetector.recordResult(argsHash, result.ForLLM)
@@ -1009,6 +1025,8 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
// Emit running span inside goroutine — goroutine-safe (channel send only).
// End is also emitted here to prevent orphans on ctx cancellation.
spanID := l.emitToolSpanStart(ctx, spanStart, tc.Name, tc.ID, string(argsJSON))
stopSlowTimer := toolTiming.StartSlowTimer(tc.Name, l.id, req.RunID, slowToolEnabled, emitRun)
var result *tools.Result
if allowedTools != nil && !allowedTools[tc.Name] {
// Attempt lazy activation for deferred MCP tools.
@@ -1030,6 +1048,7 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
if result == nil {
result = l.tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, req.Channel, req.ChatID, req.PeerKind, req.SessionKey, nil)
}
stopSlowTimer()
l.emitToolSpanEnd(ctx, spanID, spanStart, result)
resultCh <- indexedResult{idx: idx, tc: tc, result: result, argsJSON: string(argsJSON), spanStart: spanStart}
}(i, tc)
@@ -1053,6 +1072,8 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
// Note: tool span start/end already emitted inside goroutines above.
var loopStuck bool
for _, r := range collected {
// Record tool execution time for adaptive thresholds.
toolTiming.Record(r.tc.Name, time.Since(r.spanStart).Milliseconds())
// Record for loop detection.
argsHash := loopDetector.record(r.tc.Name, r.tc.Arguments)
@@ -1222,6 +1243,11 @@ func (l *Loop) runLoop(ctx context.Context, req RunRequest) (*RunResult, error)
l.sessions.AddMessage(req.SessionKey, msg)
}
// Persist adaptive tool timing to session metadata.
if serialized := toolTiming.Serialize(); serialized != "" {
l.sessions.SetSessionMetadata(req.SessionKey, map[string]string{"tool_timing": serialized})
}
// Write session metadata (matching TS session entry updates)
l.sessions.UpdateMetadata(req.SessionKey, l.model, l.provider.Name(), req.Channel)
l.sessions.AccumulateTokens(req.SessionKey, int64(totalUsage.PromptTokens), int64(totalUsage.CompletionTokens))
+123
View File
@@ -0,0 +1,123 @@
package agent
import (
"encoding/json"
"log/slog"
"time"
"github.com/nextlevelbuilder/goclaw/pkg/protocol"
)
// defaultSlowToolThreshold is used when no historical data is available for a tool.
const defaultSlowToolThreshold = 120 * time.Second
// toolTimingMultiplier determines how much slower than the historical max
// a tool call must be before it's considered abnormally slow.
const toolTimingMultiplier = 2.0
// minTimingSamples is the minimum number of samples needed before using
// adaptive thresholds instead of the default.
const minTimingSamples = 3
// ToolTimingStat tracks execution time statistics for a single tool.
type ToolTimingStat struct {
Min int64 `json:"min"` // minimum duration in ms
Max int64 `json:"max"` // maximum duration in ms
Sum int64 `json:"sum"` // total duration in ms (for avg calculation)
Count int `json:"n"` // number of samples
}
// ToolTimingMap maps tool names to their timing statistics.
// Concurrency contract: SlowThreshold (read) may be called from goroutines,
// but Record (write) must only be called sequentially after parallel tools complete.
type ToolTimingMap map[string]*ToolTimingStat
// ParseToolTiming reads tool timing data from session metadata.
// Returns an empty map if the key is missing or malformed.
func ParseToolTiming(metadata map[string]string) ToolTimingMap {
raw, ok := metadata["tool_timing"]
if !ok || raw == "" {
return make(ToolTimingMap)
}
var m ToolTimingMap
if err := json.Unmarshal([]byte(raw), &m); err != nil {
return make(ToolTimingMap)
}
return m
}
// Serialize returns the JSON string for storage in session metadata.
func (m ToolTimingMap) Serialize() string {
if len(m) == 0 {
return ""
}
data, err := json.Marshal(m)
if err != nil {
return ""
}
return string(data)
}
// Record adds a new timing sample for the given tool.
func (m ToolTimingMap) Record(toolName string, durationMs int64) {
stat, ok := m[toolName]
if !ok {
m[toolName] = &ToolTimingStat{
Min: durationMs,
Max: durationMs,
Sum: durationMs,
Count: 1,
}
return
}
if durationMs < stat.Min {
stat.Min = durationMs
}
if durationMs > stat.Max {
stat.Max = durationMs
}
stat.Sum += durationMs
stat.Count++
}
// SlowThreshold returns the duration after which a tool call is considered
// abnormally slow. Uses adaptive threshold if enough samples exist,
// otherwise falls back to the default.
func (m ToolTimingMap) SlowThreshold(toolName string) time.Duration {
stat, ok := m[toolName]
if !ok || stat.Count < minTimingSamples {
return defaultSlowToolThreshold
}
threshold := time.Duration(float64(stat.Max)*toolTimingMultiplier) * time.Millisecond
// Never go below the default — short tools shouldn't trigger on tiny spikes.
if threshold < defaultSlowToolThreshold {
return defaultSlowToolThreshold
}
return threshold
}
// StartSlowTimer starts a timer that emits a tool_slow activity event if the
// tool call exceeds the adaptive threshold. Returns a stop function that MUST
// be called after tool execution to cancel the timer.
// If enabled is false, returns a no-op stop function (no timer started).
func (m ToolTimingMap) StartSlowTimer(toolName, agentID, runID string, enabled bool, emitRun func(AgentEvent)) func() {
if !enabled {
return func() {}
}
threshold := m.SlowThreshold(toolName)
timer := time.AfterFunc(threshold, func() {
slog.Warn("tool.slow", "agent", agentID, "tool", toolName, "threshold_ms", threshold.Milliseconds())
emitRun(AgentEvent{
Type: protocol.AgentEventActivity,
AgentID: agentID,
RunID: runID,
Payload: map[string]any{
"phase": "tool_slow",
"tool": toolName,
"threshold_ms": threshold.Milliseconds(),
},
})
})
return func() { timer.Stop() }
}
+11
View File
@@ -142,6 +142,17 @@ func (s *PGSessionStore) SetLabel(key, label string) {
}
}
func (s *PGSessionStore) GetSessionMetadata(key string) map[string]string {
s.mu.RLock()
defer s.mu.RUnlock()
if data, ok := s.cache[key]; ok && data.Metadata != nil {
out := make(map[string]string, len(data.Metadata))
maps.Copy(out, data.Metadata)
return out
}
return nil
}
func (s *PGSessionStore) SetSessionMetadata(key string, metadata map[string]string) {
s.mu.Lock()
defer s.mu.Unlock()
+1
View File
@@ -97,6 +97,7 @@ type SessionStore interface {
GetCompactionCount(key string) int
GetMemoryFlushCompactionCount(key string) int
SetMemoryFlushDone(key string)
GetSessionMetadata(key string) map[string]string
SetSessionMetadata(key string, metadata map[string]string)
SetSpawnInfo(key, spawnedBy string, depth int)
SetContextWindow(key string, cw int)
+6
View File
@@ -8,6 +8,7 @@ type TeamNotifyConfig struct {
Progress bool `json:"progress"` // member updates progress
Failed bool `json:"failed"` // task failed
Completed bool `json:"completed"` // task completed
SlowTool bool `json:"slow_tool"` // system alert when tool call exceeds adaptive threshold (always direct, never through leader)
Mode string `json:"mode"` // "direct" (outbound) or "leader" (through leader agent)
}
@@ -18,6 +19,7 @@ func DefaultTeamNotifyConfig() TeamNotifyConfig {
Progress: true,
Failed: true,
Completed: true,
SlowTool: true,
Mode: "direct",
}
}
@@ -35,6 +37,7 @@ func ParseTeamNotifyConfig(settings json.RawMessage) TeamNotifyConfig {
Progress *bool `json:"progress"`
Failed *bool `json:"failed"`
Completed *bool `json:"completed"`
SlowTool *bool `json:"slow_tool"`
Mode string `json:"mode"`
} `json:"notifications"`
}
@@ -54,6 +57,9 @@ func ParseTeamNotifyConfig(settings json.RawMessage) TeamNotifyConfig {
if n.Completed != nil {
cfg.Completed = *n.Completed
}
if n.SlowTool != nil {
cfg.SlowTool = *n.SlowTool
}
if n.Mode == "leader" {
cfg.Mode = "leader"
}