Files
goclaw/internal/tracing/snapshot_worker.go
T
Viet Tran 73389d2715 fix(ui): align usage data contracts, add timezone setting, and fix empty usage page (#146)
- Fix 6 data contract mismatches between Go backend JSON tags and React
  frontend TypeScript interfaces (field renames, response envelope changes)
- Add timezone selector to topbar with 12 common timezone options
- Replace date-fns formatting with native Intl.DateTimeFormat for
  timezone-aware chart labels (reduces bundle ~20KB)
- Add missing SnapshotTimeSeries fields (memory_docs, memory_chunks,
  kg_entities, kg_relations) that caused empty usage page
- Add error banner to usage page for API error visibility
- Sanitize backend error messages in usage HTTP handlers
- Add batch chunking (max 3000 rows) for snapshot upserts
- Remove userId display from topbar
- Add usage analytics i18n strings for en/vi/zh
2026-03-11 14:22:03 +07:00

488 lines
13 KiB
Go

package tracing
import (
"context"
"database/sql"
"fmt"
"log/slog"
"sync"
"time"
"github.com/google/uuid"
"github.com/nextlevelbuilder/goclaw/internal/store"
)
// SnapshotWorker runs a background loop that rolls trace/span data up into
// usage_snapshots rows. Build one with NewSnapshotWorker; the zero value has
// no stop channel and is not usable.
type SnapshotWorker struct {
	db        *sql.DB             // queried directly for the hourly aggregates
	snapshots store.SnapshotStore // latest-bucket lookup and snapshot upserts
	stopCh    chan struct{}       // closed by Stop to end the loop
	wg        sync.WaitGroup      // tracks the loop goroutine started by Start
}
// NewSnapshotWorker builds a worker around db and the given snapshot store.
// Nothing runs until Start is called.
func NewSnapshotWorker(db *sql.DB, snapshots store.SnapshotStore) *SnapshotWorker {
	worker := new(SnapshotWorker)
	worker.db = db
	worker.snapshots = snapshots
	worker.stopCh = make(chan struct{})
	return worker
}
// Start spawns the aggregation loop on its own goroutine and returns
// immediately. The goroutine is registered with the wait group first so that
// Stop can wait for it.
func (w *SnapshotWorker) Start() {
	w.wg.Add(1)
	go func() {
		w.loop()
	}()
	slog.Info("snapshot worker started")
}
// Stop closes the stop channel, blocks until the loop goroutine has exited,
// and then logs that the worker is down.
//
// Stop must be called at most once: a second call would close the already
// closed channel and panic.
func (w *SnapshotWorker) Stop() {
	// Closing (rather than sending) wakes every receiver of stopCh.
	close(w.stopCh)

	w.wg.Wait()

	slog.Info("snapshot worker stopped")
}
// loop is the body of the worker goroutine. It runs one catch-up pass
// immediately, then another at five minutes past every UTC hour, and returns
// once stopCh is closed.
func (w *SnapshotWorker) loop() {
	defer w.wg.Done()

	// On startup, catch up any missed hours.
	w.catchUp()

	// Schedule the first tick for HH:05:00 UTC of the current hour, rolling
	// over to the next hour when that moment has already passed. Starting from
	// the current hour's :05 mark is what makes the roll-over check reachable;
	// starting from the next hour's mark would always lie in the future.
	now := time.Now().UTC()
	nextTick := now.Truncate(time.Hour).Add(5 * time.Minute)
	if !now.Before(nextTick) {
		nextTick = nextTick.Add(time.Hour)
	}

	timer := time.NewTimer(time.Until(nextTick))
	defer timer.Stop()

	for {
		select {
		case <-w.stopCh:
			return
		case <-timer.C:
			w.catchUp()
			// Advance from the scheduled tick, not from the current time, so
			// the schedule stays pinned to HH:05. If the catch-up pass overran
			// the next tick, the non-positive duration makes the timer fire
			// again right away.
			nextTick = nextTick.Add(time.Hour)
			timer.Reset(time.Until(nextTick))
		}
	}
}
// catchUp computes snapshots for every hour between the latest stored bucket
// and the previous complete UTC hour, oldest first.
//
// Failures are logged rather than returned: the pass stops at the first hour
// that fails so the same hour is retried on the next tick. The stop channel is
// checked before each hour so that Stop does not have to wait for a long
// backlog to drain; an hour that is already being aggregated is allowed to
// finish.
func (w *SnapshotWorker) catchUp() {
	ctx := context.Background()

	// The current hour is still accumulating traces, so the newest bucket
	// worth computing is the hour before it.
	targetHour := time.Now().UTC().Truncate(time.Hour).Add(-time.Hour)

	latest, err := w.snapshots.GetLatestBucket(ctx)
	if err != nil {
		slog.Warn("snapshot: get latest bucket", "error", err)
		return
	}

	// No snapshots yet: only compute the previous hour (Backfill handles
	// history). Otherwise resume with the hour after the latest bucket.
	startHour := targetHour
	if latest != nil {
		startHour = latest.Add(time.Hour)
	}

	for h := startHour; !h.After(targetHour); h = h.Add(time.Hour) {
		// Bail out between hours once Stop has been requested.
		select {
		case <-w.stopCh:
			return
		default:
		}

		began := time.Now()
		if err := w.aggregateHour(ctx, h); err != nil {
			slog.Warn("snapshot: aggregate hour failed", "hour", h.Format(time.RFC3339), "error", err)
			return // stop catch-up on error, will retry next tick
		}
		slog.Info("snapshot computed", "hour", h.Format(time.RFC3339), "duration_ms", time.Since(began).Milliseconds())
	}
}
// Backfill populates usage_snapshots from historical trace/span data.
//
// It starts at the hour of the earliest root trace, or at the hour after the
// latest stored bucket when snapshots already exist, and stops before the
// current (incomplete) UTC hour. An hour that fails to aggregate is logged and
// skipped so one bad hour does not abort the whole run.
//
// It returns the number of hours processed successfully. A non-nil error is
// returned when the latest bucket or the earliest trace cannot be read, or
// when ctx is cancelled part-way through; in the latter case the count covers
// the hours completed before cancellation.
func (w *SnapshotWorker) Backfill(ctx context.Context) (int, error) {
	latest, err := w.snapshots.GetLatestBucket(ctx)
	if err != nil {
		return 0, fmt.Errorf("get latest bucket: %w", err)
	}

	// Find the earliest root trace. MIN over an empty table yields NULL, which
	// surfaces as an invalid NullTime rather than an error.
	var earliest sql.NullTime
	err = w.db.QueryRowContext(ctx,
		`SELECT MIN(start_time) FROM traces WHERE parent_trace_id IS NULL`,
	).Scan(&earliest)
	if err != nil {
		// A failed query is not the same as "nothing to backfill"; report it.
		return 0, fmt.Errorf("find earliest trace: %w", err)
	}
	if !earliest.Valid {
		return 0, nil // no traces to backfill
	}

	startHour := earliest.Time.UTC().Truncate(time.Hour)
	if latest != nil {
		startHour = latest.Add(time.Hour)
	}
	endHour := time.Now().UTC().Truncate(time.Hour)

	count := 0
	for h := startHour; h.Before(endHour); h = h.Add(time.Hour) {
		// Once ctx is done every remaining hour would fail the same way, so
		// stop instead of logging a warning per hour.
		if err := ctx.Err(); err != nil {
			return count, fmt.Errorf("backfill interrupted: %w", err)
		}
		if err := w.aggregateHour(ctx, h); err != nil {
			slog.Warn("backfill: aggregate hour failed", "hour", h.Format(time.RFC3339), "error", err)
			continue
		}
		count++
	}
	return count, nil
}
// aggregateHour builds and stores the snapshot rows for the one-hour bucket
// starting at bucketStart.
//
// Trace and span aggregates are required: an error from either aborts the
// hour. Memory and KG counts are best-effort: on failure a warning is logged
// and the hour is stored without them. When the merge produces no rows nothing
// is written and nil is returned.
//
// NOTE(review): the memory/KG queries take no time range, so they reflect the
// tables at call time rather than at bucketStart — confirm this is intended
// for catch-up and backfill of older hours.
func (w *SnapshotWorker) aggregateHour(ctx context.Context, bucketStart time.Time) error {
	bucketEnd := bucketStart.Add(time.Hour)

	// Trace-level metrics by (agent_id, channel).
	traceRows, traceErr := queryTraceAggregates(ctx, w.db, bucketStart, bucketEnd)
	if traceErr != nil {
		return fmt.Errorf("trace aggregates: %w", traceErr)
	}

	// Span-level metrics by (agent_id, channel, provider, model).
	spanRows, spanErr := querySpanAggregates(ctx, w.db, bucketStart, bucketEnd)
	if spanErr != nil {
		return fmt.Errorf("span aggregates: %w", spanErr)
	}

	// Point-in-time memory counts; dropped on failure.
	memoryCounts, memErr := queryMemoryCounts(ctx, w.db)
	if memErr != nil {
		slog.Warn("snapshot: memory counts failed, continuing without", "error", memErr)
		memoryCounts = nil
	}

	// Point-in-time KG counts; dropped on failure.
	kgCounts, kgErr := queryKGCounts(ctx, w.db)
	if kgErr != nil {
		slog.Warn("snapshot: kg counts failed, continuing without", "error", kgErr)
		kgCounts = nil
	}

	merged := mergeTraceAndSpanRows(bucketStart, traceRows, spanRows, memoryCounts, kgCounts)
	if len(merged) > 0 {
		return w.snapshots.UpsertSnapshots(ctx, merged)
	}
	return nil
}
// traceAggregate is one row of trace-level metrics for a single
// (agent_id, channel) group, as produced by queryTraceAggregates.
type traceAggregate struct {
	AgentID *uuid.UUID // nil when the trace has no agent_id
	Channel string     // empty string when the channel is NULL

	RequestCount, ErrorCount, UniqueUsers int // traces, traces with status 'error', distinct user_id

	InputTokens, OutputTokens int64 // summed total_input_tokens / total_output_tokens
	TotalCost                 float64

	ToolCallCount, AvgDurationMS int // summed tool_call_count, mean duration_ms cast to integer
}
// queryTraceAggregates returns one traceAggregate per (agent_id, channel) for
// root traces (parent_trace_id IS NULL) whose start_time lies in [from, to).
//
// The query groups by COALESCE(channel, '') — the same expression it selects —
// so traces with a NULL channel and traces with an empty-string channel fall
// into a single group. Grouping by the bare column name would resolve to the
// raw input column and emit two rows that both report channel ''.
//
// A nil slice is returned when no traces match. Any query, scan or iteration
// error is returned as-is.
func queryTraceAggregates(ctx context.Context, db *sql.DB, from, to time.Time) ([]traceAggregate, error) {
	rows, err := db.QueryContext(ctx, `
		SELECT
			agent_id,
			COALESCE(channel, '') as channel,
			COUNT(*) as request_count,
			COUNT(*) FILTER (WHERE status = 'error') as error_count,
			COUNT(DISTINCT user_id) as unique_users,
			COALESCE(SUM(total_input_tokens), 0) as input_tokens,
			COALESCE(SUM(total_output_tokens), 0) as output_tokens,
			COALESCE(SUM(total_cost), 0) as total_cost,
			COALESCE(SUM(tool_call_count), 0) as tool_call_count,
			COALESCE(AVG(duration_ms), 0)::INTEGER as avg_duration_ms
		FROM traces
		WHERE start_time >= $1 AND start_time < $2
			AND parent_trace_id IS NULL
		GROUP BY agent_id, COALESCE(channel, '')`, from, to)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var result []traceAggregate
	for rows.Next() {
		var ta traceAggregate
		// Scan order must match the SELECT list above.
		if err := rows.Scan(
			&ta.AgentID, &ta.Channel,
			&ta.RequestCount, &ta.ErrorCount, &ta.UniqueUsers,
			&ta.InputTokens, &ta.OutputTokens, &ta.TotalCost,
			&ta.ToolCallCount, &ta.AvgDurationMS,
		); err != nil {
			return nil, err
		}
		result = append(result, ta)
	}
	return result, rows.Err()
}
// spanAggregate is one row of LLM-call span metrics for a single
// (agent_id, channel, provider, model) group, as produced by
// querySpanAggregates.
type spanAggregate struct {
	AgentID *uuid.UUID // nil when the owning trace has no agent_id

	// Grouping dimensions; each is the empty string when the column is NULL.
	Channel, Provider, Model string

	LLMCallCount int // number of 'llm_call' spans in the group

	InputTokens, OutputTokens int64
	TotalCost                 float64

	// Summed from the span metadata keys cache_read_tokens,
	// cache_creation_tokens and thinking_tokens respectively.
	CacheReadTokens, CacheCreateTokens, ThinkingTokens int64
}
// querySpanAggregates returns one spanAggregate per
// (agent_id, channel, provider, model) for 'llm_call' spans belonging to root
// traces whose start_time lies in [from, to). The window is applied to the
// trace's start_time, not to the span.
//
// The query groups by the same COALESCE(..., '') expressions it selects, so
// NULL and empty-string values of channel, provider or model fall into one
// group instead of producing several rows with identical output keys.
//
// A nil slice is returned when nothing matches. Any query, scan or iteration
// error is returned as-is.
func querySpanAggregates(ctx context.Context, db *sql.DB, from, to time.Time) ([]spanAggregate, error) {
	rows, err := db.QueryContext(ctx, `
		SELECT
			t.agent_id,
			COALESCE(t.channel, '') as channel,
			COALESCE(s.provider, '') as provider,
			COALESCE(s.model, '') as model,
			COUNT(*) as llm_call_count,
			COALESCE(SUM(s.input_tokens), 0) as span_input_tokens,
			COALESCE(SUM(s.output_tokens), 0) as span_output_tokens,
			COALESCE(SUM(s.total_cost), 0) as span_cost,
			COALESCE(SUM((s.metadata->>'cache_read_tokens')::BIGINT), 0) as cache_read_tokens,
			COALESCE(SUM((s.metadata->>'cache_creation_tokens')::BIGINT), 0) as cache_create_tokens,
			COALESCE(SUM((s.metadata->>'thinking_tokens')::BIGINT), 0) as thinking_tokens
		FROM traces t
		JOIN spans s ON s.trace_id = t.id AND s.span_type = 'llm_call'
		WHERE t.start_time >= $1 AND t.start_time < $2
			AND t.parent_trace_id IS NULL
		GROUP BY
			t.agent_id,
			COALESCE(t.channel, ''),
			COALESCE(s.provider, ''),
			COALESCE(s.model, '')`, from, to)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var result []spanAggregate
	for rows.Next() {
		var sa spanAggregate
		// Scan order must match the SELECT list above.
		if err := rows.Scan(
			&sa.AgentID, &sa.Channel,
			&sa.Provider, &sa.Model,
			&sa.LLMCallCount,
			&sa.InputTokens, &sa.OutputTokens, &sa.TotalCost,
			&sa.CacheReadTokens, &sa.CacheCreateTokens, &sa.ThinkingTokens,
		); err != nil {
			return nil, err
		}
		result = append(result, sa)
	}
	return result, rows.Err()
}
// agentMemoryCounts is the memory footprint of one agent at query time.
type agentMemoryCounts struct {
	AgentID uuid.UUID

	// Row counts from memory_documents and memory_chunks respectively.
	Docs, Chunks int
}
// agentKGCounts is the knowledge-graph footprint of one agent at query time.
type agentKGCounts struct {
	AgentID uuid.UUID

	// Row counts from kg_entities and kg_relations respectively.
	Entities, Relations int
}
// queryMemoryCounts counts memory_documents and memory_chunks rows per agent
// and returns them keyed by agent ID. An agent present in only one of the two
// tables gets an entry with the other count left at zero.
//
// A query or scan failure returns a nil map and the error. An iteration error
// on the chunk query is returned together with the map built so far.
func queryMemoryCounts(ctx context.Context, db *sql.DB) (map[uuid.UUID]agentMemoryCounts, error) {
	counts := make(map[uuid.UUID]agentMemoryCounts)

	// Pass 1: documents. The map is empty here, so each entry is created fresh.
	docRows, err := db.QueryContext(ctx, `SELECT agent_id, COUNT(*) FROM memory_documents GROUP BY agent_id`)
	if err != nil {
		return nil, err
	}
	defer docRows.Close()
	for docRows.Next() {
		var (
			id uuid.UUID
			n  int
		)
		if scanErr := docRows.Scan(&id, &n); scanErr != nil {
			return nil, scanErr
		}
		counts[id] = agentMemoryCounts{AgentID: id, Docs: n}
	}
	if iterErr := docRows.Err(); iterErr != nil {
		return nil, iterErr
	}

	// Pass 2: chunks, merged into whatever pass 1 recorded for the agent.
	chunkRows, err := db.QueryContext(ctx, `SELECT agent_id, COUNT(*) FROM memory_chunks GROUP BY agent_id`)
	if err != nil {
		return nil, err
	}
	defer chunkRows.Close()
	for chunkRows.Next() {
		var (
			id uuid.UUID
			n  int
		)
		if scanErr := chunkRows.Scan(&id, &n); scanErr != nil {
			return nil, scanErr
		}
		entry := counts[id]
		entry.AgentID, entry.Chunks = id, n
		counts[id] = entry
	}
	return counts, chunkRows.Err()
}
// queryKGCounts counts kg_entities and kg_relations rows per agent and returns
// them keyed by agent ID. An agent present in only one of the two tables gets
// an entry with the other count left at zero.
//
// A query or scan failure returns a nil map and the error. An iteration error
// on the relations query is returned together with the map built so far.
func queryKGCounts(ctx context.Context, db *sql.DB) (map[uuid.UUID]agentKGCounts, error) {
	counts := make(map[uuid.UUID]agentKGCounts)

	// Pass 1: entities. The map is empty here, so each entry is created fresh.
	entityRows, err := db.QueryContext(ctx, `SELECT agent_id, COUNT(*) FROM kg_entities GROUP BY agent_id`)
	if err != nil {
		return nil, err
	}
	defer entityRows.Close()
	for entityRows.Next() {
		var (
			id uuid.UUID
			n  int
		)
		if scanErr := entityRows.Scan(&id, &n); scanErr != nil {
			return nil, scanErr
		}
		counts[id] = agentKGCounts{AgentID: id, Entities: n}
	}
	if iterErr := entityRows.Err(); iterErr != nil {
		return nil, iterErr
	}

	// Pass 2: relations, merged into whatever pass 1 recorded for the agent.
	relationRows, err := db.QueryContext(ctx, `SELECT agent_id, COUNT(*) FROM kg_relations GROUP BY agent_id`)
	if err != nil {
		return nil, err
	}
	defer relationRows.Close()
	for relationRows.Next() {
		var (
			id uuid.UUID
			n  int
		)
		if scanErr := relationRows.Scan(&id, &n); scanErr != nil {
			return nil, scanErr
		}
		entry := counts[id]
		entry.AgentID, entry.Relations = id, n
		counts[id] = entry
	}
	return counts, relationRows.Err()
}
// agentChannelKey identifies one (agent, channel) pair and is comparable, so
// it can serve as a map key.
type agentChannelKey struct {
	AgentID uuid.UUID // the zero UUID stands in for a nil agent_id
	Channel string
}
// mergeTraceAndSpanRows turns the per-hour aggregates into the UsageSnapshot
// rows for the bucket starting at bucketStart. Three kinds of row are emitted:
//
//  1. A "totals" row (provider and model empty) per trace aggregate, carrying
//     request, error, user, tool-call and duration figures plus the agent's
//     memory/KG counts when available. Token and cost figures from the trace
//     aggregate are not copied onto this row.
//  2. A detail row per span aggregate, carrying the LLM call, token and cost
//     figures for one provider/model.
//  3. A memory/KG-only totals row (empty channel) for each agent that has
//     memory or KG counts but no trace aggregate at all in this hour.
//
// Step 3 decides "has traces" per agent, across all channels. Checking only
// the empty channel would hand an agent with traces on a named channel a
// second row repeating the counts already attached in step 1.
//
// memoryCounts and kgCounts may be nil. The result is nil when there is
// nothing to store. Rows from step 3 come out in map-iteration order, which
// is not deterministic.
func mergeTraceAndSpanRows(
	bucketStart time.Time,
	traceRows []traceAggregate,
	spanRows []spanAggregate,
	memoryCounts map[uuid.UUID]agentMemoryCounts,
	kgCounts map[uuid.UUID]agentKGCounts,
) []store.UsageSnapshot {
	var snapshots []store.UsageSnapshot

	// Agents that own at least one totals row this hour. A nil agent_id is
	// recorded under the zero UUID.
	tracedAgents := make(map[uuid.UUID]struct{}, len(traceRows))

	// 1. Create "totals" rows from trace data (provider='', model='').
	for _, tr := range traceRows {
		var agentID uuid.UUID
		if tr.AgentID != nil {
			agentID = *tr.AgentID
		}
		tracedAgents[agentID] = struct{}{}

		snap := store.UsageSnapshot{
			BucketHour:    bucketStart,
			AgentID:       tr.AgentID,
			Provider:      "",
			Model:         "",
			Channel:       tr.Channel,
			RequestCount:  tr.RequestCount,
			ErrorCount:    tr.ErrorCount,
			UniqueUsers:   tr.UniqueUsers,
			ToolCallCount: tr.ToolCallCount,
			AvgDurationMS: tr.AvgDurationMS,
		}
		// Attach memory/KG counts to the totals row; rows without an agent
		// never receive them.
		if tr.AgentID != nil {
			if mc, ok := memoryCounts[agentID]; ok {
				snap.MemoryDocs = mc.Docs
				snap.MemoryChunks = mc.Chunks
			}
			if kc, ok := kgCounts[agentID]; ok {
				snap.KGEntities = kc.Entities
				snap.KGRelations = kc.Relations
			}
		}
		snapshots = append(snapshots, snap)
	}

	// 2. Create detail rows from span data (with actual provider/model).
	for _, sp := range spanRows {
		snapshots = append(snapshots, store.UsageSnapshot{
			BucketHour:        bucketStart,
			AgentID:           sp.AgentID,
			Provider:          sp.Provider,
			Model:             sp.Model,
			Channel:           sp.Channel,
			LLMCallCount:      sp.LLMCallCount,
			InputTokens:       sp.InputTokens,
			OutputTokens:      sp.OutputTokens,
			TotalCost:         sp.TotalCost,
			CacheReadTokens:   sp.CacheReadTokens,
			CacheCreateTokens: sp.CacheCreateTokens,
			ThinkingTokens:    sp.ThinkingTokens,
		})
	}

	// 3. Create memory/KG-only totals rows for agents without traces this
	// hour. Ranging over a nil map is a no-op, so nil inputs need no guard.
	untraced := make(map[uuid.UUID]struct{}, len(memoryCounts)+len(kgCounts))
	for id := range memoryCounts {
		if _, traced := tracedAgents[id]; !traced {
			untraced[id] = struct{}{}
		}
	}
	for id := range kgCounts {
		if _, traced := tracedAgents[id]; !traced {
			untraced[id] = struct{}{}
		}
	}
	for agentID := range untraced {
		aid := agentID // fresh copy so each snapshot points at its own UUID
		snap := store.UsageSnapshot{
			BucketHour: bucketStart,
			AgentID:    &aid,
			Provider:   "",
			Model:      "",
			Channel:    "",
		}
		if mc, ok := memoryCounts[agentID]; ok {
			snap.MemoryDocs = mc.Docs
			snap.MemoryChunks = mc.Chunks
		}
		if kc, ok := kgCounts[agentID]; ok {
			snap.KGEntities = kc.Entities
			snap.KGRelations = kc.Relations
		}
		snapshots = append(snapshots, snap)
	}

	return snapshots
}