Files
goclaw/internal/cron/service_execution.go
T
viettranx 0655849d3d fix(cron): route jobs through scheduler for concurrency control and parallel execution
- Simplify cron session key to `agent:{agentId}:cron:{jobID}` (remove redundant `:run:{runID}`)
- Route cron jobs through scheduler's cron lane instead of calling loop.Run() directly
- Scheduler enforces per-session maxConcurrent=1, preventing same job from running concurrently
- Parallelize due job execution with goroutines + WaitGroup (PG and file store)
- Move scheduler creation before cron setup in gateway.go initialization order

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 16:53:15 +07:00

360 lines
7.8 KiB
Go

package cron
import (
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"sync"
"time"
"github.com/adhocore/gronx"
)
// RunJob manually triggers a job execution.
// mode: "force" = run regardless of schedule, "due" = only run if due.
// Returns (ran bool, reason string, error).
func (cs *Service) RunJob(jobID string, force bool) (bool, string, error) {
cs.mu.Lock()
var job *Job
for i := range cs.store.Jobs {
if cs.store.Jobs[i].ID == jobID {
j := cs.store.Jobs[i] // copy
job = &j
break
}
}
handler := cs.onJob
cs.mu.Unlock()
if job == nil {
return false, "", fmt.Errorf("job %s not found", jobID)
}
if handler == nil {
return false, "", fmt.Errorf("no job handler configured")
}
if !force {
// Check if job is due
if job.State.NextRunAtMS == nil || *job.State.NextRunAtMS > nowMS() {
return false, "not-due", nil
}
}
// Execute outside lock with retry
slog.Info("cron manual run", "id", job.ID, "name", job.Name, "force", force)
result, _, err := ExecuteWithRetry(func() (string, error) {
return handler(job)
}, cs.retryCfg)
// Update state
cs.mu.Lock()
defer cs.mu.Unlock()
for i := range cs.store.Jobs {
if cs.store.Jobs[i].ID != jobID {
continue
}
now := nowMS()
cs.store.Jobs[i].State.LastRunAtMS = &now
if err != nil {
cs.store.Jobs[i].State.LastStatus = "error"
cs.store.Jobs[i].State.LastError = err.Error()
} else {
cs.store.Jobs[i].State.LastStatus = "ok"
cs.store.Jobs[i].State.LastError = ""
}
// Recompute next run (unless one-time and delete after run)
if cs.store.Jobs[i].DeleteAfterRun {
cs.store.Jobs = append(cs.store.Jobs[:i], cs.store.Jobs[i+1:]...)
} else {
next := cs.computeNextRun(&cs.store.Jobs[i].Schedule, now)
cs.store.Jobs[i].State.NextRunAtMS = next
}
cs.saveUnsafe()
break
}
// Record run log
cs.recordRun(jobID, err, result)
if err != nil {
return true, "", err
}
return true, result, nil
}
// GetRunLog returns recent run log entries for a job (or all jobs if jobID is empty).
func (cs *Service) GetRunLog(jobID string, limit int) []RunLogEntry {
cs.mu.Lock()
defer cs.mu.Unlock()
if limit <= 0 {
limit = 20
}
var result []RunLogEntry
for i := len(cs.runLog) - 1; i >= 0 && len(result) < limit; i-- {
entry := cs.runLog[i]
if jobID == "" || entry.JobID == jobID {
result = append(result, entry)
}
}
return result
}
func (cs *Service) recordRun(jobID string, err error, resultText string) {
cs.mu.Lock()
defer cs.mu.Unlock()
entry := RunLogEntry{
Ts: nowMS(),
JobID: jobID,
}
if err != nil {
entry.Status = "error"
entry.Error = err.Error()
} else {
entry.Status = "ok"
entry.Summary = TruncateOutput(resultText)
}
cs.runLog = append(cs.runLog, entry)
// Keep last 200 entries in memory
if len(cs.runLog) > 200 {
cs.runLog = cs.runLog[len(cs.runLog)-200:]
}
}
// --- Internal scheduling loop ---
func (cs *Service) runLoop(stopChan chan struct{}) {
ticker := time.NewTicker(1 * time.Second)
defer ticker.Stop()
for {
select {
case <-stopChan:
return
case <-ticker.C:
cs.checkJobs()
}
}
}
func (cs *Service) checkJobs() {
cs.mu.Lock()
now := nowMS()
var dueJobIDs []string
for i := range cs.store.Jobs {
job := &cs.store.Jobs[i]
if job.Enabled && job.State.NextRunAtMS != nil && *job.State.NextRunAtMS <= now {
dueJobIDs = append(dueJobIDs, job.ID)
}
}
if len(dueJobIDs) == 0 {
cs.mu.Unlock()
return
}
// Clear NextRunAtMS to prevent duplicate execution
dueMap := make(map[string]bool, len(dueJobIDs))
for _, id := range dueJobIDs {
dueMap[id] = true
}
for i := range cs.store.Jobs {
if dueMap[cs.store.Jobs[i].ID] {
cs.store.Jobs[i].State.NextRunAtMS = nil
}
}
cs.saveUnsafe()
cs.mu.Unlock()
// Execute jobs in parallel — scheduler enforces per-session serialization
var wg sync.WaitGroup
for _, jobID := range dueJobIDs {
wg.Add(1)
go func(id string) {
defer wg.Done()
cs.executeJobByID(id)
}(jobID)
}
wg.Wait()
}
func (cs *Service) executeJobByID(jobID string) {
cs.mu.Lock()
var job *Job
for i := range cs.store.Jobs {
if cs.store.Jobs[i].ID == jobID {
j := cs.store.Jobs[i] // copy
job = &j
break
}
}
handler := cs.onJob
cs.mu.Unlock()
if job == nil || handler == nil {
return
}
slog.Info("cron executing job", "id", job.ID, "name", job.Name)
result, attempts, err := ExecuteWithRetry(func() (string, error) {
return handler(job)
}, cs.retryCfg)
if attempts > 1 {
slog.Info("cron job retried", "id", job.ID, "attempts", attempts, "success", err == nil)
}
cs.mu.Lock()
defer cs.mu.Unlock()
for i := range cs.store.Jobs {
if cs.store.Jobs[i].ID != jobID {
continue
}
now := nowMS()
cs.store.Jobs[i].State.LastRunAtMS = &now
if err != nil {
cs.store.Jobs[i].State.LastStatus = "error"
cs.store.Jobs[i].State.LastError = err.Error()
slog.Error("cron job failed", "id", jobID, "error", err)
} else {
cs.store.Jobs[i].State.LastStatus = "ok"
cs.store.Jobs[i].State.LastError = ""
slog.Info("cron job completed", "id", jobID, "result", result)
}
// Schedule next run or handle one-time jobs
if cs.store.Jobs[i].DeleteAfterRun {
cs.store.Jobs = append(cs.store.Jobs[:i], cs.store.Jobs[i+1:]...)
} else {
next := cs.computeNextRun(&cs.store.Jobs[i].Schedule, now)
cs.store.Jobs[i].State.NextRunAtMS = next
if next == nil {
cs.store.Jobs[i].Enabled = false
}
}
break
}
cs.saveUnsafe()
}
// --- Schedule computation ---
func (cs *Service) computeNextRun(schedule *Schedule, now int64) *int64 {
switch schedule.Kind {
case "at":
if schedule.AtMS != nil && *schedule.AtMS > now {
return schedule.AtMS
}
return nil
case "every":
if schedule.EveryMS == nil || *schedule.EveryMS <= 0 {
return nil
}
next := now + *schedule.EveryMS
return &next
case "cron":
if schedule.Expr == "" {
return nil
}
nowTime := time.UnixMilli(now)
if schedule.TZ != "" {
if loc, err := time.LoadLocation(schedule.TZ); err == nil {
nowTime = nowTime.In(loc)
}
}
nextTime, err := gronx.NextTickAfter(schedule.Expr, nowTime, false)
if err != nil {
slog.Error("cron: failed to compute next run", "expr", schedule.Expr, "error", err)
return nil
}
nextMS := nextTime.UnixMilli()
return &nextMS
default:
return nil
}
}
func (cs *Service) validateSchedule(schedule *Schedule) error {
switch schedule.Kind {
case "at":
if schedule.AtMS == nil {
return fmt.Errorf("at schedule requires atMs")
}
case "every":
if schedule.EveryMS == nil || *schedule.EveryMS <= 0 {
return fmt.Errorf("every schedule requires positive everyMs")
}
case "cron":
if schedule.Expr == "" {
return fmt.Errorf("cron schedule requires expr")
}
gx := gronx.New()
if !gx.IsValid(schedule.Expr) {
return fmt.Errorf("invalid cron expression: %s", schedule.Expr)
}
if schedule.TZ != "" {
if _, err := time.LoadLocation(schedule.TZ); err != nil {
return fmt.Errorf("invalid timezone: %s", schedule.TZ)
}
}
default:
return fmt.Errorf("unknown schedule kind: %s", schedule.Kind)
}
return nil
}
func (cs *Service) getNextWakeMS() *int64 {
var earliest *int64
for _, job := range cs.store.Jobs {
if job.Enabled && job.State.NextRunAtMS != nil {
if earliest == nil || *job.State.NextRunAtMS < *earliest {
earliest = job.State.NextRunAtMS
}
}
}
return earliest
}
// --- Persistence ---
func (cs *Service) loadUnsafe() error {
data, err := os.ReadFile(cs.storePath)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
return json.Unmarshal(data, &cs.store)
}
func (cs *Service) saveUnsafe() error {
dir := filepath.Dir(cs.storePath)
if err := os.MkdirAll(dir, 0755); err != nil {
return err
}
data, err := json.MarshalIndent(cs.store, "", " ")
if err != nil {
return err
}
return os.WriteFile(cs.storePath, data, 0644)
}