mirror of
https://github.com/tiennm99/coolify.git
synced 2026-04-17 19:21:36 +00:00
feat(jobs): improve scheduled tasks with retry logic and queue cleanup
- Add retry configuration to CoolifyTask (3 tries, 600s timeout)
- Add retry configuration to ScheduledTaskJob (3 tries, configurable timeout)
- Add retry configuration to DatabaseBackupJob (2 tries)
- Implement exponential backoff for all jobs (30s, 60s, 120s intervals)
- Add failed() handlers with comprehensive error logging to scheduled-errors channel
- Add execution tracking: started_at, retry_count, duration (decimal), error_details
- Add configurable timeout field to scheduled tasks (60-3600s, default 300s)
- Update UI to include timeout configuration in task creation/editing forms
- Increase ScheduledJobManager lock expiration from 60s to 90s for high-load environments
- Implement safe queue cleanup with restart vs runtime modes
  - Restart mode: aggressive cleanup (marks all processing jobs as failed)
  - Runtime mode: conservative cleanup (only marks jobs >12h as failed, skips deployments)
- Add cleanup:redis --restart flag for system startup
- Integrate cleanup into Dev.php init() for development environment
- Increase scheduled-errors log retention from 7 to 14 days
- Create comprehensive test suite (unit and feature tests)
- Add TESTING_GUIDE.md with manual testing instructions

Fixes issues with jobs failing after single attempt and "attempted too many times" errors
This commit is contained in:
@@ -7,7 +7,7 @@ use Illuminate\Support\Facades\Redis;
|
||||
|
||||
class CleanupRedis extends Command
|
||||
{
|
||||
protected $signature = 'cleanup:redis {--dry-run : Show what would be deleted without actually deleting} {--skip-overlapping : Skip overlapping queue cleanup} {--clear-locks : Clear stale WithoutOverlapping locks}';
|
||||
protected $signature = 'cleanup:redis {--dry-run : Show what would be deleted without actually deleting} {--skip-overlapping : Skip overlapping queue cleanup} {--clear-locks : Clear stale WithoutOverlapping locks} {--restart : Aggressive cleanup mode for system restart (marks all processing jobs as failed)}';
|
||||
|
||||
protected $description = 'Cleanup Redis (Horizon jobs, metrics, overlapping queues, cache locks, and related data)';
|
||||
|
||||
@@ -63,6 +63,14 @@ class CleanupRedis extends Command
|
||||
$deletedCount += $locksCleaned;
|
||||
}
|
||||
|
||||
// Clean up stuck jobs (restart mode = aggressive, runtime mode = conservative)
|
||||
$isRestart = $this->option('restart');
|
||||
if ($isRestart || $this->option('clear-locks')) {
|
||||
$this->info($isRestart ? 'Cleaning up stuck jobs (RESTART MODE - aggressive)...' : 'Checking for stuck jobs (runtime mode - conservative)...');
|
||||
$jobsCleaned = $this->cleanupStuckJobs($redis, $prefix, $dryRun, $isRestart);
|
||||
$deletedCount += $jobsCleaned;
|
||||
}
|
||||
|
||||
if ($dryRun) {
|
||||
$this->info("DRY RUN: Would delete {$deletedCount} out of {$totalKeys} keys");
|
||||
} else {
|
||||
@@ -332,4 +340,98 @@ class CleanupRedis extends Command
|
||||
|
||||
return $cleanedCount;
|
||||
}
|
||||
|
||||
/**
 * Mark stuck jobs as failed, aggressively on restart and conservatively at runtime.
 *
 * Walks every key returned by the connection, keeps only hash keys whose `status`
 * field is "processing" or "reserved", and then decides per job:
 *  - restart mode: every such job is marked failed;
 *  - runtime mode: ApplicationDeploymentJob and DatabaseBackupJob are never touched,
 *    and any other job is marked failed only when it is known to have been
 *    running for more than 12 hours. Jobs without a usable timestamp are left alone.
 *
 * In dry-run mode nothing is written; the jobs that would be failed are only reported.
 *
 * @param  mixed  $redis  Redis connection
 * @param  string  $prefix  Horizon prefix
 * @param  bool  $dryRun  Dry run mode
 * @param  bool  $isRestart  Restart mode (aggressive) vs runtime mode (conservative)
 * @return int Number of jobs cleaned (or that would be cleaned in dry-run mode)
 */
private function cleanupStuckJobs($redis, string $prefix, bool $dryRun, bool $isRestart): int
{
    $cleanedCount = 0;
    $now = time();

    // KEYS '*' returns every key visible on this connection, not just job hashes;
    // the type and status checks below narrow the set down.
    $keys = $redis->keys('*');

    foreach ($keys as $key) {
        // Strip the prefix only from the start of the key. A blanket str_replace()
        // would also remove the same text if it happened to occur inside the key.
        $keyWithoutPrefix = ($prefix !== '' && str_starts_with($key, $prefix))
            ? substr($key, strlen($prefix))
            : $key;

        $type = $redis->command('type', [$keyWithoutPrefix]);

        // Only process hash-type keys (individual jobs).
        // NOTE(review): 5 is presumably phpredis' Redis::REDIS_HASH - confirm for the client in use.
        if ($type !== 5) {
            continue;
        }

        $data = $redis->command('hgetall', [$keyWithoutPrefix]);
        $status = data_get($data, 'status');
        $payload = data_get($data, 'payload');

        // Only process jobs in "processing" or "reserved" state
        if (! in_array($status, ['processing', 'reserved'], true)) {
            continue;
        }

        // A hash without a payload must not be passed to json_decode() as null.
        $payloadData = is_string($payload) ? json_decode($payload, true) : null;
        $jobClass = data_get($payloadData, 'displayName', 'Unknown');

        // Work out when the job started. Prefer the reservation time, then fall back
        // to the enqueue time.
        // NOTE(review): `reserved_at` and the payload's `pushedAt` are assumed Horizon
        // field names - confirm against the Horizon version in use.
        $startedAt = 0;
        $timestampCandidates = [
            data_get($data, 'reserved_at'),
            data_get($data, 'pushed_at'),
            data_get($payloadData, 'pushedAt'),
        ];
        foreach ($timestampCandidates as $candidate) {
            if (is_numeric($candidate) && (int) $candidate > 0) {
                $startedAt = (int) $candidate;
                break;
            }
        }

        // null means "age unknown". Defaulting to 0 would make the job look
        // decades old and get it failed in runtime mode.
        $processingTime = $startedAt > 0 ? $now - $startedAt : null;

        $shouldFail = false;
        $reason = '';

        if ($isRestart) {
            // RESTART MODE: Mark ALL processing/reserved jobs as failed
            // Safe because all workers are dead on restart
            $shouldFail = true;
            $reason = 'System restart - all workers terminated';
        } else {
            // RUNTIME MODE: Only mark truly stuck jobs as failed
            // Be conservative to avoid killing legitimate long-running jobs

            // Skip ApplicationDeploymentJob entirely (has dynamic_timeout, can run 2+ hours)
            if (str_contains($jobClass, 'ApplicationDeploymentJob')) {
                continue;
            }

            // Skip DatabaseBackupJob (large backups can take hours)
            if (str_contains($jobClass, 'DatabaseBackupJob')) {
                continue;
            }

            // Without a timestamp we cannot prove the job is stuck; leave it alone.
            if ($processingTime === null) {
                continue;
            }

            // For other jobs, only fail if processing > 12 hours
            if ($processingTime > 43200) { // 12 hours
                $shouldFail = true;
                $reason = 'Processing for more than 12 hours';
            }
        }

        if (! $shouldFail) {
            continue;
        }

        $elapsed = $processingTime === null
            ? 'unknown time'
            : round($processingTime / 60, 1).' min';

        if ($dryRun) {
            $this->warn(" Would mark as FAILED: {$jobClass} (processing for {$elapsed}) - {$reason}");
        } else {
            // Mark job as failed
            $redis->command('hset', [$keyWithoutPrefix, 'status', 'failed']);
            $redis->command('hset', [$keyWithoutPrefix, 'failed_at', $now]);
            $redis->command('hset', [$keyWithoutPrefix, 'exception', "Job cleaned up by cleanup:redis - {$reason}"]);

            $this->info(" ✓ Marked as FAILED: {$jobClass} (processing for {$elapsed}) - {$reason}");
        }

        $cleanedCount++;
    }

    if ($cleanedCount === 0) {
        $this->info($isRestart ? ' No jobs to clean up' : ' No stuck jobs found (all jobs running normally)');
    }

    return $cleanedCount;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user