feat(jobs): improve scheduled tasks with retry logic and queue cleanup

- Add retry configuration to CoolifyTask (3 tries, 600s timeout)
- Add retry configuration to ScheduledTaskJob (3 tries, configurable timeout)
- Add retry configuration to DatabaseBackupJob (2 tries)
- Implement exponential backoff for all jobs (30s, 60s, 120s intervals)
- Add failed() handlers with comprehensive error logging to scheduled-errors channel
- Add execution tracking: started_at, retry_count, duration (decimal), error_details
- Add configurable timeout field to scheduled tasks (60-3600s, default 300s)
- Update UI to include timeout configuration in task creation/editing forms
- Increase ScheduledJobManager lock expiration from 60s to 90s for high-load environments
- Implement safe queue cleanup with restart vs runtime modes
  - Restart mode: aggressive cleanup (marks all processing jobs as failed)
  - Runtime mode: conservative cleanup (only marks jobs >12h as failed, skips deployments)
- Add cleanup:redis --restart flag for system startup
- Integrate cleanup into Dev.php init() for development environment
- Increase scheduled-errors log retention from 7 to 14 days
- Create comprehensive test suite (unit and feature tests)
- Add TESTING_GUIDE.md with manual testing instructions

Fixes issues with jobs failing after single attempt and "attempted too many times" errors
This commit is contained in:
Andras Bacsai
2025-11-10 11:11:18 +01:00
parent 775216e7a5
commit b22e79caec
22 changed files with 762 additions and 37 deletions

View File

@@ -23,6 +23,7 @@ use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Str;
use Throwable;
use Visus\Cuid2\Cuid2;
@@ -31,6 +32,16 @@ class DatabaseBackupJob implements ShouldBeEncrypted, ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
/**
* The number of times the job may be attempted.
*/
public $tries = 2;
/**
* The maximum number of unhandled exceptions to allow before failing.
*/
public $maxExceptions = 1;
public ?Team $team = null;
public Server $server;
@@ -659,17 +670,42 @@ class DatabaseBackupJob implements ShouldBeEncrypted, ShouldQueue
return "{$helperImage}:{$latestVersion}";
}
/**
* Calculate the number of seconds to wait before retrying the job.
*/
public function backoff(): array
{
return [60, 300]; // 1min, 5min between retries
}
public function failed(?Throwable $exception): void
{
Log::channel('scheduled-errors')->error('DatabaseBackup permanently failed', [
'job' => 'DatabaseBackupJob',
'backup_id' => $this->backup->uuid,
'database' => $this->database?->name ?? 'unknown',
'database_type' => get_class($this->database ?? new \stdClass),
'server' => $this->server?->name ?? 'unknown',
'total_attempts' => $this->attempts(),
'error' => $exception?->getMessage(),
'trace' => $exception?->getTraceAsString(),
]);
$log = ScheduledDatabaseBackupExecution::where('uuid', $this->backup_log_uuid)->first();
if ($log) {
$log->update([
'status' => 'failed',
'message' => 'Job failed: '.($exception?->getMessage() ?? 'Unknown error'),
'message' => 'Job permanently failed after '.$this->attempts().' attempts: '.($exception?->getMessage() ?? 'Unknown error'),
'size' => 0,
'filename' => null,
'finished_at' => Carbon::now(),
]);
}
// Notify team about permanent failure
if ($this->team) {
$this->team->notify(new BackupFailed($this->backup, $this->database, $this->backup_output));
}
}
}