fix: correct Sentinel default health status and remove debug logging

This commit addresses container status reporting issues and removes debug logging:

**Primary Fix:**
- Changed PushServerUpdateJob to default to 'unknown' instead of 'unhealthy' when health_status field is missing from Sentinel data
- This ensures containers WITHOUT a healthcheck defined are correctly reported as "unknown", not "unhealthy"
- Matches SSH path behavior (GetContainersStatus) which already defaulted to 'unknown'

**Service Multi-Container Aggregation:**
- Implemented service container status aggregation (same pattern as applications)
- Added serviceContainerStatuses collection to both Sentinel and SSH paths
- Services now aggregate status using priority: unhealthy > unknown > healthy
- Prevents order-dependent results where the last-processed container's status would overwrite the others

**Debug Logging Cleanup:**
- Removed all [STATUS-DEBUG] logging statements (25 total)
- Removed all ray() debugging calls (3 total)
- Removed proof_unknown_preserved and health_status_was_null debug fields
- Code is now production-ready

**Test Coverage:**
- Added 2 new tests for Sentinel default health status behavior
- Added 5 new tests for service aggregation in SSH path
- All 16 tests pass (66 assertions)

**Note:** The root cause was identified as Sentinel (Go binary) also defaulting to "unhealthy". That will need a separate fix in the Sentinel codebase.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Andras Bacsai
2025-11-20 11:10:34 +01:00
parent 747a48b933
commit 14bba8ba86
4 changed files with 447 additions and 119 deletions

View File

@@ -21,7 +21,6 @@ use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\Middleware\WithoutOverlapping;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Log;
use Laravel\Horizon\Contracts\Silenced;
class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
@@ -68,6 +67,8 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
public Collection $applicationContainerStatuses;
public Collection $serviceContainerStatuses;
public bool $foundProxy = false;
public bool $foundLogDrainContainer = false;
@@ -91,6 +92,7 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
$this->foundApplicationPreviewsIds = collect();
$this->foundServiceDatabaseIds = collect();
$this->applicationContainerStatuses = collect();
$this->serviceContainerStatuses = collect();
$this->allApplicationIds = collect();
$this->allDatabaseUuids = collect();
$this->allTcpProxyUuids = collect();
@@ -109,14 +111,6 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
$this->server->sentinelHeartbeat();
$this->containers = collect(data_get($data, 'containers'));
Log::debug('[STATUS-DEBUG] Raw Sentinel data received', [
'source' => 'PushServerUpdateJob',
'container_count' => $this->containers->count(),
'containers' => $this->containers->toArray(),
]);
ray('Raw Sentinel containers:', $this->containers->toArray());
$filesystemUsageRoot = data_get($data, 'filesystem_usage_root.used_percentage');
ServerStorageCheckJob::dispatch($this->server, $filesystemUsageRoot);
@@ -149,25 +143,13 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
foreach ($this->containers as $container) {
$containerStatus = data_get($container, 'state', 'exited');
$containerHealth = data_get($container, 'health_status', 'unhealthy');
$rawHealthStatus = data_get($container, 'health_status');
$containerHealth = $rawHealthStatus ?? 'unknown';
$containerStatus = "$containerStatus ($containerHealth)";
$labels = collect(data_get($container, 'labels'));
$coolify_managed = $labels->has('coolify.managed');
Log::debug('[STATUS-DEBUG] Processing container from Sentinel', [
'source' => 'PushServerUpdateJob (loop)',
'container_name' => data_get($container, 'name'),
'container_status' => $containerStatus,
'labels' => $labels->toArray(),
'has_coolify_managed' => $coolify_managed,
]);
if (! $coolify_managed) {
Log::debug('[STATUS-DEBUG] Container skipped - not coolify managed', [
'source' => 'PushServerUpdateJob',
'container_name' => data_get($container, 'name'),
]);
continue;
}
@@ -191,19 +173,6 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
$containerName = $labels->get('com.docker.compose.service');
if ($containerName) {
$this->applicationContainerStatuses->get($applicationId)->put($containerName, $containerStatus);
Log::debug('[STATUS-DEBUG] Container added to applicationContainerStatuses', [
'source' => 'PushServerUpdateJob',
'application_id' => $applicationId,
'container_name' => $containerName,
'container_status' => $containerStatus,
]);
} else {
Log::debug('[STATUS-DEBUG] Container skipped - no com.docker.compose.service label', [
'source' => 'PushServerUpdateJob',
'container_name' => data_get($container, 'name'),
'application_id' => $applicationId,
'labels' => $labels->toArray(),
]);
}
} else {
$previewKey = $applicationId.':'.$pullRequestId;
@@ -218,12 +187,32 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
$serviceId = $labels->get('coolify.serviceId');
$subType = $labels->get('coolify.service.subType');
$subId = $labels->get('coolify.service.subId');
if ($subType === 'application' && $this->isRunning($containerStatus)) {
$this->foundServiceApplicationIds->push($subId);
$this->updateServiceSubStatus($serviceId, $subType, $subId, $containerStatus);
} elseif ($subType === 'database' && $this->isRunning($containerStatus)) {
$this->foundServiceDatabaseIds->push($subId);
$this->updateServiceSubStatus($serviceId, $subType, $subId, $containerStatus);
if ($subType === 'application') {
if ($this->isRunning($containerStatus)) {
$this->foundServiceApplicationIds->push($subId);
}
// Store container status for aggregation
$key = $serviceId.':'.$subType.':'.$subId;
if (! $this->serviceContainerStatuses->has($key)) {
$this->serviceContainerStatuses->put($key, collect());
}
$containerName = $labels->get('com.docker.compose.service');
if ($containerName) {
$this->serviceContainerStatuses->get($key)->put($containerName, $containerStatus);
}
} elseif ($subType === 'database') {
if ($this->isRunning($containerStatus)) {
$this->foundServiceDatabaseIds->push($subId);
}
// Store container status for aggregation
$key = $serviceId.':'.$subType.':'.$subId;
if (! $this->serviceContainerStatuses->has($key)) {
$this->serviceContainerStatuses->put($key, collect());
}
$containerName = $labels->get('com.docker.compose.service');
if ($containerName) {
$this->serviceContainerStatuses->get($key)->put($containerName, $containerStatus);
}
}
} else {
$uuid = $labels->get('com.docker.compose.service');
@@ -257,27 +246,20 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
// Aggregate multi-container application statuses
$this->aggregateMultiContainerStatuses();
// Aggregate multi-container service statuses
$this->aggregateServiceContainerStatuses();
$this->checkLogDrainContainer();
}
private function aggregateMultiContainerStatuses()
{
Log::debug('[STATUS-DEBUG] Starting aggregation of multi-container application statuses', [
'source' => 'PushServerUpdateJob',
]);
ray('Starting aggregation of multi-container application statuses');
ray($this->applicationContainerStatuses->toArray());
if ($this->applicationContainerStatuses->isEmpty()) {
return;
}
foreach ($this->applicationContainerStatuses as $applicationId => $containerStatuses) {
$application = $this->applications->where('id', $applicationId)->first();
Log::debug('[STATUS-DEBUG] Processing application for aggregation', [
'source' => 'PushServerUpdateJob',
'app_id' => $applicationId,
'container_statuses' => $containerStatuses->toArray(),
]);
if (! $application) {
continue;
}
@@ -345,19 +327,7 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
// All containers are exited
$aggregatedStatus = 'exited (unhealthy)';
}
Log::debug('[STATUS-DEBUG] Sentinel status change', [
'source' => 'PushServerUpdateJob',
'app_id' => $application->id,
'app_name' => $application->name,
'old_status' => $application->status,
'new_status' => $aggregatedStatus,
'container_statuses' => $relevantStatuses->toArray(),
'flags' => [
'hasRunning' => $hasRunning,
'hasUnhealthy' => $hasUnhealthy,
'hasUnknown' => $hasUnknown,
],
]);
// Update application status with aggregated result
if ($aggregatedStatus && $application->status !== $aggregatedStatus) {
@@ -367,6 +337,105 @@ class PushServerUpdateJob implements ShouldBeEncrypted, ShouldQueue, Silenced
}
}
/**
 * Collapse the per-container statuses gathered in $this->serviceContainerStatuses
 * into one status per service sub-resource and persist it when it changed.
 *
 * Each entry is keyed "serviceId:subType:subId" and maps a compose service name
 * (the com.docker.compose.service label) to its "state (health)" string.
 * Containers whose compose definition sets `exclude_from_hc` or `restart: no`
 * are ignored. Among the remaining running containers the resulting health is
 * chosen with priority unhealthy > unknown > healthy; when no remaining
 * container is running the status becomes "exited (unhealthy)".
 */
private function aggregateServiceContainerStatuses()
{
    if ($this->serviceContainerStatuses->isEmpty()) {
        return;
    }

    // Excluded compose service names, memoized per service id: several
    // sub-resources usually belong to the same service, and its compose
    // file does not change while this loop runs, so parse it only once.
    $excludedByService = [];

    foreach ($this->serviceContainerStatuses as $key => $containerStatuses) {
        // Parse key: serviceId:subType:subId
        [$serviceId, $subType, $subId] = explode(':', $key);

        $service = $this->services->where('id', $serviceId)->first();
        if (! $service) {
            continue;
        }

        // Get the service sub-resource (ServiceApplication or ServiceDatabase)
        $subResource = null;
        if ($subType === 'application') {
            $subResource = $service->applications()->where('id', $subId)->first();
        } elseif ($subType === 'database') {
            $subResource = $service->databases()->where('id', $subId)->first();
        }
        if (! $subResource) {
            continue;
        }

        if (! array_key_exists($serviceId, $excludedByService)) {
            $excluded = collect();
            $dockerComposeRaw = data_get($service, 'docker_compose_raw');
            if ($dockerComposeRaw) {
                try {
                    $dockerCompose = \Symfony\Component\Yaml\Yaml::parse($dockerComposeRaw);
                    foreach (data_get($dockerCompose, 'services', []) as $serviceName => $serviceConfig) {
                        $excludeFromHc = data_get($serviceConfig, 'exclude_from_hc', false);
                        $restartPolicy = data_get($serviceConfig, 'restart', 'always');
                        if ($excludeFromHc || $restartPolicy === 'no') {
                            $excluded->push($serviceName);
                        }
                    }
                } catch (\Exception $e) {
                    // Deliberate best effort: if the compose file cannot be
                    // parsed, keep whatever was collected so far and treat
                    // the remaining containers as included.
                }
            }
            $excludedByService[$serviceId] = $excluded;
        }
        $excludedContainers = $excludedByService[$serviceId];

        // Filter out excluded containers
        $relevantStatuses = $containerStatuses->filter(function ($status, $containerName) use ($excludedContainers) {
            return ! $excludedContainers->contains($containerName);
        });

        // If all containers are excluded, don't update status
        if ($relevantStatuses->isEmpty()) {
            continue;
        }

        // Health flags are only taken from containers that are running.
        $hasRunning = false;
        $hasUnhealthy = false;
        $hasUnknown = false;
        foreach ($relevantStatuses as $status) {
            if (! str($status)->contains('running')) {
                continue;
            }
            $hasRunning = true;
            if (str($status)->contains('unhealthy')) {
                $hasUnhealthy = true;
            }
            if (str($status)->contains('unknown')) {
                $hasUnknown = true;
            }
        }

        // Priority: unhealthy > unknown > healthy; nothing running => exited.
        if (! $hasRunning) {
            $aggregatedStatus = 'exited (unhealthy)';
        } elseif ($hasUnhealthy) {
            $aggregatedStatus = 'running (unhealthy)';
        } elseif ($hasUnknown) {
            $aggregatedStatus = 'running (unknown)';
        } else {
            $aggregatedStatus = 'running (healthy)';
        }

        // Only write when the status actually changed.
        if ($subResource->status !== $aggregatedStatus) {
            $subResource->status = $aggregatedStatus;
            $subResource->save();
        }
    }
}
private function updateApplicationStatus(string $applicationId, string $containerStatus)
{
$application = $this->applications->where('id', $applicationId)->first();