From 7cf6bb78d69f3442bc2cf9a5a8f02f003f67a441 Mon Sep 17 00:00:00 2001 From: Oscar Zhou <100548325+oscarzhou-portainer@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:01:13 +1200 Subject: [PATCH] fix(container): inaccurate healthy container count [BE-2290] (#1114) --- api/docker/container_stats.go | 37 ---- api/docker/container_stats_test.go | 27 --- api/docker/stats/container_stats.go | 92 +++++++++ api/docker/stats/container_stats_test.go | 234 +++++++++++++++++++++++ api/http/handler/docker/dashboard.go | 21 +- go.mod | 1 + go.sum | 4 - pkg/snapshot/docker.go | 43 +---- 8 files changed, 347 insertions(+), 112 deletions(-) delete mode 100644 api/docker/container_stats.go delete mode 100644 api/docker/container_stats_test.go create mode 100644 api/docker/stats/container_stats.go create mode 100644 api/docker/stats/container_stats_test.go diff --git a/api/docker/container_stats.go b/api/docker/container_stats.go deleted file mode 100644 index 321d2e22c..000000000 --- a/api/docker/container_stats.go +++ /dev/null @@ -1,37 +0,0 @@ -package docker - -import "github.com/docker/docker/api/types" - -type ContainerStats struct { - Running int `json:"running"` - Stopped int `json:"stopped"` - Healthy int `json:"healthy"` - Unhealthy int `json:"unhealthy"` - Total int `json:"total"` -} - -func CalculateContainerStats(containers []types.Container) ContainerStats { - var running, stopped, healthy, unhealthy int - for _, container := range containers { - switch container.State { - case "running": - running++ - case "healthy": - running++ - healthy++ - case "unhealthy": - running++ - unhealthy++ - case "exited", "stopped": - stopped++ - } - } - - return ContainerStats{ - Running: running, - Stopped: stopped, - Healthy: healthy, - Unhealthy: unhealthy, - Total: len(containers), - } -} diff --git a/api/docker/container_stats_test.go b/api/docker/container_stats_test.go deleted file mode 100644 index 5422a276c..000000000 --- a/api/docker/container_stats_test.go +++ /dev/null @@ -1,27 +0,0 @@ -package docker - -import ( - "testing" - - "github.com/docker/docker/api/types" - "github.com/stretchr/testify/assert" -) - -func TestCalculateContainerStats(t *testing.T) { - containers := []types.Container{ - {State: "running"}, - {State: "running"}, - {State: "exited"}, - {State: "stopped"}, - {State: "healthy"}, - {State: "unhealthy"}, - } - - stats := CalculateContainerStats(containers) - - assert.Equal(t, 4, stats.Running) - assert.Equal(t, 2, stats.Stopped) - assert.Equal(t, 1, stats.Healthy) - assert.Equal(t, 1, stats.Unhealthy) - assert.Equal(t, 6, stats.Total) -} diff --git a/api/docker/stats/container_stats.go b/api/docker/stats/container_stats.go new file mode 100644 index 000000000..15e88f407 --- /dev/null +++ b/api/docker/stats/container_stats.go @@ -0,0 +1,92 @@ +package stats + +import ( + "context" + "errors" + "sync" + + "github.com/docker/docker/api/types/container" +) + +type ContainerStats struct { + Running int `json:"running"` + Stopped int `json:"stopped"` + Healthy int `json:"healthy"` + Unhealthy int `json:"unhealthy"` + Total int `json:"total"` +} + +type DockerClient interface { + ContainerInspect(ctx context.Context, containerID string) (container.InspectResponse, error) +} + +func CalculateContainerStats(ctx context.Context, cli DockerClient, containers []container.Summary) (ContainerStats, error) { + var running, stopped, healthy, unhealthy int + + var mu sync.Mutex + var wg sync.WaitGroup + semaphore := make(chan struct{}, 5) + + var aggErr error + var aggMu sync.Mutex + + for i := range containers { + id := containers[i].ID + semaphore <- struct{}{} + wg.Go(func() { + defer func() { <-semaphore }() + + containerInspection, err := cli.ContainerInspect(ctx, id) + stat := ContainerStats{} + if err != nil { + aggMu.Lock() + aggErr = errors.Join(aggErr, err) + aggMu.Unlock() + return + } + stat = getContainerStatus(containerInspection.State) + + mu.Lock() + running += stat.Running + stopped += stat.Stopped + healthy += stat.Healthy + unhealthy += stat.Unhealthy + mu.Unlock() + }) + } + + wg.Wait() + + return ContainerStats{ + Running: running, + Stopped: stopped, + Healthy: healthy, + Unhealthy: unhealthy, + Total: len(containers), + }, aggErr +} + +func getContainerStatus(state *container.State) ContainerStats { + stat := ContainerStats{} + if state == nil { + return stat + } + + switch state.Status { + case container.StateRunning: + stat.Running++ + case container.StateExited, container.StateDead: + stat.Stopped++ + } + + if state.Health != nil { + switch state.Health.Status { + case container.Healthy: + stat.Healthy++ + case container.Unhealthy: + stat.Unhealthy++ + } + } + + return stat +} diff --git a/api/docker/stats/container_stats_test.go b/api/docker/stats/container_stats_test.go new file mode 100644 index 000000000..ae2fa6e2f --- /dev/null +++ b/api/docker/stats/container_stats_test.go @@ -0,0 +1,234 @@ +package stats + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/docker/docker/api/types/container" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +// MockDockerClient implements the DockerClient interface for testing +type MockDockerClient struct { + mock.Mock +} + +func (m *MockDockerClient) ContainerInspect(ctx context.Context, containerID string) (container.InspectResponse, error) { + args := m.Called(ctx, containerID) + return args.Get(0).(container.InspectResponse), args.Error(1) +} + +func TestCalculateContainerStats(t *testing.T) { + mockClient := new(MockDockerClient) + + // Test containers - using enough containers to test concurrent processing + containers := []container.Summary{ + {ID: "container1"}, + {ID: "container2"}, + {ID: "container3"}, + {ID: "container4"}, + {ID: "container5"}, + {ID: "container6"}, + {ID: "container7"}, + {ID: "container8"}, + {ID: "container9"}, + {ID: "container10"}, + } + + // Setup mock expectations with different container states to test various scenarios + containerStates := []struct { + id string + status string + health *container.Health + expected ContainerStats + }{ + {"container1", container.StateRunning, &container.Health{Status: container.Healthy}, ContainerStats{Running: 1, Stopped: 0, Healthy: 1, Unhealthy: 0}}, + {"container2", container.StateRunning, &container.Health{Status: container.Unhealthy}, ContainerStats{Running: 1, Stopped: 0, Healthy: 0, Unhealthy: 1}}, + {"container3", container.StateRunning, nil, ContainerStats{Running: 1, Stopped: 0, Healthy: 0, Unhealthy: 0}}, + {"container4", container.StateExited, nil, ContainerStats{Running: 0, Stopped: 1, Healthy: 0, Unhealthy: 0}}, + {"container5", container.StateDead, nil, ContainerStats{Running: 0, Stopped: 1, Healthy: 0, Unhealthy: 0}}, + {"container6", container.StateRunning, &container.Health{Status: container.Healthy}, ContainerStats{Running: 1, Stopped: 0, Healthy: 1, Unhealthy: 0}}, + {"container7", container.StateRunning, &container.Health{Status: container.Unhealthy}, ContainerStats{Running: 1, Stopped: 0, Healthy: 0, Unhealthy: 1}}, + {"container8", container.StateExited, nil, ContainerStats{Running: 0, Stopped: 1, Healthy: 0, Unhealthy: 0}}, + {"container9", container.StateRunning, nil, ContainerStats{Running: 1, Stopped: 0, Healthy: 0, Unhealthy: 0}}, + {"container10", container.StateDead, nil, ContainerStats{Running: 0, Stopped: 1, Healthy: 0, Unhealthy: 0}}, + } + + expected := ContainerStats{} + // Setup mock expectations for all containers with artificial delays to simulate real Docker calls + for _, state := range containerStates { + mockClient.On("ContainerInspect", mock.Anything, state.id).Return(container.InspectResponse{ + ContainerJSONBase: &container.ContainerJSONBase{ + State: &container.State{ + Status: state.status, + Health: state.health, + }, + }, + }, nil).After(50 * time.Millisecond) // Simulate 50ms Docker API call + + expected.Running += state.expected.Running + expected.Stopped += state.expected.Stopped + expected.Healthy += state.expected.Healthy + expected.Unhealthy += state.expected.Unhealthy + expected.Total++ + } + + // Call the function and measure time + startTime := time.Now() + stats, err := CalculateContainerStats(context.Background(), mockClient, containers) + require.NoError(t, err, "failed to calculate container stats") + duration := time.Since(startTime) + + // Assert results + assert.Equal(t, expected, stats) + assert.Equal(t, expected.Running, stats.Running) + assert.Equal(t, expected.Stopped, stats.Stopped) + assert.Equal(t, expected.Healthy, stats.Healthy) + assert.Equal(t, expected.Unhealthy, stats.Unhealthy) + assert.Equal(t, 10, stats.Total) + + // Verify concurrent processing by checking that all mock calls were made + mockClient.AssertExpectations(t) + + // Test concurrency: With 5 workers and 10 containers taking 50ms each: + // Sequential would take: 10 * 50ms = 500ms + sequentialTime := 10 * 50 * time.Millisecond + + // Verify that concurrent processing is actually faster than sequential + // Allow some overhead for goroutine scheduling + assert.Less(t, duration, sequentialTime, "Concurrent processing should be faster than sequential") + // Concurrent should take: ~100-150ms (depending on scheduling) + assert.Less(t, duration, 150*time.Millisecond, "Concurrent processing should be significantly faster") + assert.Greater(t, duration, 100*time.Millisecond, "Concurrent processing should be longer than 100ms") +} + +func TestCalculateContainerStatsAllErrors(t *testing.T) { + mockClient := new(MockDockerClient) + + // Test containers + containers := []container.Summary{ + {ID: "container1"}, + {ID: "container2"}, + } + + // Setup mock expectations with all calls returning errors + mockClient.On("ContainerInspect", mock.Anything, "container1").Return(container.InspectResponse{}, errors.New("network error")) + mockClient.On("ContainerInspect", mock.Anything, "container2").Return(container.InspectResponse{}, errors.New("permission denied")) + + // Call the function + stats, err := CalculateContainerStats(context.Background(), mockClient, containers) + + // Assert that an error was returned + require.Error(t, err, "should return error when all containers fail to inspect") + assert.Contains(t, err.Error(), "network error", "error should contain one of the original error messages") + assert.Contains(t, err.Error(), "permission denied", "error should contain the other original error message") + + // Assert that stats are zero since no containers were successfully processed + expectedStats := ContainerStats{ + Running: 0, + Stopped: 0, + Healthy: 0, + Unhealthy: 0, + Total: 2, // total containers processed + } + assert.Equal(t, expectedStats, stats) + + // Verify all mock calls were made + mockClient.AssertExpectations(t) +} + +func TestGetContainerStatus(t *testing.T) { + testCases := []struct { + name string + state *container.State + expected ContainerStats + }{ + { + name: "running healthy container", + state: &container.State{ + Status: container.StateRunning, + Health: &container.Health{ + Status: container.Healthy, + }, + }, + expected: ContainerStats{ + Running: 1, + Stopped: 0, + Healthy: 1, + Unhealthy: 0, + }, + }, + { + name: "running unhealthy container", + state: &container.State{ + Status: container.StateRunning, + Health: &container.Health{ + Status: container.Unhealthy, + }, + }, + expected: ContainerStats{ + Running: 1, + Stopped: 0, + Healthy: 0, + Unhealthy: 1, + }, + }, + { + name: "running container without health check", + state: &container.State{ + Status: container.StateRunning, + }, + expected: ContainerStats{ + Running: 1, + Stopped: 0, + Healthy: 0, + Unhealthy: 0, + }, + }, + { + name: "exited container", + state: &container.State{ + Status: container.StateExited, + }, + expected: ContainerStats{ + Running: 0, + Stopped: 1, + Healthy: 0, + Unhealthy: 0, + }, + }, + { + name: "dead container", + state: &container.State{ + Status: container.StateDead, + }, + expected: ContainerStats{ + Running: 0, + Stopped: 1, + Healthy: 0, + Unhealthy: 0, + }, + }, + { + name: "nil state", + state: nil, + expected: ContainerStats{ + Running: 0, + Stopped: 0, + Healthy: 0, + Unhealthy: 0, + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + stat := getContainerStatus(testCase.state) + assert.Equal(t, testCase.expected, stat) + }) + } +} diff --git a/api/http/handler/docker/dashboard.go b/api/http/handler/docker/dashboard.go index c220106b7..f903711ec 100644 --- a/api/http/handler/docker/dashboard.go +++ b/api/http/handler/docker/dashboard.go @@ -11,7 +11,7 @@ import ( "github.com/docker/docker/api/types/volume" portainer "github.com/portainer/portainer/api" "github.com/portainer/portainer/api/dataservices" - "github.com/portainer/portainer/api/docker" + "github.com/portainer/portainer/api/docker/stats" "github.com/portainer/portainer/api/http/errors" "github.com/portainer/portainer/api/http/handler/docker/utils" "github.com/portainer/portainer/api/http/middlewares" @@ -26,12 +26,12 @@ type imagesCounters struct { } type dashboardResponse struct { - Containers docker.ContainerStats `json:"containers"` - Services int `json:"services"` - Images imagesCounters `json:"images"` - Volumes int `json:"volumes"` - Networks int `json:"networks"` - Stacks int `json:"stacks"` + Containers stats.ContainerStats `json:"containers"` + Services int `json:"services"` + Images imagesCounters `json:"images"` + Volumes int `json:"volumes"` + Networks int `json:"networks"` + Stacks int `json:"stacks"` } // @id dockerDashboard @@ -144,13 +144,18 @@ func (h *Handler) dashboard(w http.ResponseWriter, r *http.Request) *httperror.H stackCount = len(stacks) } + containersStats, err := stats.CalculateContainerStats(r.Context(), cli, containers) + if err != nil { + return httperror.InternalServerError("Unable to retrieve Docker containers stats", err) + } + resp = dashboardResponse{ Images: imagesCounters{ Total: len(images), Size: totalSize, }, Services: len(services), - Containers: docker.CalculateContainerStats(containers), + Containers: containersStats, Networks: len(networks), Volumes: len(volumes), Stacks: stackCount, diff --git a/go.mod b/go.mod index d73c23a77..eb1dceeea 100644 --- a/go.mod +++ b/go.mod @@ -254,6 +254,7 @@ require ( github.com/spf13/cast v1.7.0 // indirect github.com/spf13/cobra v1.9.1 // indirect github.com/spf13/pflag v1.0.7 // indirect + github.com/stretchr/objx v0.5.2 // indirect github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect github.com/theupdateframework/notary v0.7.0 // indirect github.com/tilt-dev/fsnotify v1.4.8-0.20220602155310-fff9c274a375 // indirect diff --git a/go.sum b/go.sum index 6f45c99d0..f569ea9be 100644 --- a/go.sum +++ b/go.sum @@ -785,8 +785,6 @@ github.com/zmap/zcrypto v0.0.0-20241123155728-2916694fa469/go.mod h1:sUuKi10EbW7 github.com/zmap/zlint/v3 v3.0.0/go.mod h1:paGwFySdHIBEMJ61YjoqT4h7Ge+fdYG4sUQhnTb1lJ8= github.com/zmap/zlint/v3 v3.6.4 h1:r2kHfRF7mIsxW0IH4Og2iZnrlpCLTZBFjnXy1x/ZnZI= github.com/zmap/zlint/v3 v3.6.4/go.mod h1:KQLVUquVaO5YJDl5a4k/7RPIbIW2v66+sRoBPNZusI8= -go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= -go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo= go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= @@ -955,8 +953,6 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= -golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= diff --git a/pkg/snapshot/docker.go b/pkg/snapshot/docker.go index 12f3a3089..255186063 100644 --- a/pkg/snapshot/docker.go +++ b/pkg/snapshot/docker.go @@ -10,6 +10,7 @@ import ( portainer "github.com/portainer/portainer/api" "github.com/portainer/portainer/api/docker/consts" + "github.com/portainer/portainer/api/docker/stats" edgeutils "github.com/portainer/portainer/pkg/edge" networkingutils "github.com/portainer/portainer/pkg/networking" @@ -131,7 +132,8 @@ func dockerSnapshotSwarmServices(snapshot *portainer.DockerSnapshot, cli *client } func dockerSnapshotContainers(snapshot *portainer.DockerSnapshot, cli *client.Client) error { - containers, err := cli.ContainerList(context.Background(), dockercontainer.ListOptions{All: true}) + ctx := context.Background() + containers, err := cli.ContainerList(ctx, dockercontainer.ListOptions{All: true}) if err != nil { return err } @@ -207,7 +209,10 @@ func dockerSnapshotContainers(snapshot *portainer.DockerSnapshot, cli *client.Cl snapshot.GpuUseAll = gpuUseAll snapshot.GpuUseList = gpuUseList - stats := calculateContainerStats(containers) + stats, err := stats.CalculateContainerStats(ctx, cli, containers) + if err != nil { + return fmt.Errorf("failed to calculate container stats: %w", err) + } snapshot.ContainerCount = stats.Total snapshot.RunningContainerCount = stats.Running @@ -344,37 +349,3 @@ func isPodman(version types.Version) bool { return false } - -type ContainerStats struct { - Running int - Stopped int - Healthy int - Unhealthy int - Total int -} - -func calculateContainerStats(containers []types.Container) ContainerStats { - var running, stopped, healthy, unhealthy int - for _, container := range containers { - switch container.State { - case "running": - running++ - case "healthy": - running++ - healthy++ - case "unhealthy": - running++ - unhealthy++ - case "exited", "stopped": - stopped++ - } - } - - return ContainerStats{ - Running: running, - Stopped: stopped, - Healthy: healthy, - Unhealthy: unhealthy, - Total: len(containers), - } -}