From 901da4545ded423b44d115d933ddb0f62c320c9d Mon Sep 17 00:00:00 2001
From: Derek Menteer
Date: Fri, 2 Feb 2024 12:27:25 -0600
Subject: [PATCH] Fix CICD test flakes by locking container socket.

---
 .../consul-container/libs/cluster/container.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test/integration/consul-container/libs/cluster/container.go b/test/integration/consul-container/libs/cluster/container.go
index eeddd1fe56..8cbacb729d 100644
--- a/test/integration/consul-container/libs/cluster/container.go
+++ b/test/integration/consul-container/libs/cluster/container.go
@@ -13,6 +13,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"strconv"
+	"sync"
 	"time"
 
 	goretry "github.com/avast/retry-go"
@@ -40,6 +41,16 @@ const ServiceUpstreamLocalBindPort = 5000 // local bind Port of service's upstr
 const ServiceUpstreamLocalBindPort2 = 5001 // local bind Port of service's upstream, for services with 2 upstreams
 const debugPort = "4000/tcp"
 
+// containerLock prevents starting multiple containers concurrently. This has not been confirmed as being necessary, but
+// it seems to help make the CICD pipeline pass without failures. These failures seem to be due to some form of docker
+// socket contention with errors of the form:
+//
+// #1: error starting pod with image "docker.mirror.hashicorp.services/hashiderek/pause": Post "http://%2Fvar%2Frun%2Fdocker.sock/v1.43/containers/9b0e568744793e558d318af908c1052ab3d4d2f5a74c67b15d47a0570f141b1c/start": context deadline exceeded: failed to start container
+//
+// It may purely be due to the fact that starting containers takes longer than expected, and this lock avoids starting
+// the context cancel timer until after we have ensured the docker socket is freed up.
+var containerLock sync.Mutex
+
 // consulContainerNode implements the Agent interface by running a Consul agent
 // in a container.
 type consulContainerNode struct {
@@ -600,6 +611,8 @@ func (c *consulContainerNode) DataDir() string {
 }
 
 func startContainer(ctx context.Context, req testcontainers.ContainerRequest) (testcontainers.Container, error) {
+	containerLock.Lock()
+	defer containerLock.Unlock()
 	ctx, cancel := context.WithTimeout(ctx, time.Second*40)
 	defer cancel()
 	return testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{