From a062c36ff50d0a02889b787cb825d0f97b3783a7 Mon Sep 17 00:00:00 2001 From: andres-portainer <91705312+andres-portainer@users.noreply.github.com> Date: Tue, 12 Sep 2023 17:52:52 -0300 Subject: [PATCH] fix(gitops): avoid cancelling the auto updates for any error EE-5604 (#10295) --- api/scheduler/scheduler.go | 30 ++++++++++++++++++++++++++---- api/scheduler/scheduler_test.go | 26 ++++++++++++++++++++++++-- api/stacks/deployments/deploy.go | 19 ++++++++++++++++--- 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/api/scheduler/scheduler.go b/api/scheduler/scheduler.go index 82e06ccec..d779175b5 100644 --- a/api/scheduler/scheduler.go +++ b/api/scheduler/scheduler.go @@ -17,6 +17,18 @@ type Scheduler struct { mu sync.Mutex } +type PermanentError struct { + err error +} + +func NewPermanentError(err error) *PermanentError { + return &PermanentError{err: err} +} + +func (e *PermanentError) Error() string { + return e.err.Error() +} + func NewScheduler(ctx context.Context) *Scheduler { crontab := cron.New(cron.WithChain(cron.Recover(cron.DefaultLogger))) crontab.Start() @@ -84,14 +96,24 @@ func (s *Scheduler) StopJob(jobID string) error { func (s *Scheduler) StartJobEvery(duration time.Duration, job func() error) string { ctx, cancel := context.WithCancel(context.Background()) - j := cron.FuncJob(func() { - if err := job(); err != nil { - log.Debug().Msg("job returned an error") + jobFn := cron.FuncJob(func() { + err := job() + if err == nil { + return + } + + var permErr *PermanentError + if errors.As(err, &permErr) { + log.Error().Err(permErr).Msg("job returned a permanent error, it will be stopped") cancel() + + return } + + log.Error().Err(err).Msg("job returned an error, it will be rescheduled") }) - entryID := s.crontab.Schedule(cron.Every(duration), j) + entryID := s.crontab.Schedule(cron.Every(duration), jobFn) s.mu.Lock() s.activeJobs[entryID] = cancel diff --git a/api/scheduler/scheduler_test.go b/api/scheduler/scheduler_test.go index 56377b4bf..126c00b6e 100644 --- a/api/scheduler/scheduler_test.go +++ b/api/scheduler/scheduler_test.go @@ -49,7 +49,7 @@ func Test_JobCanBeStopped(t *testing.T) { assert.False(t, workDone, "job shouldn't had a chance to run") } -func Test_JobShouldStop_UponError(t *testing.T) { +func Test_JobShouldStop_UponPermError(t *testing.T) { s := NewScheduler(context.Background()) defer s.Shutdown() @@ -58,7 +58,7 @@ func Test_JobShouldStop_UponError(t *testing.T) { s.StartJobEvery(jobInterval, func() error { acc++ close(ch) - return fmt.Errorf("failed") + return NewPermanentError(fmt.Errorf("failed")) }) <-time.After(3 * jobInterval) @@ -66,6 +66,28 @@ func Test_JobShouldStop_UponError(t *testing.T) { assert.Equal(t, 1, acc, "job stop after the first run because it returns an error") } +func Test_JobShouldNotStop_UponError(t *testing.T) { + s := NewScheduler(context.Background()) + defer s.Shutdown() + + var acc int + ch := make(chan struct{}) + s.StartJobEvery(jobInterval, func() error { + acc++ + + if acc == 2 { + close(ch) + return NewPermanentError(fmt.Errorf("failed")) + } + + return errors.New("non-permanent error") + }) + + <-time.After(3 * jobInterval) + <-ch + assert.Equal(t, 2, acc) +} + func Test_CanTerminateAllJobs_ByShuttingDownScheduler(t *testing.T) { s := NewScheduler(context.Background()) diff --git a/api/stacks/deployments/deploy.go b/api/stacks/deployments/deploy.go index 491074ec4..d6d5e51f0 100644 --- a/api/stacks/deployments/deploy.go +++ b/api/stacks/deployments/deploy.go @@ -8,6 +8,7 @@ import ( "github.com/portainer/portainer/api/dataservices" "github.com/portainer/portainer/api/git/update" "github.com/portainer/portainer/api/http/security" + "github.com/portainer/portainer/api/scheduler" "github.com/portainer/portainer/api/stacks/stackutils" "github.com/pkg/errors" @@ -29,7 +30,9 @@ func RedeployWhenChanged(stackID portainer.StackID, deployer StackDeployer, data log.Debug().Int("stack_id", int(stackID)).Msg("redeploying stack") stack, err := datastore.Stack().Read(stackID) - if err != nil { + if dataservices.IsErrObjectNotFound(err) { + return scheduler.NewPermanentError(errors.WithMessagef(err, "failed to get the stack %v", stackID)) + } else if err != nil { return errors.WithMessagef(err, "failed to get the stack %v", stackID) } @@ -38,7 +41,15 @@ func RedeployWhenChanged(stackID portainer.StackID, deployer StackDeployer, data } endpoint, err := datastore.Endpoint().Endpoint(stack.EndpointID) - if err != nil { + if dataservices.IsErrObjectNotFound(err) { + return scheduler.NewPermanentError( + errors.WithMessagef(err, + "failed to find the environment %v associated to the stack %v", + stack.EndpointID, + stack.ID, + ), + ) + } else if err != nil { return errors.WithMessagef(err, "failed to find the environment %v associated to the stack %v", stack.EndpointID, stack.ID) } @@ -78,7 +89,9 @@ func RedeployWhenChanged(stackID portainer.StackID, deployer StackDeployer, data } registries, err := getUserRegistries(datastore, user, endpoint.ID) - if err != nil { + if dataservices.IsErrObjectNotFound(err) { + return scheduler.NewPermanentError(err) + } else if err != nil { return err }