Merge pull request #55641 from sjenning/remove-corrupt-checkpoints

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

dockershim: remove corrupt checkpoints immediately upon detection

Fixes https://github.com/kubernetes/kubernetes/issues/55620

The current checkpoint abstraction leaks the handling of corrupt checkpoints to the user.

If the user does a `GetCheckpoint()` and the checkpoint is corrupt, the corrupt checkpoint is returned to the user (why??) with an error indicating the corruption.  It is then up to the user to detect the corruption via the error msg and call `RemoveCheckpoint()` to remove the corrupted checkpoint.

The checkpoint abstraction should not expose corruption to the user.  If it is corrupt, it is as good as not found to the user.  The checkpoint code should handle cleanup of corrupt entries and report "not found" to the user.

@derekwaynecarr @eparis @dcbw @freehan
pull/6/head
Kubernetes Submit Queue 2017-11-14 07:15:57 -08:00 committed by GitHub
commit b983cee8b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 9 additions and 83 deletions

View File

@ -175,7 +175,6 @@ pkg/kubelet/container/testing
pkg/kubelet/custommetrics
pkg/kubelet/dockershim
pkg/kubelet/dockershim/cm
pkg/kubelet/dockershim/errors
pkg/kubelet/dockershim/libdocker
pkg/kubelet/dockershim/remote
pkg/kubelet/dockershim/testing

View File

@ -47,7 +47,6 @@ go_library(
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/dockershim/cm:go_default_library",
"//pkg/kubelet/dockershim/errors:go_default_library",
"//pkg/kubelet/dockershim/libdocker:go_default_library",
"//pkg/kubelet/dockershim/metrics:go_default_library",
"//pkg/kubelet/leaky:go_default_library",
@ -142,7 +141,6 @@ filegroup(
srcs = [
":package-srcs",
"//pkg/kubelet/dockershim/cm:all-srcs",
"//pkg/kubelet/dockershim/errors:all-srcs",
"//pkg/kubelet/dockershim/libdocker:all-srcs",
"//pkg/kubelet/dockershim/metrics:all-srcs",
"//pkg/kubelet/dockershim/remote:all-srcs",

View File

@ -23,7 +23,6 @@ import (
"path/filepath"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
hashutil "k8s.io/kubernetes/pkg/util/hash"
@ -113,12 +112,14 @@ func (handler *PersistentCheckpointHandler) GetCheckpoint(podSandboxID string) (
//TODO: unmarhsal into a struct with just Version, check version, unmarshal into versioned type.
err = json.Unmarshal(blob, &checkpoint)
if err != nil {
glog.Errorf("Failed to unmarshal checkpoint %q. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
return &checkpoint, errors.CorruptCheckpointError
glog.Errorf("Failed to unmarshal checkpoint %q, removing checkpoint. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
handler.RemoveCheckpoint(podSandboxID)
return nil, fmt.Errorf("failed to unmarshal checkpoint")
}
if checkpoint.CheckSum != calculateChecksum(checkpoint) {
glog.Errorf("Checksum of checkpoint %q is not valid", podSandboxID)
return &checkpoint, errors.CorruptCheckpointError
glog.Errorf("Checksum of checkpoint %q is not valid, removing checkpoint", podSandboxID)
handler.RemoveCheckpoint(podSandboxID)
return nil, fmt.Errorf("checkpoint is corrupted")
}
return &checkpoint, nil
}

View File

@ -30,7 +30,6 @@ import (
utilerrors "k8s.io/apimachinery/pkg/util/errors"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/types"
@ -193,20 +192,10 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
// actions will only have sandbox ID and not have pod namespace and name information.
// Return error if encounter any unexpected error.
if checkpointErr != nil {
if libdocker.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
if libdocker.IsContainerNotFoundError(statusErr) {
glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
"Proceed without further sandbox information.", podSandboxID)
} else {
if checkpointErr == errors.CorruptCheckpointError {
// Remove the corrupted checkpoint so that the next
// StopPodSandbox call can proceed. This may indicate that
// some resources won't be reclaimed.
// TODO (#43021): Fix this properly.
glog.Warningf("Removing corrupted checkpoint %q: %+v", podSandboxID, *checkpoint)
if err := ds.checkpointHandler.RemoveCheckpoint(podSandboxID); err != nil {
glog.Warningf("Unable to remove corrupted checkpoint %q: %v", podSandboxID, err)
}
}
return utilerrors.NewAggregate([]error{
fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
fmt.Errorf("failed to get sandbox status: %v", statusErr)})
@ -488,13 +477,6 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]
checkpoint, err := ds.checkpointHandler.GetCheckpoint(id)
if err != nil {
glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err)
if err == errors.CorruptCheckpointError {
glog.Warningf("Removing corrupted checkpoint %q: %+v", id, *checkpoint)
if err := ds.checkpointHandler.RemoveCheckpoint(id); err != nil {
glog.Warningf("Unable to remove corrupted checkpoint %q: %v", id, err)
}
}
continue
}
result = append(result, checkpointToRuntimeAPISandbox(id, checkpoint))

View File

@ -38,7 +38,6 @@ import (
kubecm "k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/dockershim/cm"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
"k8s.io/kubernetes/pkg/kubelet/network"
"k8s.io/kubernetes/pkg/kubelet/network/cni"
"k8s.io/kubernetes/pkg/kubelet/network/hostport"
@ -366,10 +365,6 @@ func (ds *dockerService) GetPodPortMappings(podSandboxID string) ([]*hostport.Po
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
// Return empty portMappings if checkpoint is not found
if err != nil {
if err == errors.CheckpointNotFoundError {
glog.Warningf("Failed to retrieve checkpoint for sandbox %q: %v", podSandboxID, err)
return nil, nil
}
return nil, err
}

View File

@ -1,25 +0,0 @@
package(default_visibility = ["//visibility:public"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
)
go_library(
name = "go_default_library",
srcs = ["errors.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/dockershim/errors",
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)

View File

@ -1,22 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errors
import "fmt"
var CorruptCheckpointError = fmt.Errorf("checkpoint is corrupted.")
var CheckpointNotFoundError = fmt.Errorf("checkpoint is not found.")

View File

@ -9,7 +9,6 @@ go_library(
name = "go_default_library",
srcs = ["util.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/dockershim/testing",
deps = ["//pkg/kubelet/dockershim/errors:go_default_library"],
)
filegroup(

View File

@ -17,9 +17,8 @@ limitations under the License.
package testing
import (
"fmt"
"sync"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
)
// MemStore is an implementation of CheckpointStore interface which stores checkpoint in memory.
@ -44,7 +43,7 @@ func (mstore *MemStore) Read(key string) ([]byte, error) {
defer mstore.Unlock()
data, ok := mstore.mem[key]
if !ok {
return nil, errors.CheckpointNotFoundError
return nil, fmt.Errorf("checkpoint is not found")
}
return data, nil
}