mirror of https://github.com/k3s-io/k3s
Merge pull request #55641 from sjenning/remove-corrupt-checkpoints
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. dockershim: remove corrupt checkpoints immediately upon detection Fixes https://github.com/kubernetes/kubernetes/issues/55620 The current checkpoint abstraction leaks the handling of corrupt checkpoints to the user. If the user does a `GetCheckpoint()` and the checkpoint is corrupt, the corrupt checkpoint is returned to the user (why??) with an error indicating the corruption. It is then up to the user to detect the corruption via the error msg and call `RemoveCheckpoint()` to remove the corrupted checkpoint. The checkpoint abstraction should not expose corruption to the user. If it is corrupt, it is as good as not found to the user. The checkpoint code should handle cleanup of corrupt entries and report "not found" to the user. @derekwaynecarr @eparis @dcbw @freehanpull/6/head
commit
b983cee8b8
|
@ -175,7 +175,6 @@ pkg/kubelet/container/testing
|
|||
pkg/kubelet/custommetrics
|
||||
pkg/kubelet/dockershim
|
||||
pkg/kubelet/dockershim/cm
|
||||
pkg/kubelet/dockershim/errors
|
||||
pkg/kubelet/dockershim/libdocker
|
||||
pkg/kubelet/dockershim/remote
|
||||
pkg/kubelet/dockershim/testing
|
||||
|
|
|
@ -47,7 +47,6 @@ go_library(
|
|||
"//pkg/kubelet/cm:go_default_library",
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/dockershim/cm:go_default_library",
|
||||
"//pkg/kubelet/dockershim/errors:go_default_library",
|
||||
"//pkg/kubelet/dockershim/libdocker:go_default_library",
|
||||
"//pkg/kubelet/dockershim/metrics:go_default_library",
|
||||
"//pkg/kubelet/leaky:go_default_library",
|
||||
|
@ -142,7 +141,6 @@ filegroup(
|
|||
srcs = [
|
||||
":package-srcs",
|
||||
"//pkg/kubelet/dockershim/cm:all-srcs",
|
||||
"//pkg/kubelet/dockershim/errors:all-srcs",
|
||||
"//pkg/kubelet/dockershim/libdocker:all-srcs",
|
||||
"//pkg/kubelet/dockershim/metrics:all-srcs",
|
||||
"//pkg/kubelet/dockershim/remote:all-srcs",
|
||||
|
|
|
@ -23,7 +23,6 @@ import (
|
|||
"path/filepath"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
|
||||
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
|
||||
hashutil "k8s.io/kubernetes/pkg/util/hash"
|
||||
|
@ -113,12 +112,14 @@ func (handler *PersistentCheckpointHandler) GetCheckpoint(podSandboxID string) (
|
|||
//TODO: unmarhsal into a struct with just Version, check version, unmarshal into versioned type.
|
||||
err = json.Unmarshal(blob, &checkpoint)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to unmarshal checkpoint %q. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
|
||||
return &checkpoint, errors.CorruptCheckpointError
|
||||
glog.Errorf("Failed to unmarshal checkpoint %q, removing checkpoint. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
|
||||
handler.RemoveCheckpoint(podSandboxID)
|
||||
return nil, fmt.Errorf("failed to unmarshal checkpoint")
|
||||
}
|
||||
if checkpoint.CheckSum != calculateChecksum(checkpoint) {
|
||||
glog.Errorf("Checksum of checkpoint %q is not valid", podSandboxID)
|
||||
return &checkpoint, errors.CorruptCheckpointError
|
||||
glog.Errorf("Checksum of checkpoint %q is not valid, removing checkpoint", podSandboxID)
|
||||
handler.RemoveCheckpoint(podSandboxID)
|
||||
return nil, fmt.Errorf("checkpoint is corrupted")
|
||||
}
|
||||
return &checkpoint, nil
|
||||
}
|
||||
|
|
|
@ -30,7 +30,6 @@ import (
|
|||
utilerrors "k8s.io/apimachinery/pkg/util/errors"
|
||||
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
|
||||
"k8s.io/kubernetes/pkg/kubelet/qos"
|
||||
"k8s.io/kubernetes/pkg/kubelet/types"
|
||||
|
@ -193,20 +192,10 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
|
|||
// actions will only have sandbox ID and not have pod namespace and name information.
|
||||
// Return error if encounter any unexpected error.
|
||||
if checkpointErr != nil {
|
||||
if libdocker.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
|
||||
if libdocker.IsContainerNotFoundError(statusErr) {
|
||||
glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
|
||||
"Proceed without further sandbox information.", podSandboxID)
|
||||
} else {
|
||||
if checkpointErr == errors.CorruptCheckpointError {
|
||||
// Remove the corrupted checkpoint so that the next
|
||||
// StopPodSandbox call can proceed. This may indicate that
|
||||
// some resources won't be reclaimed.
|
||||
// TODO (#43021): Fix this properly.
|
||||
glog.Warningf("Removing corrupted checkpoint %q: %+v", podSandboxID, *checkpoint)
|
||||
if err := ds.checkpointHandler.RemoveCheckpoint(podSandboxID); err != nil {
|
||||
glog.Warningf("Unable to remove corrupted checkpoint %q: %v", podSandboxID, err)
|
||||
}
|
||||
}
|
||||
return utilerrors.NewAggregate([]error{
|
||||
fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
|
||||
fmt.Errorf("failed to get sandbox status: %v", statusErr)})
|
||||
|
@ -488,13 +477,6 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]
|
|||
checkpoint, err := ds.checkpointHandler.GetCheckpoint(id)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err)
|
||||
|
||||
if err == errors.CorruptCheckpointError {
|
||||
glog.Warningf("Removing corrupted checkpoint %q: %+v", id, *checkpoint)
|
||||
if err := ds.checkpointHandler.RemoveCheckpoint(id); err != nil {
|
||||
glog.Warningf("Unable to remove corrupted checkpoint %q: %v", id, err)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
result = append(result, checkpointToRuntimeAPISandbox(id, checkpoint))
|
||||
|
|
|
@ -38,7 +38,6 @@ import (
|
|||
kubecm "k8s.io/kubernetes/pkg/kubelet/cm"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/cm"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network/cni"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network/hostport"
|
||||
|
@ -366,10 +365,6 @@ func (ds *dockerService) GetPodPortMappings(podSandboxID string) ([]*hostport.Po
|
|||
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
|
||||
// Return empty portMappings if checkpoint is not found
|
||||
if err != nil {
|
||||
if err == errors.CheckpointNotFoundError {
|
||||
glog.Warningf("Failed to retrieve checkpoint for sandbox %q: %v", podSandboxID, err)
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = ["errors.go"],
|
||||
importpath = "k8s.io/kubernetes/pkg/kubelet/dockershim/errors",
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [":package-srcs"],
|
||||
tags = ["automanaged"],
|
||||
)
|
|
@ -1,22 +0,0 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package errors
|
||||
|
||||
import "fmt"
|
||||
|
||||
var CorruptCheckpointError = fmt.Errorf("checkpoint is corrupted.")
|
||||
var CheckpointNotFoundError = fmt.Errorf("checkpoint is not found.")
|
|
@ -9,7 +9,6 @@ go_library(
|
|||
name = "go_default_library",
|
||||
srcs = ["util.go"],
|
||||
importpath = "k8s.io/kubernetes/pkg/kubelet/dockershim/testing",
|
||||
deps = ["//pkg/kubelet/dockershim/errors:go_default_library"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
|
|
|
@ -17,9 +17,8 @@ limitations under the License.
|
|||
package testing
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
)
|
||||
|
||||
// MemStore is an implementation of CheckpointStore interface which stores checkpoint in memory.
|
||||
|
@ -44,7 +43,7 @@ func (mstore *MemStore) Read(key string) ([]byte, error) {
|
|||
defer mstore.Unlock()
|
||||
data, ok := mstore.mem[key]
|
||||
if !ok {
|
||||
return nil, errors.CheckpointNotFoundError
|
||||
return nil, fmt.Errorf("checkpoint is not found")
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue